Commit d16dc20c, authored by Andrew Morton and committed by Linus Torvalds

[PATCH] sys_remap_file_pages

Ingo's remap_file_pages patch.  Supported on ia32, x86-64, sparc
and sparc64.  Others will need to update mman.h and the syscall
tables.
parent f9a316fa
......@@ -740,6 +740,7 @@ ENTRY(sys_call_table)
.long sys_epoll_create
.long sys_epoll_ctl /* 255 */
.long sys_epoll_wait
.long sys_remap_file_pages
.rept NR_syscalls-(.-sys_call_table)/4
......
......@@ -18,6 +18,8 @@
#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */
#define MAP_LOCKED 0x2000 /* pages are locked */
#define MAP_NORESERVE 0x4000 /* don't check for reservations */
#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
#define MAP_NONBLOCK 0x10000 /* do not block on IO */
#define MS_ASYNC 1 /* sync memory asynchronously */
#define MS_INVALIDATE 2 /* invalidate the caches */
......
......@@ -261,7 +261,8 @@
#define __NR_sys_epoll_create 254
#define __NR_sys_epoll_ctl 255
#define __NR_sys_epoll_wait 256
#define __NR_remap_file_pages 257
/* user-visible error numbers are in the range -1 - -124: see <asm-i386/errno.h> */
......
......@@ -32,6 +32,9 @@
#define MCL_CURRENT 0x2000 /* lock all currently mapped pages */
#define MCL_FUTURE 0x4000 /* lock all additions to address space */
#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
#define MAP_NONBLOCK 0x10000 /* do not block on IO */
/* XXX Need to add flags to SunOS's mctl, mlockall, and madvise system
* XXX calls.
*/
......
......@@ -32,6 +32,9 @@
#define MCL_CURRENT 0x2000 /* lock all currently mapped pages */
#define MCL_FUTURE 0x4000 /* lock all additions to address space */
#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
#define MAP_NONBLOCK 0x10000 /* do not block on IO */
/* XXX Need to add flags to SunOS's mctl, mlockall, and madvise system
* XXX calls.
*/
......
......@@ -19,6 +19,8 @@
#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */
#define MAP_LOCKED 0x2000 /* pages are locked */
#define MAP_NORESERVE 0x4000 /* don't check for reservations */
#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
#define MAP_NONBLOCK 0x10000 /* do not block on IO */
#define MS_ASYNC 1 /* sync memory asynchronously */
#define MS_INVALIDATE 2 /* invalidate the caches */
......
......@@ -130,6 +130,7 @@ struct vm_operations_struct {
void (*open)(struct vm_area_struct * area);
void (*close)(struct vm_area_struct * area);
struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int unused);
int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, unsigned long prot, unsigned long pgoff, int nonblock);
};
/* forward declaration; pte_chain is meant to be internal to rmap.c */
......@@ -365,9 +366,12 @@ extern int vmtruncate(struct inode * inode, loff_t offset);
extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
extern pte_t *FASTCALL(pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
extern int install_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, struct page *page, unsigned long prot);
extern int handle_mm_fault(struct mm_struct *mm,struct vm_area_struct *vma, unsigned long address, int write_access);
extern int make_pages_present(unsigned long addr, unsigned long end);
extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
extern int sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long nonblock);
extern struct page * follow_page(struct mm_struct *mm, unsigned long address, int write);
int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
......
......@@ -4,7 +4,7 @@
export-objs := shmem.o filemap.o mempool.o page_alloc.o page-writeback.o
obj-y := memory.o mmap.o filemap.o mprotect.o mlock.o mremap.o \
obj-y := memory.o mmap.o filemap.o fremap.o mprotect.o mlock.o mremap.o \
vmalloc.o slab.o bootmem.o swap.o vmscan.o page_io.o \
page_alloc.o swap_state.o swapfile.o oom_kill.o \
shmem.o highmem.o mempool.o msync.o mincore.o readahead.o \
......
......@@ -1148,8 +1148,159 @@ struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address
return NULL;
}
/*
 * filemap_getpage - get an up-to-date page-cache page for @file at @pgoff.
 * @file:     file whose mapping is searched
 * @pgoff:    page index within the file
 * @nonblock: if non-zero, never start or wait for I/O
 *
 * Returns the page with a reference held (the caller must drop it with
 * page_cache_release()), or NULL.  With @nonblock set, NULL is returned
 * whenever the page is not already present and up to date in the page
 * cache.  Without @nonblock, NULL means the page could not be read in.
 */
static struct page * filemap_getpage(struct file *file, unsigned long pgoff,
					int nonblock)
{
	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
	struct page *page;
	int error;

	/*
	 * Do we have something in the page cache already?
	 */
retry_find:
	page = find_get_page(mapping, pgoff);
	if (!page) {
		if (nonblock)
			return NULL;
		goto no_cached_page;
	}

	/*
	 * Ok, found a page in the page cache, now we need to check
	 * that it's up-to-date.
	 */
	if (!PageUptodate(page)) {
		/*
		 * Getting the page up to date means taking the page lock
		 * and waiting for ->readpage() to finish, which is exactly
		 * what a nonblocking caller asked us not to do.  Drop the
		 * reference taken by find_get_page() and report "no page",
		 * just like a page-cache miss.
		 */
		if (nonblock) {
			page_cache_release(page);
			return NULL;
		}
		goto page_not_uptodate;
	}

success:
	/*
	 * Found the page and have a reference on it; hand it back to
	 * the caller, who now owns that reference.
	 */
	mark_page_accessed(page);
	flush_page_to_ram(page);

	return page;

no_cached_page:
	error = page_cache_read(file, pgoff);

	/*
	 * The page we want has now been added to the page cache.
	 * In the unlikely event that someone removed it in the
	 * meantime, we'll just come back here and read it again.
	 */
	if (error >= 0)
		goto retry_find;

	/*
	 * An error return from page_cache_read can result if the
	 * system is low on memory, or a problem occurs while trying
	 * to schedule I/O.
	 */
	return NULL;

page_not_uptodate:
	lock_page(page);

	/* Did it get unhashed while we waited for it? */
	if (!page->mapping) {
		unlock_page(page);
		goto err;
	}

	/* Did somebody else get it up-to-date? */
	if (PageUptodate(page)) {
		unlock_page(page);
		goto success;
	}

	if (!mapping->a_ops->readpage(file, page)) {
		wait_on_page_locked(page);
		if (PageUptodate(page))
			goto success;
	}

	/*
	 * Umm, take care of errors if the page isn't up-to-date.
	 * Try to re-read it _once_. We do this synchronously,
	 * because there really aren't any performance issues here
	 * and we need to check for errors.
	 */
	lock_page(page);

	/* Somebody truncated the page on us? */
	if (!page->mapping) {
		unlock_page(page);
		goto err;
	}

	/* Somebody else successfully read it in? */
	if (PageUptodate(page)) {
		unlock_page(page);
		goto success;
	}

	ClearPageError(page);
	if (!mapping->a_ops->readpage(file, page)) {
		wait_on_page_locked(page);
		if (PageUptodate(page))
			goto success;
	}

	/*
	 * Things didn't work out. Return NULL to tell the caller so,
	 * dropping our reference on the page cache page first.
	 */
err:
	page_cache_release(page);

	return NULL;
}
/*
 * filemap_populate - install page-cache pages of a file into a vma.
 * @vma:      vma to populate; its vm_file supplies the pages
 * @addr:     first virtual address to populate
 * @len:      number of bytes to populate (expected to be a multiple of
 *            PAGE_SIZE; the loop advances one PAGE_SIZE per iteration)
 * @prot:     protection passed through to install_page()
 * @pgoff:    file page index that is mapped at @addr
 * @nonblock: if non-zero, pages that filemap_getpage() cannot supply
 *            are silently skipped instead of being treated as an error
 *
 * Returns 0 on success, -EINVAL if the requested range reaches past the
 * end of the file, -ENOMEM if a page could not be obtained in blocking
 * mode, or the error returned by install_page().
 */
static int filemap_populate(struct vm_area_struct *vma,
			unsigned long addr,
			unsigned long len,
			unsigned long prot,
			unsigned long pgoff,
			int nonblock)
{
	struct file *file = vma->vm_file;
	struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
	struct inode *inode = mapping->host;
	struct mm_struct *mm = vma->vm_mm;
	unsigned long size;
	struct page *page;
	int err;

	/* An empty range has nothing to install (and must not underflow len). */
	while (len) {
		/*
		 * Re-read i_size on every pass, as the file size may change
		 * while we are populating.
		 */
		size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * Same test as "pgoff + pages > size", written so that a huge
		 * pgoff cannot wrap the sum around and slip past the check.
		 */
		if (pgoff > size || (len >> PAGE_CACHE_SHIFT) > size - pgoff)
			return -EINVAL;

		page = filemap_getpage(file, pgoff, nonblock);
		if (!page && !nonblock)
			return -ENOMEM;

		if (page) {
			err = install_page(mm, vma, addr, page, prot);
			if (err) {
				/* install failed: give back the getpage reference */
				page_cache_release(page);
				return err;
			}
		}

		len -= PAGE_SIZE;
		addr += PAGE_SIZE;
		pgoff++;
	}

	return 0;
}
/*
 * VM operations table that plugs filemap_nopage() in as the ->nopage
 * handler and filemap_populate() as the ->populate handler.
 */
static struct vm_operations_struct generic_file_vm_ops = {
.nopage = filemap_nopage,
.populate = filemap_populate,
};
/* This is used for a general mmap of a disk file */
......
/*
* linux/mm/fremap.c
*
* Explicit pagetable population and nonlinear (random) mappings support.
*
* started by Ingo Molnar, Copyright (C) 2002
*/
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/swapops.h>
#include <asm/mmu_context.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
/*
 * Tear down whatever *ptep currently holds.
 *
 * An empty pte is left alone.  A non-present, non-empty pte is handled as
 * a swap entry: the entry is released and the pte cleared.  A present pte
 * is cleared and, if it refers to a valid non-reserved page, the page's
 * dirty state is transferred from the pte, its reverse mapping and
 * reference are dropped, and mm->rss is decremented.
 */
static inline void zap_pte(struct mm_struct *mm, pte_t *ptep)
{
	pte_t old = *ptep;
	unsigned long pfn;
	struct page *page;

	if (pte_none(old))
		return;

	if (!pte_present(old)) {
		free_swap_and_cache(pte_to_swp_entry(old));
		pte_clear(ptep);
		return;
	}

	/* Take the pfn from the snapshot, then atomically fetch-and-clear. */
	pfn = pte_pfn(old);
	old = ptep_get_and_clear(ptep);

	if (!pfn_valid(pfn))
		return;

	page = pfn_to_page(pfn);
	if (PageReserved(page))
		return;

	if (pte_dirty(old))
		set_page_dirty(page);
	page_remove_rmap(page, ptep);
	page_cache_release(page);
	mm->rss--;
}
/*
* Install a page to a given virtual memory address, release any
* previously existing mapping.
*/
int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, struct page *page, unsigned long prot)
{
int err = -ENOMEM;
pte_t *pte, entry;
pgd_t *pgd;
pmd_t *pmd;
pgd = pgd_offset(mm, addr);
spin_lock(&mm->page_table_lock);
pmd = pmd_alloc(mm, pgd, addr);
if (!pmd)
goto err_unlock;
pte = pte_alloc_map(mm, pmd, addr);
if (!pte)
goto err_unlock;
zap_pte(mm, pte);
mm->rss++;
flush_page_to_ram(page);
flush_icache_page(vma, page);
entry = mk_pte(page, protection_map[prot]);
if (prot & PROT_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
set_pte(pte, entry);
page_add_rmap(page, pte);
pte_unmap(pte);
flush_tlb_page(vma, addr);
spin_unlock(&mm->page_table_lock);
return 0;
err_unlock:
spin_unlock(&mm->page_table_lock);
return err;
}
/***
 * sys_remap_file_pages - remap arbitrary pages of a shared backing store
 *                        file within an existing vma.
 * @start: start of the remapped virtual memory range
 * @size: size of the remapped virtual memory range
 * @prot: new protection bits of the range
 * @pgoff: to be mapped page of the backing store file
 * @flags: 0 or MAP_NONBLOCK - the latter will cause no IO.
 *
 * this syscall works purely via pagetables, so it's the most efficient
 * way to map the same (large) file into a given virtual window. Unlike
 * mremap()/mmap() it does not create any new vmas.
 *
 * The new mappings do not live across swapout, so either use MAP_LOCKED
 * or use PROT_NONE in the original linear mapping and add a special
 * SIGBUS pagefault handler to reinstall zapped mappings.
 *
 * Returns the result of the vma's ->populate() operation, or -EINVAL if
 * the range is empty, wraps, is not fully inside one shared vma, or the
 * vma has no ->populate() operation.
 */
int sys_remap_file_pages(unsigned long start, unsigned long size,
	unsigned long prot, unsigned long pgoff, unsigned long flags)
{
	struct mm_struct *mm = current->mm;
	unsigned long end;
	struct vm_area_struct *vma;
	int err = -EINVAL;

	/*
	 * Sanitize the syscall parameters:
	 */
	start = PAGE_ALIGN(start);
	size = PAGE_ALIGN(size);
	prot &= 0xf;

	/*
	 * Compute the end from the *sanitized* values.  These are the values
	 * handed to ->populate() below, so the range that is validated
	 * against the vma must be the very same range that gets populated;
	 * otherwise rounding up could push the populated range past vm_end.
	 */
	end = start + size;

	down_read(&mm->mmap_sem);

	vma = find_vma(mm, start);
	/*
	 * Make sure the vma is shared, that it supports prefaulting,
	 * and that the remapped range is valid and fully within
	 * the single existing vma.  "end > start" also rejects an empty
	 * range and a range that wraps around the address space.
	 */
	if (vma && (vma->vm_flags & VM_SHARED) &&
		vma->vm_ops && vma->vm_ops->populate &&
			end > start && start >= vma->vm_start &&
				end <= vma->vm_end) {
		/*
		 * Change the default protection to PROT_NONE:
		 */
		/* NOTE(review): this writes a vma field while mmap_sem is only
		 * held for reading - confirm this is safe against concurrent
		 * callers. */
		if (pgprot_val(vma->vm_page_prot) != pgprot_val(__S000))
			vma->vm_page_prot = __S000;
		err = vma->vm_ops->populate(vma, start, size, prot,
						pgoff, flags & MAP_NONBLOCK);
	}

	up_read(&mm->mmap_sem);

	return err;
}
......@@ -608,6 +608,12 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
mm->locked_vm += len >> PAGE_SHIFT;
make_pages_present(addr, addr + len);
}
if (flags & MAP_POPULATE) {
up_write(&mm->mmap_sem);
sys_remap_file_pages(addr, len, prot,
pgoff, flags & MAP_NONBLOCK);
down_write(&mm->mmap_sem);
}
return addr;
unmap_and_free_vma:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment