Commit 968f11a8 authored by Jamie Lokier, committed by Linus Torvalds

[PATCH] Unpinned futexes v2: indexing changes

This changes the way futexes are indexed, so that they don't pin pages. 
It also fixes some bugs with private mappings and COW pages.

Currently, all futexes look up the page at the userspace address and pin
it, using the pair (page,offset) as an index into a table of waiting
futexes.  Any page with a futex waiting on it remains pinned in RAM,
which is a problem when many futexes are used, especially with FUTEX_FD.

Another problem is that the pinned page is not always the correct one,
because it can be replaced later by a COW (copy on write) operation.  This
can happen when waiting on a futex without writing to it after fork(),
exec() or mmap(), if the page is then written to before attempting to
wake a futex at the same address.

There are two symptoms of the COW problem:
 - The wrong process can receive wakeups
 - A process can fail to receive required wakeups. 

This patch fixes both by changing the indexing so that VM_SHARED
mappings use the triple (inode,offset,index), and private mappings use
the pair (mm,virtual_address).

The former correctly handles all shared mappings, including tmpfs and
therefore all kinds of shared memory (IPC shm, /dev/shm and
MAP_ANON|MAP_SHARED).  This works because every mapping which is
VM_SHARED has an associated non-zero vma->vm_file, and hence inode.
(This is ensured in do_mmap_pgoff, where it calls shmem_zero_setup). 

The latter handles all private mappings, both files and anonymous.  It
isn't affected by COW, because it doesn't care about the actual pages,
just the virtual address.
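
To make the two key types concrete, here is a minimal user-space C sketch
of the idea.  It is modelled on the union futex_key this patch adds to
kernel/futex.c (see the diff below), but the opaque stand-in types and the
choose_key() helper are hypothetical, for illustration only; the in-kernel
key also records the futex's offset within its page, which is omitted here:

    #include <string.h>

    #define TOY_PAGE_SIZE 4096UL

    /* Stand-ins for the kernel's struct inode and struct mm_struct. */
    struct inode;
    struct mm_struct;

    /* Modelled on the union futex_key introduced by this patch. */
    union futex_key {
            struct {
                    unsigned long pgoff;    /* page index within the file */
                    struct inode *inode;    /* object backing the mapping */
            } shared;
            struct {
                    unsigned long uaddr;    /* page-aligned virtual address */
                    struct mm_struct *mm;   /* address space of the waiter */
            } private;
            struct {                        /* common view used for hashing */
                    unsigned long word;
                    void *ptr;
            } both;
    };

    /* Hypothetical helper: pick the key fields the way the patch describes. */
    static void choose_key(union futex_key *key, int is_shared,
                           unsigned long uaddr, unsigned long pgoff,
                           struct inode *inode, struct mm_struct *mm)
    {
            memset(key, 0, sizeof(*key));
            if (is_shared) {
                    /* Shared mapping: identified by (inode, page index). */
                    key->shared.inode = inode;
                    key->shared.pgoff = pgoff;
            } else {
                    /* Private mapping: (mm, page-aligned virtual address). */
                    key->private.mm = mm;
                    key->private.uaddr = uaddr & ~(TOY_PAGE_SIZE - 1);
            }
    }

Because both key types occupy the same storage, a single hash and a single
comparison over the "both" view work for either kind of key.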

The patch has a few bonuses:

        1. It removes the vcache implementation, as only futexes were
           using it, and they don't any more.

        2. Removing the vcache should make COW page faults a bit faster.

        3. Futex operations no longer take the page table lock, walk
           the page table, fault in pages that aren't mapped in the
           page table, or do a vcache hash lookup - they are mostly a
           simple offset calculation with one hash for the futex
           table, as sketched below.  So they should be noticeably faster.
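
As a rough illustration of point 3, the per-operation lookup is now
essentially the key selection sketched above plus one hash into the futex
table.  A minimal user-space sketch; toy_hash_long() is a stand-in mixer
used in place of the kernel's hash_long(), and the bucket count matches
the patch's FUTEX_HASHBITS of 8:

    #define FUTEX_HASHBITS 8
    #define FUTEX_HASHSIZE (1UL << FUTEX_HASHBITS)  /* 256 buckets */

    /* Stand-in for the kernel's hash_long(); any reasonable mixer will do. */
    static unsigned int toy_hash_long(unsigned long val, unsigned int bits)
    {
            val *= 0x9e370001UL;    /* golden-ratio-style multiplier */
            return (unsigned int)(val >> (sizeof(val) * 8 - bits));
    }

    /*
     * One hash over the key words picks the wait-queue bucket.  For a
     * shared key, word and ptr are the page index and the inode; for a
     * private key they are the page-aligned address and the mm.  offset
     * is the futex's offset within its page.
     */
    static unsigned int futex_bucket(unsigned long word, void *ptr, int offset)
    {
            return toy_hash_long(word + (unsigned long)ptr + offset,
                                 FUTEX_HASHBITS);
    }

Waking a futex then means hashing its key, scanning that one bucket and
comparing keys - no page table walk and no page pinning.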

Special thanks to Hugh Dickins, Andrew Morton and Rusty Russell for
insightful feedback.  All suggestions are included.
parent 707c584e
@@ -110,6 +110,7 @@ struct vm_area_struct {
 #define VM_RESERVED     0x00080000      /* Don't unmap it from swap_out */
 #define VM_ACCOUNT      0x00100000      /* Is a VM accounted object */
 #define VM_HUGETLB      0x00400000      /* Huge TLB Page VM */
+#define VM_NONLINEAR    0x00800000      /* Is non-linear (remap_file_pages) */
 #ifndef VM_STACK_DEFAULT_FLAGS          /* arch can override this */
 #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
...
/*
 * virtual => physical mapping cache support.
 */
#ifndef _LINUX_VCACHE_H
#define _LINUX_VCACHE_H

typedef struct vcache_s {
        unsigned long address;
        struct mm_struct *mm;
        struct list_head hash_entry;
        void (*callback)(struct vcache_s *data, struct page *new_page);
} vcache_t;

extern spinlock_t vcache_lock;

extern void __attach_vcache(vcache_t *vcache,
                unsigned long address,
                struct mm_struct *mm,
                void (*callback)(struct vcache_s *data, struct page *new_page));

extern void __detach_vcache(vcache_t *vcache);

extern void invalidate_vcache(unsigned long address, struct mm_struct *mm,
                              struct page *new_page);

#endif
@@ -5,6 +5,9 @@
  * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
  * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
  *
+ * Removed page pinning, fix privately mapped COW pages and other cleanups
+ * (C) Copyright 2003 Jamie Lokier
+ *
  * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  * enough at me, Linus for the original (flawed) idea, Matthew
  * Kirkwood for proof-of-concept implementation.
@@ -33,11 +36,31 @@
 #include <linux/hash.h>
 #include <linux/init.h>
 #include <linux/futex.h>
-#include <linux/vcache.h>
 #include <linux/mount.h>
+#include <linux/pagemap.h>
 
 #define FUTEX_HASHBITS 8
 
+/*
+ * Futexes are matched on equal values of this key.
+ * The key type depends on whether it's a shared or private mapping.
+ */
+union futex_key {
+        struct {
+                unsigned long pgoff;
+                struct inode *inode;
+        } shared;
+        struct {
+                unsigned long uaddr;
+                struct mm_struct *mm;
+        } private;
+        struct {
+                unsigned long word;
+                void *ptr;
+        } both;
+        int offset;
+};
+
 /*
  * We use this hashed waitqueue instead of a normal wait_queue_t, so
  * we can wake only the relevant ones (hashed queues may be shared):
@@ -46,12 +69,8 @@ struct futex_q {
         struct list_head list;
         wait_queue_head_t waiters;
 
-        /* Page struct and offset within it. */
-        struct page *page;
-        int offset;
-
-        /* the virtual => physical COW-safe cache */
-        vcache_t vcache;
+        /* Key which the futex is hashed on. */
+        union futex_key key;
 
         /* For fd, sigio sent using these. */
         int fd;
@@ -66,111 +85,149 @@ static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED;
 static struct vfsmount *futex_mnt;
 
 /*
- * These are all locks that are necessery to look up a physical
- * mapping safely, and modify/search the futex hash, atomically:
+ * We hash on the keys returned from get_futex_key (see below).
  */
-static inline void lock_futex_mm(void)
+static inline struct list_head *hash_futex(union futex_key *key)
 {
-        spin_lock(&current->mm->page_table_lock);
-        spin_lock(&vcache_lock);
-        spin_lock(&futex_lock);
-}
-
-static inline void unlock_futex_mm(void)
-{
-        spin_unlock(&futex_lock);
-        spin_unlock(&vcache_lock);
-        spin_unlock(&current->mm->page_table_lock);
+        return &futex_queues[hash_long(key->both.word
+                                       + (unsigned long) key->both.ptr
+                                       + key->offset, FUTEX_HASHBITS)];
 }
 
 /*
- * The physical page is shared, so we can hash on its address:
+ * Return 1 if two futex_keys are equal, 0 otherwise.
  */
-static inline struct list_head *hash_futex(struct page *page, int offset)
+static inline int match_futex(union futex_key *key1, union futex_key *key2)
 {
-        return &futex_queues[hash_long((unsigned long)page + offset,
-                                       FUTEX_HASHBITS)];
+        return (key1->both.word == key2->both.word
+                && key1->both.ptr == key2->both.ptr
+                && key1->offset == key2->offset);
 }
 
 /*
- * Get kernel address of the user page and pin it.
+ * Get parameters which are the keys for a futex.
+ *
+ * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode,
+ * offset_within_page).  For private mappings, it's (uaddr, current->mm).
+ * We can usually work out the index without swapping in the page.
  *
- * Must be called with (and returns with) all futex-MM locks held.
+ * Returns: 0, or negative error code.
+ * The key words are stored in *key on success.
+ *
+ * Should be called with &current->mm->mmap_sem,
+ * but NOT &futex_lock or &current->mm->page_table_lock.
  */
-static inline struct page *__pin_page_atomic (struct page *page)
-{
-        if (!PageReserved(page))
-                get_page(page);
-        return page;
-}
-
-static struct page *__pin_page(unsigned long addr)
+static int get_futex_key(unsigned long uaddr, union futex_key *key)
 {
         struct mm_struct *mm = current->mm;
-        struct page *page, *tmp;
+        struct vm_area_struct *vma;
+        struct page *page;
         int err;
 
         /*
-         * Do a quick atomic lookup first - this is the fastpath.
+         * The futex address must be "naturally" aligned.
          */
-        page = follow_page(mm, addr, 0);
-        if (likely(page != NULL))
-                return __pin_page_atomic(page);
+        key->offset = uaddr % PAGE_SIZE;
+        if (unlikely((key->offset % sizeof(u32)) != 0))
+                return -EINVAL;
+        uaddr -= key->offset;
 
         /*
-         * No luck - need to fault in the page:
+         * The futex is hashed differently depending on whether
+         * it's in a shared or private mapping.  So check vma first.
          */
-repeat_lookup:
-
-        unlock_futex_mm();
-
-        down_read(&mm->mmap_sem);
-        err = get_user_pages(current, mm, addr, 1, 0, 0, &page, NULL);
-        up_read(&mm->mmap_sem);
-
-        lock_futex_mm();
-
-        if (err < 0)
-                return NULL;
+        vma = find_extend_vma(mm, uaddr);
+        if (unlikely(!vma))
+                return -EFAULT;
+
+        /*
+         * Permissions.
+         */
+        if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
+                return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
+
+        /*
+         * Private mappings are handled in a simple way.
+         *
+         * NOTE: When userspace waits on a MAP_SHARED mapping, even if
+         * it's a read-only handle, it's expected that futexes attach to
+         * the object not the particular process.  Therefore we use
+         * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
+         * mappings of _writable_ handles.
+         */
+        if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
+                key->private.mm = mm;
+                key->private.uaddr = uaddr;
+                return 0;
+        }
+
+        /*
+         * Linear mappings are also simple.
+         */
+        key->shared.inode = vma->vm_file->f_dentry->d_inode;
+        if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
+                key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT)
+                                     + vma->vm_pgoff);
+                return 0;
+        }
+
         /*
-         * Since the faulting happened with locks released, we have to
-         * check for races:
+         * We could walk the page table to read the non-linear
+         * pte, and get the page index without fetching the page
+         * from swap.  But that's a lot of code to duplicate here
+         * for a rare case, so we simply fetch the page.
          */
-        tmp = follow_page(mm, addr, 0);
-        if (tmp != page) {
-                put_page(page);
-                goto repeat_lookup;
+
+        /*
+         * Do a quick atomic lookup first - this is the fastpath.
+         */
+        spin_lock(&current->mm->page_table_lock);
+        page = follow_page(mm, uaddr, 0);
+        if (likely(page != NULL)) {
+                key->shared.pgoff =
+                        page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+                spin_unlock(&current->mm->page_table_lock);
+                return 0;
         }
+        spin_unlock(&current->mm->page_table_lock);
 
-        return page;
+        /*
+         * Do it the general way.
+         */
+        err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL);
+        if (err >= 0) {
+                key->shared.pgoff =
+                        page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+                put_page(page);
+        }
+        return err;
 }
 /*
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
-static inline int futex_wake(unsigned long uaddr, int offset, int num)
+static inline int futex_wake(unsigned long uaddr, int num)
 {
         struct list_head *i, *next, *head;
-        struct page *page;
-        int ret = 0;
+        union futex_key key;
+        int ret;
 
-        lock_futex_mm();
+        down_read(&current->mm->mmap_sem);
 
-        page = __pin_page(uaddr - offset);
-        if (!page) {
-                unlock_futex_mm();
-                return -EFAULT;
-        }
+        ret = get_futex_key(uaddr, &key);
+        if (unlikely(ret != 0))
+                goto out;
 
-        head = hash_futex(page, offset);
+        head = hash_futex(&key);
 
+        spin_lock(&futex_lock);
         list_for_each_safe(i, next, head) {
                 struct futex_q *this = list_entry(i, struct futex_q, list);
 
-                if (this->page == page && this->offset == offset) {
+                if (match_futex (&this->key, &key)) {
                         list_del_init(i);
-                        __detach_vcache(&this->vcache);
                         wake_up_all(&this->waiters);
                         if (this->filp)
                                 send_sigio(&this->filp->f_owner, this->fd, POLL_IN);
@@ -179,113 +236,74 @@ static inline int futex_wake(unsigned long uaddr, int offset, int num)
                         break;
                 }
         }
+        spin_unlock(&futex_lock);
 
-        unlock_futex_mm();
-        put_page(page);
+out:
+        up_read(&current->mm->mmap_sem);
 
         return ret;
 }
-/*
- * This gets called by the COW code, we have to rehash any
- * futexes that were pending on the old physical page, and
- * rehash it to the new physical page. The pagetable_lock
- * and vcache_lock is already held:
- */
-static void futex_vcache_callback(vcache_t *vcache, struct page *new_page)
-{
-        struct futex_q *q = container_of(vcache, struct futex_q, vcache);
-        struct list_head *head = hash_futex(new_page, q->offset);
-
-        spin_lock(&futex_lock);
-
-        if (!list_empty(&q->list)) {
-                put_page(q->page);
-                q->page = new_page;
-                __pin_page_atomic(new_page);
-                list_del(&q->list);
-                list_add_tail(&q->list, head);
-        }
-
-        spin_unlock(&futex_lock);
-}
-
 /*
  * Requeue all waiters hashed on one physical page to another
  * physical page.
  */
-static inline int futex_requeue(unsigned long uaddr1, int offset1,
-                unsigned long uaddr2, int offset2, int nr_wake, int nr_requeue)
+static inline int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
+                int nr_wake, int nr_requeue)
 {
         struct list_head *i, *next, *head1, *head2;
-        struct page *page1 = NULL, *page2 = NULL;
-        int ret = 0;
+        union futex_key key1, key2;
+        int ret;
 
-        lock_futex_mm();
+        down_read(&current->mm->mmap_sem);
 
-        page1 = __pin_page(uaddr1 - offset1);
-        if (!page1)
+        ret = get_futex_key(uaddr1, &key1);
+        if (unlikely(ret != 0))
                 goto out;
-        page2 = __pin_page(uaddr2 - offset2);
-        if (!page2)
+        ret = get_futex_key(uaddr2, &key2);
+        if (unlikely(ret != 0))
                 goto out;
 
-        head1 = hash_futex(page1, offset1);
-        head2 = hash_futex(page2, offset2);
+        head1 = hash_futex(&key1);
+        head2 = hash_futex(&key2);
 
+        spin_lock(&futex_lock);
         list_for_each_safe(i, next, head1) {
                 struct futex_q *this = list_entry(i, struct futex_q, list);
 
-                if (this->page == page1 && this->offset == offset1) {
+                if (match_futex (&this->key, &key1)) {
                         list_del_init(i);
-                        __detach_vcache(&this->vcache);
                         if (++ret <= nr_wake) {
                                 wake_up_all(&this->waiters);
                                 if (this->filp)
                                         send_sigio(&this->filp->f_owner,
                                                    this->fd, POLL_IN);
                         } else {
-                                put_page(this->page);
-                                __pin_page_atomic (page2);
                                 list_add_tail(i, head2);
-                                __attach_vcache(&this->vcache, uaddr2,
-                                        current->mm, futex_vcache_callback);
-                                this->offset = offset2;
-                                this->page = page2;
+                                this->key = key2;
                                 if (ret - nr_wake >= nr_requeue)
                                         break;
                         }
                 }
         }
+        spin_unlock(&futex_lock);
 
 out:
-        unlock_futex_mm();
-        if (page1)
-                put_page(page1);
-        if (page2)
-                put_page(page2);
+        up_read(&current->mm->mmap_sem);
 
         return ret;
 }
-static inline void __queue_me(struct futex_q *q, struct page *page,
-                              unsigned long uaddr, int offset,
+static inline void queue_me(struct futex_q *q, union futex_key *key,
                               int fd, struct file *filp)
 {
-        struct list_head *head = hash_futex(page, offset);
+        struct list_head *head = hash_futex(key);
 
-        q->offset = offset;
+        q->key = *key;
         q->fd = fd;
         q->filp = filp;
-        q->page = page;
 
+        spin_lock(&futex_lock);
         list_add_tail(&q->list, head);
-
-        /*
-         * We register a futex callback to this virtual address,
-         * to make sure a COW properly rehashes the futex-queue.
-         */
-        __attach_vcache(&q->vcache, uaddr, current->mm, futex_vcache_callback);
+        spin_unlock(&futex_lock);
 }
 
 /* Return 1 if we were still queued (ie. 0 means we were woken) */
@@ -293,83 +311,107 @@ static inline int unqueue_me(struct futex_q *q)
 {
         int ret = 0;
 
-        spin_lock(&vcache_lock);
         spin_lock(&futex_lock);
         if (!list_empty(&q->list)) {
                 list_del(&q->list);
-                __detach_vcache(&q->vcache);
                 ret = 1;
         }
         spin_unlock(&futex_lock);
-        spin_unlock(&vcache_lock);
 
         return ret;
 }
-static inline int futex_wait(unsigned long uaddr,
-                             int offset,
-                             int val,
-                             unsigned long time)
+static inline int futex_wait(unsigned long uaddr, int val, unsigned long time)
 {
         DECLARE_WAITQUEUE(wait, current);
-        int ret = 0, curval;
-        struct page *page;
+        int ret, curval;
+        union futex_key key;
         struct futex_q q;
 
+try_again:
         init_waitqueue_head(&q.waiters);
 
-        lock_futex_mm();
+        down_read(&current->mm->mmap_sem);
 
-        page = __pin_page(uaddr - offset);
-        if (!page) {
-                unlock_futex_mm();
-                return -EFAULT;
-        }
-        __queue_me(&q, page, uaddr, offset, -1, NULL);
+        ret = get_futex_key(uaddr, &key);
+        if (unlikely(ret != 0))
+                goto out_release_sem;
+
+        queue_me(&q, &key, -1, NULL);
 
         /*
-         * Page is pinned, but may no longer be in this address space.
-         * It cannot schedule, so we access it with the spinlock held.
+         * Access the page after the futex is queued.
+         * We hold the mmap semaphore, so the mapping cannot have changed
+         * since we looked it up.
          */
         if (get_user(curval, (int *)uaddr) != 0) {
-                unlock_futex_mm();
                 ret = -EFAULT;
-                goto out;
+                goto out_unqueue;
         }
         if (curval != val) {
-                unlock_futex_mm();
                 ret = -EWOULDBLOCK;
-                goto out;
+                goto out_unqueue;
         }
 
         /*
-         * The get_user() above might fault and schedule so we
-         * cannot just set TASK_INTERRUPTIBLE state when queueing
-         * ourselves into the futex hash. This code thus has to
+         * Now the futex is queued and we have checked the data, we
+         * don't want to hold mmap_sem while we sleep.
+         */
+        up_read(&current->mm->mmap_sem);
+
+        /*
+         * There might have been scheduling since the queue_me(), as we
+         * cannot hold a spinlock across the get_user() in case it
+         * faults.  So we cannot just set TASK_INTERRUPTIBLE state when
+         * queueing ourselves into the futex hash.  This code thus has to
          * rely on the futex_wake() code doing a wakeup after removing
          * the waiter from the list.
          */
         add_wait_queue(&q.waiters, &wait);
+        spin_lock(&futex_lock);
         set_current_state(TASK_INTERRUPTIBLE);
-        if (!list_empty(&q.list)) {
-                unlock_futex_mm();
-                time = schedule_timeout(time);
+
+        if (unlikely(list_empty(&q.list))) {
+                /*
+                 * We were woken already.
+                 */
+                spin_unlock(&futex_lock);
+                set_current_state(TASK_RUNNING);
+                return 0;
         }
+
+        spin_unlock(&futex_lock);
+        time = schedule_timeout(time);
         set_current_state(TASK_RUNNING);
 
         /*
          * NOTE: we don't remove ourselves from the waitqueue because
          * we are the only user of it.
          */
-        if (time == 0) {
-                ret = -ETIMEDOUT;
-                goto out;
-        }
+
+        /*
+         * Were we woken or interrupted for a valid reason?
+         */
+        ret = unqueue_me(&q);
+        if (ret == 0)
+                return 0;
+        if (time == 0)
+                return -ETIMEDOUT;
         if (signal_pending(current))
-                ret = -EINTR;
-out:
-        /* Were we woken up anyway? */
+                return -EINTR;
+
+        /*
+         * No, it was a spurious wakeup.  Try again.  Should never happen. :)
+         */
+        goto try_again;
+
+out_unqueue:
+        /*
+         * Were we unqueued anyway?
+         */
         if (!unqueue_me(&q))
                 ret = 0;
-        put_page(q.page);
+out_release_sem:
+        up_read(&current->mm->mmap_sem);
         return ret;
 }
@@ -378,7 +420,6 @@ static int futex_close(struct inode *inode, struct file *filp)
         struct futex_q *q = filp->private_data;
 
         unqueue_me(q);
-        put_page(q->page);
         kfree(filp->private_data);
         return 0;
 }
@@ -406,12 +447,12 @@ static struct file_operations futex_fops = {
 /* Signal allows caller to avoid the race which would occur if they
    set the sigio stuff up afterwards. */
-static int futex_fd(unsigned long uaddr, int offset, int signal)
+static int futex_fd(unsigned long uaddr, int signal)
 {
-        struct page *page = NULL;
         struct futex_q *q;
+        union futex_key key;
         struct file *filp;
-        int ret;
+        int ret, err;
 
         ret = -EINVAL;
         if (signal < 0 || signal > _NSIG)
@@ -450,69 +491,47 @@ static int futex_fd(unsigned long uaddr, int offset, int signal)
                 goto out;
         }
 
-        lock_futex_mm();
-
-        page = __pin_page(uaddr - offset);
-        if (!page) {
-                unlock_futex_mm();
+        down_read(&current->mm->mmap_sem);
+        err = get_futex_key(uaddr, &key);
+        up_read(&current->mm->mmap_sem);
 
+        if (unlikely(err != 0)) {
                 put_unused_fd(ret);
                 put_filp(filp);
                 kfree(q);
-                return -EFAULT;
+                return err;
         }
 
         init_waitqueue_head(&q->waiters);
         filp->private_data = q;
 
-        __queue_me(q, page, uaddr, offset, ret, filp);
-
-        unlock_futex_mm();
+        queue_me(q, &key, ret, filp);
 
         /* Now we map fd to filp, so userspace can access it */
         fd_install(ret, filp);
-        page = NULL;
 out:
-        if (page)
-                put_page(page);
         return ret;
 }
 long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
               unsigned long uaddr2, int val2)
 {
-        unsigned long pos_in_page;
         int ret;
 
-        pos_in_page = uaddr % PAGE_SIZE;
-        /* Must be "naturally" aligned */
-        if (pos_in_page % sizeof(u32))
-                return -EINVAL;
-
         switch (op) {
         case FUTEX_WAIT:
-                ret = futex_wait(uaddr, pos_in_page, val, timeout);
+                ret = futex_wait(uaddr, val, timeout);
                 break;
         case FUTEX_WAKE:
-                ret = futex_wake(uaddr, pos_in_page, val);
+                ret = futex_wake(uaddr, val);
                 break;
         case FUTEX_FD:
                 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
-                ret = futex_fd(uaddr, pos_in_page, val);
+                ret = futex_fd(uaddr, val);
                 break;
         case FUTEX_REQUEUE:
-        {
-                unsigned long pos_in_page2 = uaddr2 % PAGE_SIZE;
-                /* Must be "naturally" aligned */
-                if (pos_in_page2 % sizeof(u32))
-                        return -EINVAL;
-                ret = futex_requeue(uaddr, pos_in_page, uaddr2, pos_in_page2,
-                                    val, val2);
+                ret = futex_requeue(uaddr, uaddr2, val, val2);
                 break;
-        }
         default:
                 ret = -ENOSYS;
         }
...
@@ -9,6 +9,6 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
 obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
          page_alloc.o page-writeback.o pdflush.o readahead.o \
-         slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
+         slab.o swap.o truncate.o vmscan.o $(mmu-y)
 
 obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o
@@ -144,9 +144,10 @@ long sys_remap_file_pages(unsigned long start, unsigned long size,
                 return err;
 #endif
 
-        down_read(&mm->mmap_sem);
+        /* We need down_write() to change vma->vm_flags. */
+        down_write(&mm->mmap_sem);
         vma = find_vma(mm, start);
 
         /*
          * Make sure the vma is shared, that it supports prefaulting,
          * and that the remapped range is valid and fully within
@@ -155,11 +156,27 @@ long sys_remap_file_pages(unsigned long start, unsigned long size,
         if (vma && (vma->vm_flags & VM_SHARED) &&
             vma->vm_ops && vma->vm_ops->populate &&
             end > start && start >= vma->vm_start &&
-            end <= vma->vm_end)
-                err = vma->vm_ops->populate(vma, start, size, vma->vm_page_prot,
+            end <= vma->vm_end) {
+
+                /* Must set VM_NONLINEAR before any pages are populated. */
+                if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff)
+                        vma->vm_flags |= VM_NONLINEAR;
+
+                /* ->populate can take a long time, so downgrade the lock. */
+                downgrade_write(&mm->mmap_sem);
+                err = vma->vm_ops->populate(vma, start, size,
+                                            vma->vm_page_prot,
                                             pgoff, flags & MAP_NONBLOCK);
 
-        up_read(&mm->mmap_sem);
+                /*
+                 * We can't clear VM_NONLINEAR because we'd have to do
+                 * it after ->populate completes, and that would prevent
+                 * downgrading the lock.  (Locks can't be upgraded).
+                 */
+                up_read(&mm->mmap_sem);
+        } else {
+                up_write(&mm->mmap_sem);
+        }
 
         return err;
 }
...
@@ -43,7 +43,6 @@
 #include <linux/swap.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
-#include <linux/vcache.h>
 #include <linux/rmap-locking.h>
 #include <linux/module.h>
@@ -962,7 +961,6 @@ static inline void establish_pte(struct vm_area_struct * vma, unsigned long addr
 static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address,
                 pte_t *page_table)
 {
-        invalidate_vcache(address, vma->vm_mm, new_page);
         flush_cache_page(vma, address);
         establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot))));
 }
...
/*
 * linux/mm/vcache.c
 *
 * virtual => physical page mapping cache. Users of this mechanism
 * register callbacks for a given (virt,mm,phys) page mapping, and
 * the kernel guarantees to call back when this mapping is invalidated.
 * (ie. upon COW or unmap.)
 *
 * Started by Ingo Molnar, Copyright (C) 2002
 */

#include <linux/mm.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/vcache.h>

#define VCACHE_HASHBITS 8
#define VCACHE_HASHSIZE (1 << VCACHE_HASHBITS)

spinlock_t vcache_lock = SPIN_LOCK_UNLOCKED;

static struct list_head hash[VCACHE_HASHSIZE];

static struct list_head *hash_vcache(unsigned long address,
                                     struct mm_struct *mm)
{
        return &hash[hash_long(address + (unsigned long)mm, VCACHE_HASHBITS)];
}

void __attach_vcache(vcache_t *vcache,
                unsigned long address,
                struct mm_struct *mm,
                void (*callback)(struct vcache_s *data, struct page *new))
{
        struct list_head *hash_head;

        address &= PAGE_MASK;
        vcache->address = address;
        vcache->mm = mm;
        vcache->callback = callback;

        hash_head = hash_vcache(address, mm);

        list_add_tail(&vcache->hash_entry, hash_head);
}

void __detach_vcache(vcache_t *vcache)
{
        list_del_init(&vcache->hash_entry);
}

void invalidate_vcache(unsigned long address, struct mm_struct *mm,
                       struct page *new_page)
{
        struct list_head *l, *hash_head;
        vcache_t *vcache;

        address &= PAGE_MASK;

        hash_head = hash_vcache(address, mm);
        /*
         * This is safe, because this path is called with the pagetable
         * lock held. So while other mm's might add new entries in
         * parallel, *this* mm is locked out, so if the list is empty
         * now then we do not have to take the vcache lock to see it's
         * really empty.
         */
        if (likely(list_empty(hash_head)))
                return;

        spin_lock(&vcache_lock);
        list_for_each(l, hash_head) {
                vcache = list_entry(l, vcache_t, hash_entry);
                if (vcache->address != address || vcache->mm != mm)
                        continue;
                vcache->callback(vcache, new_page);
        }
        spin_unlock(&vcache_lock);
}

static int __init vcache_init(void)
{
        unsigned int i;

        for (i = 0; i < VCACHE_HASHSIZE; i++)
                INIT_LIST_HEAD(hash + i);
        return 0;
}

__initcall(vcache_init);