Commit 3e63f0be authored by Andrew Morton, committed by Linus Torvalds

[PATCH] Fix vmtruncate race and distributed filesystem race

From: Dave McCracken <dmccr@us.ibm.com>

This patch fixes the race between truncate and pagein which can cause stray
anon pages to appear in the truncated region.

The race occurs when a process is sleeping in pagein IO while the truncate
runs: i_size is checked before the sleep, leaving a window in which the
truncate can complete yet the paging-in process still decides that the page
is a valid one.
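
Schematically, one losing interleaving looks like this (illustrative only,
not taken from the code):

	faulting process			truncating process
	----------------			------------------
	->nopage() checks i_size,
	finds the page OK,
	sleeps in pagein IO
						vmtruncate() shrinks i_size,
						unmaps ptes and frees the
						pagecache pages (the not-yet-
						installed pte is missed)
	wakes, still believing the
	page is good, installs it
	in the pagetables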

This leaves an anon page in the pagetables, and if the file is subsequently
extended we have an anon page floating about inside a file-backed mmap - user
modifications will not be written out.

Apparently this is also needed for the implementation of POSIX semantics for
distributed filesystems.

We use a generation counter in the address_space so the paging-in process can
determine whether there was a truncate which might have shot the new page
down.

It's a bit grubby to be playing with files and inodes in do_no_page(), but we
do need the page_table_lock coverage for this check, and rearranging things to
provide that coverage in filemap_nopage() wasn't very nice either.
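
In outline, the fault path now works as follows (a condensed sketch of the
do_no_page() and invalidate_mmap_range() hunks in the diff below; pte setup
and error handling are elided):

	/* fault side (do_no_page) */
	sequence = atomic_read(&mapping->truncate_count);
	smp_rmb();	/* read the counter before the lock-free ->nopage() */
retry:
	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
	...
	spin_lock(&mm->page_table_lock);
	/* did a truncate bump the counter while we slept in ->nopage()? */
	if (unlikely(sequence != atomic_read(&mapping->truncate_count))) {
		sequence = atomic_read(&mapping->truncate_count);
		spin_unlock(&mm->page_table_lock);
		page_cache_release(new_page);	/* drop the possibly-stale page */
		goto retry;
	}
	/* same generation: safe to establish the pte under page_table_lock */

	/* truncate side (invalidate_mmap_range) */
	down(&mapping->i_shared_sem);
	atomic_inc(&mapping->truncate_count);	/* make concurrent faults retry */
	... unmap the affected mappings ...
	up(&mapping->i_shared_sem);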
parent 5096494f
@@ -1189,6 +1189,7 @@ static int __init init_blkmtd(void)
 	INIT_LIST_HEAD(&mtd_rawdevice->as.locked_pages);
 	mtd_rawdevice->as.host = NULL;
 	init_MUTEX(&(mtd_rawdevice->as.i_shared_sem));
+	atomic_set(&(mtd_rawdevice->as.truncate_count), 0);
 	mtd_rawdevice->as.a_ops = &blkmtd_aops;
 	INIT_LIST_HEAD(&mtd_rawdevice->as.i_mmap);
...
@@ -184,6 +184,7 @@ void inode_init_once(struct inode *inode)
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.page_lock);
 	init_MUTEX(&inode->i_data.i_shared_sem);
+	atomic_set(&inode->i_data.truncate_count, 0);
 	INIT_LIST_HEAD(&inode->i_data.private_list);
 	spin_lock_init(&inode->i_data.private_lock);
 	INIT_LIST_HEAD(&inode->i_data.i_mmap);
...
@@ -323,6 +323,7 @@ struct address_space {
 	struct list_head	i_mmap;		/* list of private mappings */
 	struct list_head	i_mmap_shared;	/* list of shared mappings */
 	struct semaphore	i_shared_sem;	/* protect both above lists */
+	atomic_t		truncate_count;	/* Cover race condition with truncate */
 	unsigned long		dirtied_when;	/* jiffies of first page dirtying */
 	int			gfp_mask;	/* how to allocate the pages */
 	struct backing_dev_info *backing_dev_info; /* device readahead, etc */
...
@@ -1126,6 +1126,8 @@ void invalidate_mmap_range(struct address_space *mapping,
 		hlen = ULONG_MAX - hba + 1;
 	}
 	down(&mapping->i_shared_sem);
+	/* Protect against page fault */
+	atomic_inc(&mapping->truncate_count);
 	if (unlikely(!list_empty(&mapping->i_mmap)))
 		invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
 	if (unlikely(!list_empty(&mapping->i_mmap_shared)))
@@ -1378,8 +1380,10 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long address, int write_access, pte_t *page_table, pmd_t *pmd)
 {
 	struct page * new_page;
+	struct address_space *mapping;
 	pte_t entry;
 	struct pte_chain *pte_chain;
+	int sequence;
 	int ret;
 
 	if (!vma->vm_ops || !vma->vm_ops->nopage)
@@ -1388,6 +1392,10 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_unmap(page_table);
 	spin_unlock(&mm->page_table_lock);
 
+	mapping = vma->vm_file->f_dentry->d_inode->i_mapping;
+	sequence = atomic_read(&mapping->truncate_count);
+	smp_rmb(); /* Prevent CPU from reordering lock-free ->nopage() */
+retry:
 	new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, 0);
 
 	/* no page was available -- either SIGBUS or OOM */
@@ -1416,6 +1424,17 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	spin_lock(&mm->page_table_lock);
+	/*
+	 * For a file-backed vma, someone could have truncated or otherwise
+	 * invalidated this page.  If invalidate_mmap_range got called,
+	 * retry getting the page.
+	 */
+	if (unlikely(sequence != atomic_read(&mapping->truncate_count))) {
+		sequence = atomic_read(&mapping->truncate_count);
+		spin_unlock(&mm->page_table_lock);
+		page_cache_release(new_page);
+		goto retry;
+	}
 	page_table = pte_offset_map(pmd, address);
 
 	/*
...
@@ -35,6 +35,7 @@ struct address_space swapper_space = {
 	.i_mmap		= LIST_HEAD_INIT(swapper_space.i_mmap),
 	.i_mmap_shared	= LIST_HEAD_INIT(swapper_space.i_mmap_shared),
 	.i_shared_sem	= __MUTEX_INITIALIZER(swapper_space.i_shared_sem),
+	.truncate_count	= ATOMIC_INIT(0),
 	.private_lock	= SPIN_LOCK_UNLOCKED,
 	.private_list	= LIST_HEAD_INIT(swapper_space.private_list),
 };