Commit 84317297 authored by Matthew Wilcox, committed by Linus Torvalds

dax: fix race between simultaneous faults

If two threads write-fault on the same hole at the same time, the winner
of the race will return to userspace and complete their store, only to
have the loser overwrite their store with zeroes.  Fix this for now by
taking the i_mmap_sem for write instead of read, and do so outside the
call to get_block().  Now the loser of the race will see the block has
already been zeroed, and will not zero it again.

This severely limits our scalability.  I have ideas for improving it, but
those can wait for a later patch.
Signed-off-by: Matthew Wilcox <willy@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 01a33b4a
...@@ -272,7 +272,6 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh, ...@@ -272,7 +272,6 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh,
static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
struct vm_area_struct *vma, struct vm_fault *vmf) struct vm_area_struct *vma, struct vm_fault *vmf)
{ {
struct address_space *mapping = inode->i_mapping;
sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
unsigned long vaddr = (unsigned long)vmf->virtual_address; unsigned long vaddr = (unsigned long)vmf->virtual_address;
void *addr; void *addr;
...@@ -280,8 +279,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, ...@@ -280,8 +279,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
pgoff_t size; pgoff_t size;
int error; int error;
i_mmap_lock_read(mapping);
/* /*
* Check truncate didn't happen while we were allocating a block. * Check truncate didn't happen while we were allocating a block.
* If it did, this block may or may not be still allocated to the * If it did, this block may or may not be still allocated to the
...@@ -309,8 +306,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, ...@@ -309,8 +306,6 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
error = vm_insert_mixed(vma, vaddr, pfn); error = vm_insert_mixed(vma, vaddr, pfn);
out: out:
i_mmap_unlock_read(mapping);
return error; return error;
} }
...@@ -372,15 +367,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, ...@@ -372,15 +367,17 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
* from a read fault and we've raced with a truncate * from a read fault and we've raced with a truncate
*/ */
error = -EIO; error = -EIO;
goto unlock_page; goto unlock;
} }
} else {
i_mmap_lock_write(mapping);
} }
error = get_block(inode, block, &bh, 0); error = get_block(inode, block, &bh, 0);
if (!error && (bh.b_size < PAGE_SIZE)) if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO; /* fs corruption? */ error = -EIO; /* fs corruption? */
if (error) if (error)
goto unlock_page; goto unlock;
if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
if (vmf->flags & FAULT_FLAG_WRITE) { if (vmf->flags & FAULT_FLAG_WRITE) {
...@@ -391,8 +388,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, ...@@ -391,8 +388,9 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
if (!error && (bh.b_size < PAGE_SIZE)) if (!error && (bh.b_size < PAGE_SIZE))
error = -EIO; error = -EIO;
if (error) if (error)
goto unlock_page; goto unlock;
} else { } else {
i_mmap_unlock_write(mapping);
return dax_load_hole(mapping, page, vmf); return dax_load_hole(mapping, page, vmf);
} }
} }
...@@ -404,17 +402,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, ...@@ -404,17 +402,15 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
else else
clear_user_highpage(new_page, vaddr); clear_user_highpage(new_page, vaddr);
if (error) if (error)
goto unlock_page; goto unlock;
vmf->page = page; vmf->page = page;
if (!page) { if (!page) {
i_mmap_lock_read(mapping);
/* Check we didn't race with truncate */ /* Check we didn't race with truncate */
size = (i_size_read(inode) + PAGE_SIZE - 1) >> size = (i_size_read(inode) + PAGE_SIZE - 1) >>
PAGE_SHIFT; PAGE_SHIFT;
if (vmf->pgoff >= size) { if (vmf->pgoff >= size) {
i_mmap_unlock_read(mapping);
error = -EIO; error = -EIO;
goto out; goto unlock;
} }
} }
return VM_FAULT_LOCKED; return VM_FAULT_LOCKED;
...@@ -450,6 +446,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, ...@@ -450,6 +446,8 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
} }
if (!page)
i_mmap_unlock_write(mapping);
out: out:
if (error == -ENOMEM) if (error == -ENOMEM)
return VM_FAULT_OOM | major; return VM_FAULT_OOM | major;
...@@ -458,11 +456,14 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, ...@@ -458,11 +456,14 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
return VM_FAULT_SIGBUS | major; return VM_FAULT_SIGBUS | major;
return VM_FAULT_NOPAGE | major; return VM_FAULT_NOPAGE | major;
unlock_page: unlock:
if (page) { if (page) {
unlock_page(page); unlock_page(page);
page_cache_release(page); page_cache_release(page);
} else {
i_mmap_unlock_write(mapping);
} }
goto out; goto out;
} }
EXPORT_SYMBOL(__dax_fault); EXPORT_SYMBOL(__dax_fault);
...@@ -540,10 +541,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, ...@@ -540,10 +541,10 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
bh.b_size = PMD_SIZE; bh.b_size = PMD_SIZE;
i_mmap_lock_write(mapping);
length = get_block(inode, block, &bh, write); length = get_block(inode, block, &bh, write);
if (length) if (length)
return VM_FAULT_SIGBUS; return VM_FAULT_SIGBUS;
i_mmap_lock_read(mapping);
/* /*
* If the filesystem isn't willing to tell us the length of a hole, * If the filesystem isn't willing to tell us the length of a hole,
...@@ -607,11 +608,11 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, ...@@ -607,11 +608,11 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
} }
out: out:
i_mmap_unlock_read(mapping);
if (buffer_unwritten(&bh)) if (buffer_unwritten(&bh))
complete_unwritten(&bh, !(result & VM_FAULT_ERROR)); complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
i_mmap_unlock_write(mapping);
return result; return result;
fallback: fallback:
......
...@@ -2427,10 +2427,15 @@ void unmap_mapping_range(struct address_space *mapping, ...@@ -2427,10 +2427,15 @@ void unmap_mapping_range(struct address_space *mapping,
details.last_index = ULONG_MAX; details.last_index = ULONG_MAX;
/* DAX uses i_mmap_lock to serialise file truncate vs page fault */ /*
* DAX already holds i_mmap_lock to serialise file truncate vs
* page fault and page fault vs page fault.
*/
if (!IS_DAX(mapping->host))
i_mmap_lock_write(mapping); i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap))) if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details); unmap_mapping_range_tree(&mapping->i_mmap, &details);
if (!IS_DAX(mapping->host))
i_mmap_unlock_write(mapping); i_mmap_unlock_write(mapping);
} }
EXPORT_SYMBOL(unmap_mapping_range); EXPORT_SYMBOL(unmap_mapping_range);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment