Commit 248766f0 authored by Jan Kara's avatar Jan Kara Committed by Sasha Levin

ext4: fix races between page faults and hole punching

Currently, page faults and hole punching are completely unsynchronized.
This can result in page fault faulting in a page into a range that we
are punching after truncate_pagecache_range() has been called and thus
we can end up with a page mapped to disk blocks that will be shortly
freed. Filesystem corruption will shortly follow. Note that the same
race is avoided for truncate by checking page fault offset against
i_size but there isn't similar mechanism available for punching holes.

Fix the problem by creating new rw semaphore i_mmap_sem in inode and
grab it for writing over truncate, hole punching, and other functions
removing blocks from extent tree and for read over page faults. We
cannot easily use i_data_sem for this since that ranks below transaction
start and we need something ranking above it so that it can be held over
the whole truncate / hole punching operation. Also remove various
workarounds we had in the code to reduce race window when page fault
could have created pages with stale mapping information.
Signed-off-by: default avatarJan Kara <jack@suse.com>
Signed-off-by: default avatarTheodore Ts'o <tytso@mit.edu>
Reviewed-by: default avatarMingming Cao <mingming.cao@oracle.com>
Signed-off-by: default avatarSasha Levin <sasha.levin@oracle.com>
parent 14b4d141
......@@ -873,6 +873,15 @@ struct ext4_inode_info {
* by other means, so we have i_data_sem.
*/
struct rw_semaphore i_data_sem;
/*
* i_mmap_sem is for serializing page faults with truncate / punch hole
* operations. We have to make sure that new page cannot be faulted in
* a section of the inode that is being punched. We cannot easily use
* i_data_sem for this since we need protection for the whole punch
* operation and i_data_sem ranks below transaction start so we have
* to occasionally drop it.
*/
struct rw_semaphore i_mmap_sem;
struct inode vfs_inode;
struct jbd2_inode *jinode;
......@@ -2287,6 +2296,7 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
extern void ext4_da_update_reserve_space(struct inode *inode,
int used, int quota_claim);
......
......@@ -4741,7 +4741,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
int partial_begin, partial_end;
loff_t start, end;
ext4_lblk_t lblk;
struct address_space *mapping = inode->i_mapping;
unsigned int blkbits = inode->i_blkbits;
trace_ext4_zero_range(inode, offset, len, mode);
......@@ -4756,17 +4755,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
return ret;
}
/*
* Write out all dirty pages to avoid race conditions
* Then release them.
*/
if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
ret = filemap_write_and_wait_range(mapping, offset,
offset + len - 1);
if (ret)
return ret;
}
/*
* Round up offset. This is not fallocate, we neet to zero out
* blocks, so convert interior block aligned part of the range to
......@@ -4827,16 +4815,22 @@ static long ext4_zero_range(struct file *file, loff_t offset,
flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
EXT4_EX_NOCACHE);
/* Now release the pages and zero block aligned part of pages*/
truncate_pagecache_range(inode, start, end - 1);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
/* Wait all existing dio workers, newcomers will block on i_mutex */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
/*
* Prevent page faults from reinstantiating pages we have
* released from page cache.
*/
down_write(&EXT4_I(inode)->i_mmap_sem);
/* Now release the pages and zero block aligned part of pages */
truncate_pagecache_range(inode, start, end - 1);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
flags, mode);
up_write(&EXT4_I(inode)->i_mmap_sem);
if (ret)
goto out_dio;
}
......@@ -5454,17 +5448,22 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
goto out_mutex;
}
truncate_pagecache(inode, ioffset);
/* Wait for existing dio to complete */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
*/
down_write(&EXT4_I(inode)->i_mmap_sem);
truncate_pagecache(inode, ioffset);
credits = ext4_writepage_trans_blocks(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
goto out_dio;
goto out_mmap;
}
down_write(&EXT4_I(inode)->i_data_sem);
......@@ -5503,7 +5502,8 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
out_stop:
ext4_journal_stop(handle);
out_dio:
out_mmap:
up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
mutex_unlock(&inode->i_mutex);
......
......@@ -213,7 +213,7 @@ static const struct vm_operations_struct ext4_dax_vm_ops = {
#endif
static const struct vm_operations_struct ext4_file_vm_ops = {
.fault = filemap_fault,
.fault = ext4_filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = ext4_page_mkwrite,
};
......
......@@ -3588,6 +3588,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
}
/* Wait all existing dio workers, newcomers will block on i_mutex */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
*/
down_write(&EXT4_I(inode)->i_mmap_sem);
first_block_offset = round_up(offset, sb->s_blocksize);
last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
......@@ -3596,10 +3605,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
truncate_pagecache_range(inode, first_block_offset,
last_block_offset);
/* Wait all existing dio workers, newcomers will block on i_mutex */
ext4_inode_block_unlocked_dio(inode);
inode_dio_wait(inode);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
credits = ext4_writepage_trans_blocks(inode);
else
......@@ -3645,16 +3650,12 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
if (IS_SYNC(inode))
ext4_handle_sync(handle);
/* Now release the pages again to reduce race window */
if (last_block_offset > first_block_offset)
truncate_pagecache_range(inode, first_block_offset,
last_block_offset);
inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
ext4_mark_inode_dirty(handle, inode);
out_stop:
ext4_journal_stop(handle);
out_dio:
up_write(&EXT4_I(inode)->i_mmap_sem);
ext4_inode_resume_unlocked_dio(inode);
out_mutex:
mutex_unlock(&inode->i_mutex);
......@@ -4775,11 +4776,13 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
} else
ext4_wait_for_tail_page_commit(inode);
}
down_write(&EXT4_I(inode)->i_mmap_sem);
/*
* Truncate pagecache after we've waited for commit
* in data=journal mode to make pages freeable.
*/
truncate_pagecache(inode, inode->i_size);
up_write(&EXT4_I(inode)->i_mmap_sem);
}
/*
* We want to call ext4_truncate() even if attr->ia_size ==
......@@ -5234,6 +5237,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
sb_start_pagefault(inode->i_sb);
file_update_time(vma->vm_file);
down_read(&EXT4_I(inode)->i_mmap_sem);
/* Delalloc case is easy... */
if (test_opt(inode->i_sb, DELALLOC) &&
!ext4_should_journal_data(inode) &&
......@@ -5303,6 +5308,19 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
out_ret:
ret = block_page_mkwrite_return(ret);
out:
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(inode->i_sb);
return ret;
}
int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
struct inode *inode = file_inode(vma->vm_file);
int err;
down_read(&EXT4_I(inode)->i_mmap_sem);
err = filemap_fault(vma, vmf);
up_read(&EXT4_I(inode)->i_mmap_sem);
return err;
}
......@@ -945,6 +945,7 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&ei->i_orphan);
init_rwsem(&ei->xattr_sem);
init_rwsem(&ei->i_data_sem);
init_rwsem(&ei->i_mmap_sem);
inode_init_once(&ei->vfs_inode);
}
......
......@@ -10,8 +10,10 @@
*/
static inline void ext4_truncate_failed_write(struct inode *inode)
{
down_write(&EXT4_I(inode)->i_mmap_sem);
truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
up_write(&EXT4_I(inode)->i_mmap_sem);
}
/*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment