Commit 109811c2 authored by Jan Kara, committed by Theodore Ts'o

ext4: simplify io_end handling for AIO DIO

When mapping blocks for direct IO, we allocate an io_end structure before
mapping blocks and store a pointer to it in the inode. This creates a
requirement that any AIO DIO using io_end must be protected by i_mutex.
This created problems in the past with dioread_nolock mode, which was
corrupting io_end pointers. Also, io_end is allocated unnecessarily in
cases where we don't need to convert any extents (a common case, for
example, when overwriting a file).

We fix the problem by allocating io_end only once we return an unwritten
extent from the block mapping function for AIO DIO (so we also save some
pointless io_end allocations), and we pass a pointer to it in
bh->b_private, which the generic DIO code later passes to our end IO
callback. That way we remove any need for a global pointer to the io_end
structure and thus fix the races.
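
To make the hand-off concrete, here is a minimal, self-contained user-space
sketch of the pattern described above (toy types and names such as toy_bh and
toy_io_end are illustrative only, not the kernel's; the real implementation is
ext4_dio_get_block_unwritten_async() and ext4_end_io_dio() in the diff below):

    #include <stdio.h>
    #include <stdlib.h>
    #include <stdbool.h>

    /* Toy stand-ins for buffer_head and ext4_io_end_t. */
    struct toy_io_end {
            unsigned long ino;   /* which inode needs extent conversion */
    };

    struct toy_bh {
            bool unwritten;      /* block was mapped to an unwritten extent */
            void *b_private;     /* completion context handed to the end-IO callback */
    };

    /* Block-mapping step: allocate the io_end lazily, only for unwritten mappings. */
    static int toy_get_block_unwritten_async(unsigned long ino, struct toy_bh *bh)
    {
            bh->unwritten = true;                    /* pretend mapping returned unwritten */
            if (bh->unwritten && !bh->b_private) {
                    struct toy_io_end *io_end = malloc(sizeof(*io_end));

                    if (!io_end)
                            return -1;               /* -ENOMEM in the real code */
                    io_end->ino = ino;
                    bh->b_private = io_end;          /* generic DIO code carries this along */
            }
            return 0;
    }

    /* Completion callback: receives whatever get_block stashed in b_private. */
    static void toy_end_io(long long offset, long long size, void *private)
    {
            struct toy_io_end *io_end = private;

            if (!io_end)                             /* overwrite case: nothing to convert */
                    return;
            printf("convert extents of inode %lu: offset=%lld size=%lld\n",
                   io_end->ino, offset, size);
            free(io_end);
    }

    int main(void)
    {
            struct toy_bh bh = { 0 };

            if (toy_get_block_unwritten_async(42, &bh) == 0)
                    toy_end_io(0, 4096, bh.b_private);   /* the DIO layer does this hand-off */
            return 0;
    }

The point of the pattern is that no per-IO state lives in the inode any more:
the completion context travels with the buffer and the DIO, so concurrent AIO
DIOs no longer need i_mutex just to keep a single inode-wide io_end pointer
consistent.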

The downside of this change is that the check for unwritten IO in flight
in ext4_can_extents_be_merged() is more racy, since we now increment
i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after dropping i_data_sem.
However, the check was already racy before, because ext4_writepages()
already increments i_unwritten after dropping i_data_sem, and reserved
blocks save us from hitting ENOSPC in the worst case.
Signed-off-by: Jan Kara <jack@suse.cz>
parent efe70c29
@@ -1513,16 +1513,6 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
 	}
 }
 
-static inline ext4_io_end_t *ext4_inode_aio(struct inode *inode)
-{
-	return inode->i_private;
-}
-
-static inline void ext4_inode_aio_set(struct inode *inode, ext4_io_end_t *io)
-{
-	inode->i_private = io;
-}
-
 /*
  * Inode dynamic state flags
  */
...
@@ -1736,6 +1736,12 @@ ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1,
 	 */
 	if (ext1_ee_len + ext2_ee_len > EXT_INIT_MAX_LEN)
 		return 0;
+	/*
+	 * The check for IO to unwritten extent is somewhat racy as we
+	 * increment i_unwritten / set EXT4_STATE_DIO_UNWRITTEN only after
+	 * dropping i_data_sem. But reserved blocks should save us in that
+	 * case.
+	 */
 	if (ext4_ext_is_unwritten(ex1) &&
 	    (ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN) ||
 	     atomic_read(&EXT4_I(inode)->i_unwritten) ||
@@ -4007,7 +4013,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
 	struct ext4_ext_path *path = *ppath;
 	int ret = 0;
 	int err = 0;
-	ext4_io_end_t *io = ext4_inode_aio(inode);
 
 	ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical "
 		  "block %llu, max_blocks %u, flags %x, allocated %u\n",
@@ -4030,15 +4035,6 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
 					flags | EXT4_GET_BLOCKS_CONVERT);
 		if (ret <= 0)
 			goto out;
-		/*
-		 * Flag the inode(non aio case) or end_io struct (aio case)
-		 * that this IO needs to conversion to written when IO is
-		 * completed
-		 */
-		if (io)
-			ext4_set_io_unwritten_flag(inode, io);
-		else
-			ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 		map->m_flags |= EXT4_MAP_UNWRITTEN;
 		goto out;
 	}
@@ -4283,9 +4279,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	unsigned int allocated = 0, offset = 0;
 	unsigned int allocated_clusters = 0;
 	struct ext4_allocation_request ar;
-	ext4_io_end_t *io = ext4_inode_aio(inode);
 	ext4_lblk_t cluster_offset;
-	int set_unwritten = 0;
 	bool map_from_cluster = false;
 
 	ext_debug("blocks %u/%u requested for inode %lu\n",
@@ -4482,15 +4476,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT){
 		ext4_ext_mark_unwritten(&newex);
 		map->m_flags |= EXT4_MAP_UNWRITTEN;
-		/*
-		 * io_end structure was created for every IO write to an
-		 * unwritten extent. To avoid unnecessary conversion,
-		 * here we flag the IO that really needs the conversion.
-		 * For non asycn direct IO case, flag the inode state
-		 * that we need to perform conversion when IO is done.
-		 */
-		if (flags & EXT4_GET_BLOCKS_PRE_IO)
-			set_unwritten = 1;
 	}
 
 	err = 0;
@@ -4501,14 +4486,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 		err = ext4_ext_insert_extent(handle, inode, &path,
 					     &newex, flags);
 
-	if (!err && set_unwritten) {
-		if (io)
-			ext4_set_io_unwritten_flag(inode, io);
-		else
-			ext4_set_inode_state(inode,
-					     EXT4_STATE_DIO_UNWRITTEN);
-	}
-
 	if (err && free_on_err) {
 		int fb_flags = flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE ?
 			EXT4_FREE_BLOCKS_NO_QUOT_UPDATE : 0;
...
@@ -797,18 +797,16 @@ int ext4_dio_get_block(struct inode *inode, sector_t iblock,
 }
 
 /*
- * Get block function for DIO writes when we create unwritten extent if
+ * Get block function for AIO DIO writes when we create unwritten extent if
  * blocks are not allocated yet. The extent will be converted to written
  * after IO is complete.
  */
-static int ext4_dio_get_block_unwritten(struct inode *inode, sector_t iblock,
-		   struct buffer_head *bh_result, int create)
+static int ext4_dio_get_block_unwritten_async(struct inode *inode,
+		sector_t iblock, struct buffer_head *bh_result, int create)
 {
 	handle_t *handle;
 	int ret;
 
-	ext4_debug("ext4_dio_get_block_unwritten: inode %lu, create flag %d\n",
-		   inode->i_ino, create);
-
 	/* We don't expect handle for direct IO */
 	WARN_ON_ONCE(ext4_journal_current_handle());
@@ -818,16 +816,62 @@ static int ext4_dio_get_block_unwritten(struct inode *inode, sector_t iblock,
 	ret = _ext4_get_block(inode, iblock, bh_result,
 			      EXT4_GET_BLOCKS_IO_CREATE_EXT);
 	ext4_journal_stop(handle);
-	if (!ret && buffer_unwritten(bh_result)) {
-		ext4_io_end_t *io_end = ext4_inode_aio(inode);
 
+	/*
+	 * When doing DIO using unwritten extents, we need io_end to convert
+	 * unwritten extents to written on IO completion. We allocate io_end
+	 * once we spot unwritten extent and store it in b_private. Generic
+	 * DIO code keeps b_private set and furthermore passes the value to
+	 * our completion callback in 'private' argument.
+	 */
+	if (!ret && buffer_unwritten(bh_result)) {
+		if (!bh_result->b_private) {
+			ext4_io_end_t *io_end;
+
+			io_end = ext4_init_io_end(inode, GFP_KERNEL);
+			if (!io_end)
+				return -ENOMEM;
+			bh_result->b_private = io_end;
+			ext4_set_io_unwritten_flag(inode, io_end);
+		}
 		set_buffer_defer_completion(bh_result);
-		WARN_ON_ONCE(io_end && !(io_end->flag & EXT4_IO_END_UNWRITTEN));
 	}
+
 	return ret;
 }
 
+/*
+ * Get block function for non-AIO DIO writes when we create unwritten extent if
+ * blocks are not allocated yet. The extent will be converted to written
+ * after IO is complete from ext4_ext_direct_IO() function.
+ */
+static int ext4_dio_get_block_unwritten_sync(struct inode *inode,
+		sector_t iblock, struct buffer_head *bh_result, int create)
+{
+	handle_t *handle;
+	int ret;
+
+	/* We don't expect handle for direct IO */
+	WARN_ON_ONCE(ext4_journal_current_handle());
+
+	handle = start_dio_trans(inode, bh_result);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	ret = _ext4_get_block(inode, iblock, bh_result,
+			      EXT4_GET_BLOCKS_IO_CREATE_EXT);
+	ext4_journal_stop(handle);
+
+	/*
+	 * Mark inode as having pending DIO writes to unwritten extents.
+	 * ext4_ext_direct_IO() checks this flag and converts extents to
+	 * written.
+	 */
+	if (!ret && buffer_unwritten(bh_result))
+		ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
+
+	return ret;
+}
+
 static int ext4_dio_get_block_overwrite(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create)
 {
@@ -3243,7 +3287,7 @@ int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 			    ssize_t size, void *private)
 {
-	ext4_io_end_t *io_end = iocb->private;
+	ext4_io_end_t *io_end = private;
 
 	/* if not async direct IO just return */
 	if (!io_end)
@@ -3251,10 +3295,8 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 	ext_debug("ext4_end_io_dio(): io_end 0x%p "
 		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
-		  iocb->private, io_end->inode->i_ino, iocb, offset,
-		  size);
+		  io_end, io_end->inode->i_ino, iocb, offset, size);
 
-	iocb->private = NULL;
 	io_end->offset = offset;
 	io_end->size = size;
 	ext4_put_io_end(io_end);
@@ -3290,7 +3332,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	get_block_t *get_block_func = NULL;
 	int dio_flags = 0;
 	loff_t final_size = offset + count;
-	ext4_io_end_t *io_end = NULL;
 
 	/* Use the old path for reads and writes beyond i_size. */
 	if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size)
@@ -3315,16 +3356,17 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	/*
 	 * We could direct write to holes and fallocate.
 	 *
-	 * Allocated blocks to fill the hole are marked as
-	 * unwritten to prevent parallel buffered read to expose
-	 * the stale data before DIO complete the data IO.
+	 * Allocated blocks to fill the hole are marked as unwritten to prevent
+	 * parallel buffered read to expose the stale data before DIO complete
+	 * the data IO.
 	 *
-	 * As to previously fallocated extents, ext4 get_block will
-	 * just simply mark the buffer mapped but still keep the
-	 * extents unwritten.
+	 * As to previously fallocated extents, ext4 get_block will just simply
+	 * mark the buffer mapped but still keep the extents unwritten.
 	 *
-	 * For non AIO case, we will convert those unwritten extents
-	 * to written after return back from blockdev_direct_IO.
+	 * For non AIO case, we will convert those unwritten extents to written
+	 * after return back from blockdev_direct_IO. That way we save us from
+	 * allocating io_end structure and also the overhead of offloading
+	 * the extent convertion to a workqueue.
 	 *
 	 * For async DIO, the conversion needs to be deferred when the
 	 * IO is completed. The ext4 end_io callback function will be
@@ -3332,30 +3374,13 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	 * case, we allocate an io_end structure to hook to the iocb.
 	 */
 	iocb->private = NULL;
-	if (overwrite) {
+	if (overwrite)
 		get_block_func = ext4_dio_get_block_overwrite;
+	else if (is_sync_kiocb(iocb)) {
+		get_block_func = ext4_dio_get_block_unwritten_sync;
+		dio_flags = DIO_LOCKING;
 	} else {
-		ext4_inode_aio_set(inode, NULL);
-		if (!is_sync_kiocb(iocb)) {
-			io_end = ext4_init_io_end(inode, GFP_NOFS);
-			if (!io_end) {
-				ret = -ENOMEM;
-				goto retake_lock;
-			}
-			/*
-			 * Grab reference for DIO. Will be dropped in
-			 * ext4_end_io_dio()
-			 */
-			iocb->private = ext4_get_io_end(io_end);
-			/*
-			 * we save the io structure for current async direct
-			 * IO, so that later ext4_map_blocks() could flag the
-			 * io structure whether there is a unwritten extents
-			 * needs to be converted when IO is completed.
-			 */
-			ext4_inode_aio_set(inode, io_end);
-		}
-		get_block_func = ext4_dio_get_block_unwritten;
+		get_block_func = ext4_dio_get_block_unwritten_async;
 		dio_flags = DIO_LOCKING;
 	}
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
@@ -3370,27 +3395,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 				   get_block_func,
 				   ext4_end_io_dio, NULL, dio_flags);
 
-	/*
-	 * Put our reference to io_end. This can free the io_end structure e.g.
-	 * in sync IO case or in case of error. It can even perform extent
-	 * conversion if all bios we submitted finished before we got here.
-	 * Note that in that case iocb->private can be already set to NULL
-	 * here.
-	 */
-	if (io_end) {
-		ext4_inode_aio_set(inode, NULL);
-		ext4_put_io_end(io_end);
-		/*
-		 * When no IO was submitted ext4_end_io_dio() was not
-		 * called so we have to put iocb's reference.
-		 */
-		if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
-			WARN_ON(iocb->private != io_end);
-			WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
-			ext4_put_io_end(io_end);
-			iocb->private = NULL;
-		}
-	}
-
 	if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
 						EXT4_STATE_DIO_UNWRITTEN)) {
 		int err;
@@ -3405,7 +3409,6 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
 	}
 
-retake_lock:
 	if (iov_iter_rw(iter) == WRITE)
 		inode_dio_end(inode);
 	/* take i_mutex locking again if we do a ovewrite dio */
...