Commit 332391a9, authored by Lukas Czerner, committed by Jens Axboe

fs: Fix page cache inconsistency when mixing buffered and AIO DIO

Currently, when mixing buffered reads and asynchronous direct writes, it
is possible to end up in a situation where we have stale data in the
page cache while the new data is already written to disk. This is
permanent until the affected pages are flushed away. Despite the fact
that mixing buffered and direct IO is ill-advised, it does pose a threat
to data integrity, is unexpected, and should be fixed.

Fix this by deferring completion of asynchronous direct writes to a
process context in the case that there are mapped pages to be found in
the inode. Later, before the completion in dio_complete(), invalidate
the pages in question. This ensures that after the completion the pages
in the written area are either unmapped or populated with up-to-date
data. Also do the same for the iomap case, which uses
iomap_dio_complete() instead.

This has a side effect of deferring the completion to a process context
for every AIO DIO that happens on an inode that has pages mapped. However,
since the consensus is that this is an ill-advised practice, the performance
implication should not be a problem.

This was based on a proposal from Jeff Moyer, thanks!
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
parent bb1cc747
...@@ -229,6 +229,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) ...@@ -229,6 +229,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
{ {
loff_t offset = dio->iocb->ki_pos; loff_t offset = dio->iocb->ki_pos;
ssize_t transferred = 0; ssize_t transferred = 0;
int err;
/* /*
* AIO submission can race with bio completion to get here while * AIO submission can race with bio completion to get here while
...@@ -258,8 +259,22 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async) ...@@ -258,8 +259,22 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
if (ret == 0) if (ret == 0)
ret = transferred; ret = transferred;
/*
* Try again to invalidate clean pages which might have been cached by
* non-direct readahead, or faulted in by get_user_pages() if the source
* of the write was an mmap'ed region of the file we're writing. Either
* one is a pretty crazy thing to do, so we don't support it 100%. If
* this invalidation fails, tough, the write still worked...
*/
if (ret > 0 && dio->op == REQ_OP_WRITE &&
dio->inode->i_mapping->nrpages) {
err = invalidate_inode_pages2_range(dio->inode->i_mapping,
offset >> PAGE_SHIFT,
(offset + ret - 1) >> PAGE_SHIFT);
WARN_ON_ONCE(err);
}
if (dio->end_io) { if (dio->end_io) {
int err;
// XXX: ki_pos?? // XXX: ki_pos??
err = dio->end_io(dio->iocb, offset, ret, dio->private); err = dio->end_io(dio->iocb, offset, ret, dio->private);
...@@ -304,6 +319,7 @@ static void dio_bio_end_aio(struct bio *bio) ...@@ -304,6 +319,7 @@ static void dio_bio_end_aio(struct bio *bio)
struct dio *dio = bio->bi_private; struct dio *dio = bio->bi_private;
unsigned long remaining; unsigned long remaining;
unsigned long flags; unsigned long flags;
bool defer_completion = false;
/* cleanup the bio */ /* cleanup the bio */
dio_bio_complete(dio, bio); dio_bio_complete(dio, bio);
...@@ -315,7 +331,19 @@ static void dio_bio_end_aio(struct bio *bio) ...@@ -315,7 +331,19 @@ static void dio_bio_end_aio(struct bio *bio)
spin_unlock_irqrestore(&dio->bio_lock, flags); spin_unlock_irqrestore(&dio->bio_lock, flags);
if (remaining == 0) { if (remaining == 0) {
if (dio->result && dio->defer_completion) { /*
* Defer completion when defer_completion is set or
* when the inode has pages mapped and this is AIO write.
* We need to invalidate those pages because there is a
* chance they contain stale data in the case buffered IO
* went in between AIO submission and completion into the
* same region.
*/
if (dio->result)
defer_completion = dio->defer_completion ||
(dio->op == REQ_OP_WRITE &&
dio->inode->i_mapping->nrpages);
if (defer_completion) {
INIT_WORK(&dio->complete_work, dio_aio_complete_work); INIT_WORK(&dio->complete_work, dio_aio_complete_work);
queue_work(dio->inode->i_sb->s_dio_done_wq, queue_work(dio->inode->i_sb->s_dio_done_wq,
&dio->complete_work); &dio->complete_work);
...@@ -1210,10 +1238,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, ...@@ -1210,10 +1238,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
* For AIO O_(D)SYNC writes we need to defer completions to a workqueue * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
* so that we can call ->fsync. * so that we can call ->fsync.
*/ */
if (dio->is_async && iov_iter_rw(iter) == WRITE && if (dio->is_async && iov_iter_rw(iter) == WRITE) {
((iocb->ki_filp->f_flags & O_DSYNC) || retval = 0;
IS_SYNC(iocb->ki_filp->f_mapping->host))) { if ((iocb->ki_filp->f_flags & O_DSYNC) ||
retval = dio_set_defer_completion(dio); IS_SYNC(iocb->ki_filp->f_mapping->host))
retval = dio_set_defer_completion(dio);
else if (!dio->inode->i_sb->s_dio_done_wq) {
/*
* In case of AIO write racing with buffered read we
* need to defer completion. We can't decide this now,
* however the workqueue needs to be initialized here.
*/
retval = sb_init_dio_done_wq(dio->inode->i_sb);
}
if (retval) { if (retval) {
/* /*
* We grab i_mutex only for reads so we don't have * We grab i_mutex only for reads so we don't have
......
...@@ -713,8 +713,24 @@ struct iomap_dio { ...@@ -713,8 +713,24 @@ struct iomap_dio {
static ssize_t iomap_dio_complete(struct iomap_dio *dio) static ssize_t iomap_dio_complete(struct iomap_dio *dio)
{ {
struct kiocb *iocb = dio->iocb; struct kiocb *iocb = dio->iocb;
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret; ssize_t ret;
/*
* Try again to invalidate clean pages which might have been cached by
* non-direct readahead, or faulted in by get_user_pages() if the source
* of the write was an mmap'ed region of the file we're writing. Either
* one is a pretty crazy thing to do, so we don't support it 100%. If
* this invalidation fails, tough, the write still worked...
*/
if (!dio->error &&
(dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
ret = invalidate_inode_pages2_range(inode->i_mapping,
iocb->ki_pos >> PAGE_SHIFT,
(iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
WARN_ON_ONCE(ret);
}
if (dio->end_io) { if (dio->end_io) {
ret = dio->end_io(iocb, ret = dio->end_io(iocb,
dio->error ? dio->error : dio->size, dio->error ? dio->error : dio->size,
...@@ -1042,19 +1058,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, ...@@ -1042,19 +1058,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
ret = iomap_dio_complete(dio); ret = iomap_dio_complete(dio);
/*
* Try again to invalidate clean pages which might have been cached by
* non-direct readahead, or faulted in by get_user_pages() if the source
* of the write was an mmap'ed region of the file we're writing. Either
* one is a pretty crazy thing to do, so we don't support it 100%. If
* this invalidation fails, tough, the write still worked...
*/
if (iov_iter_rw(iter) == WRITE) {
int err = invalidate_inode_pages2_range(mapping,
start >> PAGE_SHIFT, end >> PAGE_SHIFT);
WARN_ON_ONCE(err);
}
return ret; return ret;
out_free_dio: out_free_dio:
......
...@@ -2926,9 +2926,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) ...@@ -2926,9 +2926,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
* we're writing. Either one is a pretty crazy thing to do, * we're writing. Either one is a pretty crazy thing to do,
* so we don't support it 100%. If this invalidation * so we don't support it 100%. If this invalidation
* fails, tough, the write still worked... * fails, tough, the write still worked...
*
* Most of the time we do not need this since dio_complete() will do
* the invalidation for us. However there are some file systems that
* do not end up with dio_complete() being called, so let's not break
* them by removing it completely
*/ */
invalidate_inode_pages2_range(mapping, if (mapping->nrpages)
pos >> PAGE_SHIFT, end); invalidate_inode_pages2_range(mapping,
pos >> PAGE_SHIFT, end);
if (written > 0) { if (written > 0) {
pos += written; pos += written;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment