Commit 98e24746 authored by Linus Torvalds

Merge tag 'for-5.20/io_uring-buffered-writes-2022-07-29' of git://git.kernel.dk/linux-block

Pull io_uring buffered writes support from Jens Axboe:
 "This contains support for buffered writes, specifically for XFS. btrfs
  is in progress, will be coming in the next release.

  io_uring does support buffered writes on any file type, but since the
  buffered write path always fails with -EAGAIN (or -EOPNOTSUPP) on any
  attempt to do so if IOCB_NOWAIT is set, any buffered write will effectively be
  handled by io-wq offload. This isn't very efficient, and we even have
  specific code in io-wq to serialize buffered writes to the same inode
  to avoid further inefficiencies with thread offload.

  This is particularly sad since most buffered writes don't block, they
  simply copy data to a page and dirty it. With this pull request, we
  can handle buffered writes a lot more efficiently.

  If balance_dirty_pages() needs to block, we back off on writes as
  indicated.

  This improves buffered write support by 2-3x.

  Jan Kara helped with the mm bits for this, and Stefan handled the
  fs/iomap/xfs/io_uring parts of it"

* tag 'for-5.20/io_uring-buffered-writes-2022-07-29' of git://git.kernel.dk/linux-block:
  mm: honor FGP_NOWAIT for page cache page allocation
  xfs: Add async buffered write support
  xfs: Specify lockmode when calling xfs_ilock_for_iomap()
  io_uring: Add tracepoint for short writes
  io_uring: fix issue with io_write() not always undoing sb_start_write()
  io_uring: Add support for async buffered writes
  fs: Add async write file modification handling.
  fs: Split off inode_needs_update_time and __file_update_time
  fs: add __remove_file_privs() with flags parameter
  fs: add a FMODE_BUF_WASYNC flags for f_mode
  iomap: Return -EAGAIN from iomap_write_iter()
  iomap: Add async buffered write support
  iomap: Add flags parameter to iomap_page_create()
  mm: Add balance_dirty_pages_ratelimited_flags() function
  mm: Move updates of dirty_exceeded into one place
  mm: Move starting of background writeback into the main balancing loop
parents b349b118 0dd316ba
...@@ -2010,67 +2010,57 @@ static int __remove_privs(struct user_namespace *mnt_userns, ...@@ -2010,67 +2010,57 @@ static int __remove_privs(struct user_namespace *mnt_userns,
return notify_change(mnt_userns, dentry, &newattrs, NULL); return notify_change(mnt_userns, dentry, &newattrs, NULL);
} }
/* static int __file_remove_privs(struct file *file, unsigned int flags)
* Remove special file priviledges (suid, capabilities) when file is written
* to or truncated.
*/
int file_remove_privs(struct file *file)
{ {
struct dentry *dentry = file_dentry(file); struct dentry *dentry = file_dentry(file);
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
int error;
int kill; int kill;
int error = 0;
/*
* Fast path for nothing security related.
* As well for non-regular files, e.g. blkdev inodes.
* For example, blkdev_write_iter() might get here
* trying to remove privs which it is not allowed to.
*/
if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode))
return 0; return 0;
kill = dentry_needs_remove_privs(dentry); kill = dentry_needs_remove_privs(dentry);
if (kill < 0) if (kill <= 0)
return kill; return kill;
if (kill)
if (flags & IOCB_NOWAIT)
return -EAGAIN;
error = __remove_privs(file_mnt_user_ns(file), dentry, kill); error = __remove_privs(file_mnt_user_ns(file), dentry, kill);
if (!error) if (!error)
inode_has_no_xattr(inode); inode_has_no_xattr(inode);
return error; return error;
} }
EXPORT_SYMBOL(file_remove_privs);
/** /**
* file_update_time - update mtime and ctime time * file_remove_privs - remove special file privileges (suid, capabilities)
* @file: file accessed * @file: file to remove privileges from
*
* When file is modified by a write or truncation ensure that special
* file privileges are removed.
* *
* Update the mtime and ctime members of an inode and mark the inode * Return: 0 on success, negative errno on failure.
* for writeback. Note that this function is meant exclusively for
* usage in the file write path of filesystems, and filesystems may
* choose to explicitly ignore update via this function with the
* S_NOCMTIME inode flag, e.g. for network filesystem where these
* timestamps are handled by the server. This can return an error for
* file systems who need to allocate space in order to update an inode.
*/ */
int file_remove_privs(struct file *file)
{
	/*
	 * No kiocb flags are passed from this entry point, so the
	 * IOCB_NOWAIT early-exit in __file_remove_privs() is never taken.
	 */
	const unsigned int no_flags = 0;

	return __file_remove_privs(file, no_flags);
}
EXPORT_SYMBOL(file_remove_privs);
int file_update_time(struct file *file) static int inode_needs_update_time(struct inode *inode, struct timespec64 *now)
{ {
struct inode *inode = file_inode(file);
struct timespec64 now;
int sync_it = 0; int sync_it = 0;
int ret;
/* First try to exhaust all avenues to not sync */ /* First try to exhaust all avenues to not sync */
if (IS_NOCMTIME(inode)) if (IS_NOCMTIME(inode))
return 0; return 0;
now = current_time(inode); if (!timespec64_equal(&inode->i_mtime, now))
if (!timespec64_equal(&inode->i_mtime, &now))
sync_it = S_MTIME; sync_it = S_MTIME;
if (!timespec64_equal(&inode->i_ctime, &now)) if (!timespec64_equal(&inode->i_ctime, now))
sync_it |= S_CTIME; sync_it |= S_CTIME;
if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode)) if (IS_I_VERSION(inode) && inode_iversion_need_inc(inode))
...@@ -2079,37 +2069,127 @@ int file_update_time(struct file *file) ...@@ -2079,37 +2069,127 @@ int file_update_time(struct file *file)
if (!sync_it) if (!sync_it)
return 0; return 0;
/* Finally allowed to write? Takes lock. */ return sync_it;
if (__mnt_want_write_file(file)) }
return 0;
ret = inode_update_time(inode, &now, sync_it); static int __file_update_time(struct file *file, struct timespec64 *now,
int sync_mode)
{
int ret = 0;
struct inode *inode = file_inode(file);
/* try to update time settings */
if (!__mnt_want_write_file(file)) {
ret = inode_update_time(inode, now, sync_mode);
__mnt_drop_write_file(file); __mnt_drop_write_file(file);
}
return ret; return ret;
} }
/**
 * file_update_time - update mtime and ctime time
 * @file: file accessed
 *
 * Bring the mtime and ctime of the inode behind @file up to the current
 * time and mark the inode for writeback. This helper is intended only
 * for the file write path of filesystems. A filesystem may opt out of
 * these updates with the S_NOCMTIME inode flag, e.g. a network
 * filesystem where the timestamps are handled by the server. An error
 * can be returned by file systems that need to allocate space in order
 * to update an inode.
 *
 * Return: 0 on success, negative errno on failure.
 */
int file_update_time(struct file *file)
{
	struct inode *inode = file_inode(file);
	struct timespec64 now = current_time(inode);
	int sync_mode = inode_needs_update_time(inode, &now);

	/* A positive value is the set of timestamp fields that must be synced. */
	if (sync_mode > 0)
		return __file_update_time(file, &now, sync_mode);

	/* Nothing to update (0), or a negative value passed straight through. */
	return sync_mode;
}
EXPORT_SYMBOL(file_update_time); EXPORT_SYMBOL(file_update_time);
/* Caller must hold the file's inode lock */ /**
int file_modified(struct file *file) * file_modified_flags - handle mandated vfs changes when modifying a file
* @file: file that was modified
* @flags: kiocb flags
*
* When file has been modified ensure that special
* file privileges are removed and time settings are updated.
*
* If IOCB_NOWAIT is set and file privileges would have to be removed or
* time settings updated, neither is done and -EAGAIN is returned instead.
*
* Context: Caller must hold the file's inode lock.
*
* Return: 0 on success, negative errno on failure.
*/
static int file_modified_flags(struct file *file, int flags)
{ {
int err; int ret;
struct inode *inode = file_inode(file);
struct timespec64 now = current_time(inode);
/* /*
* Clear the security bits if the process is not being run by root. * Clear the security bits if the process is not being run by root.
* This keeps people from modifying setuid and setgid binaries. * This keeps people from modifying setuid and setgid binaries.
*/ */
err = file_remove_privs(file); ret = __file_remove_privs(file, flags);
if (err) if (ret)
return err; return ret;
if (unlikely(file->f_mode & FMODE_NOCMTIME)) if (unlikely(file->f_mode & FMODE_NOCMTIME))
return 0; return 0;
return file_update_time(file); ret = inode_needs_update_time(inode, &now);
if (ret <= 0)
return ret;
if (flags & IOCB_NOWAIT)
return -EAGAIN;
return __file_update_time(file, &now, ret);
}
/**
* file_modified - handle mandated vfs changes when modifying a file
* @file: file that was modified
*
* When file has been modified ensure that special
* file privileges are removed and time settings are updated.
*
* Context: Caller must hold the file's inode lock.
*
* Return: 0 on success, negative errno on failure.
*/
int file_modified(struct file *file)
{
return file_modified_flags(file, 0);
} }
EXPORT_SYMBOL(file_modified); EXPORT_SYMBOL(file_modified);
/**
 * kiocb_modified - handle mandated vfs changes when modifying a file
 * @iocb: iocb describing the write; supplies the file and the kiocb flags
 *
 * Same handling as file_modified(), but the flags of @iocb are passed
 * along: special file privileges are removed and time settings are
 * updated for the file behind @iocb. If IOCB_NOWAIT is set in the iocb
 * flags and either of those changes is actually required, nothing is
 * changed and -EAGAIN is returned.
 *
 * Context: Caller must hold the file's inode lock.
 *
 * Return: 0 on success, negative errno on failure.
 */
int kiocb_modified(struct kiocb *iocb)
{
	struct file *file = iocb->ki_filp;
	int flags = iocb->ki_flags;

	return file_modified_flags(file, flags);
}
EXPORT_SYMBOL_GPL(kiocb_modified);
int inode_needs_sync(struct inode *inode) int inode_needs_sync(struct inode *inode)
{ {
if (IS_SYNC(inode)) if (IS_SYNC(inode))
......
...@@ -44,20 +44,28 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio) ...@@ -44,20 +44,28 @@ static inline struct iomap_page *to_iomap_page(struct folio *folio)
static struct bio_set iomap_ioend_bioset; static struct bio_set iomap_ioend_bioset;
static struct iomap_page * static struct iomap_page *
iomap_page_create(struct inode *inode, struct folio *folio) iomap_page_create(struct inode *inode, struct folio *folio, unsigned int flags)
{ {
struct iomap_page *iop = to_iomap_page(folio); struct iomap_page *iop = to_iomap_page(folio);
unsigned int nr_blocks = i_blocks_per_folio(inode, folio); unsigned int nr_blocks = i_blocks_per_folio(inode, folio);
gfp_t gfp;
if (iop || nr_blocks <= 1) if (iop || nr_blocks <= 1)
return iop; return iop;
if (flags & IOMAP_NOWAIT)
gfp = GFP_NOWAIT;
else
gfp = GFP_NOFS | __GFP_NOFAIL;
iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)), iop = kzalloc(struct_size(iop, uptodate, BITS_TO_LONGS(nr_blocks)),
GFP_NOFS | __GFP_NOFAIL); gfp);
if (iop) {
spin_lock_init(&iop->uptodate_lock); spin_lock_init(&iop->uptodate_lock);
if (folio_test_uptodate(folio)) if (folio_test_uptodate(folio))
bitmap_fill(iop->uptodate, nr_blocks); bitmap_fill(iop->uptodate, nr_blocks);
folio_attach_private(folio, iop); folio_attach_private(folio, iop);
}
return iop; return iop;
} }
...@@ -226,7 +234,7 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, ...@@ -226,7 +234,7 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
if (WARN_ON_ONCE(size > iomap->length)) if (WARN_ON_ONCE(size > iomap->length))
return -EIO; return -EIO;
if (offset > 0) if (offset > 0)
iop = iomap_page_create(iter->inode, folio); iop = iomap_page_create(iter->inode, folio, iter->flags);
else else
iop = to_iomap_page(folio); iop = to_iomap_page(folio);
...@@ -264,7 +272,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, ...@@ -264,7 +272,7 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter,
return iomap_read_inline_data(iter, folio); return iomap_read_inline_data(iter, folio);
/* zero post-eof blocks as the page may be mapped */ /* zero post-eof blocks as the page may be mapped */
iop = iomap_page_create(iter->inode, folio); iop = iomap_page_create(iter->inode, folio, iter->flags);
iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen); iomap_adjust_read_range(iter->inode, folio, &pos, length, &poff, &plen);
if (plen == 0) if (plen == 0)
goto done; goto done;
...@@ -547,10 +555,11 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, ...@@ -547,10 +555,11 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
size_t len, struct folio *folio) size_t len, struct folio *folio)
{ {
const struct iomap *srcmap = iomap_iter_srcmap(iter); const struct iomap *srcmap = iomap_iter_srcmap(iter);
struct iomap_page *iop = iomap_page_create(iter->inode, folio); struct iomap_page *iop;
loff_t block_size = i_blocksize(iter->inode); loff_t block_size = i_blocksize(iter->inode);
loff_t block_start = round_down(pos, block_size); loff_t block_start = round_down(pos, block_size);
loff_t block_end = round_up(pos + len, block_size); loff_t block_end = round_up(pos + len, block_size);
unsigned int nr_blocks = i_blocks_per_folio(iter->inode, folio);
size_t from = offset_in_folio(folio, pos), to = from + len; size_t from = offset_in_folio(folio, pos), to = from + len;
size_t poff, plen; size_t poff, plen;
...@@ -558,6 +567,10 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, ...@@ -558,6 +567,10 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
return 0; return 0;
folio_clear_error(folio); folio_clear_error(folio);
iop = iomap_page_create(iter->inode, folio, iter->flags);
if ((iter->flags & IOMAP_NOWAIT) && !iop && nr_blocks > 1)
return -EAGAIN;
do { do {
iomap_adjust_read_range(iter->inode, folio, &block_start, iomap_adjust_read_range(iter->inode, folio, &block_start,
block_end - block_start, &poff, &plen); block_end - block_start, &poff, &plen);
...@@ -574,7 +587,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, ...@@ -574,7 +587,12 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
return -EIO; return -EIO;
folio_zero_segments(folio, poff, from, to, poff + plen); folio_zero_segments(folio, poff, from, to, poff + plen);
} else { } else {
int status = iomap_read_folio_sync(block_start, folio, int status;
if (iter->flags & IOMAP_NOWAIT)
return -EAGAIN;
status = iomap_read_folio_sync(block_start, folio,
poff, plen, srcmap); poff, plen, srcmap);
if (status) if (status)
return status; return status;
...@@ -603,6 +621,9 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, ...@@ -603,6 +621,9 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS; unsigned fgp = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE | FGP_NOFS;
int status = 0; int status = 0;
if (iter->flags & IOMAP_NOWAIT)
fgp |= FGP_NOWAIT;
BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length);
if (srcmap != &iter->iomap) if (srcmap != &iter->iomap)
BUG_ON(pos + len > srcmap->offset + srcmap->length); BUG_ON(pos + len > srcmap->offset + srcmap->length);
...@@ -622,7 +643,7 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos, ...@@ -622,7 +643,7 @@ static int iomap_write_begin(const struct iomap_iter *iter, loff_t pos,
folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
fgp, mapping_gfp_mask(iter->inode->i_mapping)); fgp, mapping_gfp_mask(iter->inode->i_mapping));
if (!folio) { if (!folio) {
status = -ENOMEM; status = (iter->flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOMEM;
goto out_no_page; goto out_no_page;
} }
if (pos + len > folio_pos(folio) + folio_size(folio)) if (pos + len > folio_pos(folio) + folio_size(folio))
...@@ -740,6 +761,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) ...@@ -740,6 +761,8 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
loff_t pos = iter->pos; loff_t pos = iter->pos;
ssize_t written = 0; ssize_t written = 0;
long status = 0; long status = 0;
struct address_space *mapping = iter->inode->i_mapping;
unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? BDP_ASYNC : 0;
do { do {
struct folio *folio; struct folio *folio;
...@@ -752,6 +775,11 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) ...@@ -752,6 +775,11 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
bytes = min_t(unsigned long, PAGE_SIZE - offset, bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_count(i)); iov_iter_count(i));
again: again:
status = balance_dirty_pages_ratelimited_flags(mapping,
bdp_flags);
if (unlikely(status))
break;
if (bytes > length) if (bytes > length)
bytes = length; bytes = length;
...@@ -760,6 +788,10 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) ...@@ -760,6 +788,10 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
* Otherwise there's a nasty deadlock on copying from the * Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked * same page as we're writing to, without it being marked
* up-to-date. * up-to-date.
*
* For async buffered writes the assumption is that the user
* page has already been faulted in. This can be optimized by
* faulting the user page.
*/ */
if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) { if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
status = -EFAULT; status = -EFAULT;
...@@ -771,7 +803,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) ...@@ -771,7 +803,7 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
break; break;
page = folio_file_page(folio, pos >> PAGE_SHIFT); page = folio_file_page(folio, pos >> PAGE_SHIFT);
if (mapping_writably_mapped(iter->inode->i_mapping)) if (mapping_writably_mapped(mapping))
flush_dcache_page(page); flush_dcache_page(page);
copied = copy_page_from_iter_atomic(page, offset, bytes, i); copied = copy_page_from_iter_atomic(page, offset, bytes, i);
...@@ -796,10 +828,12 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) ...@@ -796,10 +828,12 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i)
pos += status; pos += status;
written += status; written += status;
length -= status; length -= status;
balance_dirty_pages_ratelimited(iter->inode->i_mapping);
} while (iov_iter_count(i) && length); } while (iov_iter_count(i) && length);
if (status == -EAGAIN) {
iov_iter_revert(i, written);
return -EAGAIN;
}
return written ? written : status; return written ? written : status;
} }
...@@ -815,6 +849,9 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, ...@@ -815,6 +849,9 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i,
}; };
int ret; int ret;
if (iocb->ki_flags & IOCB_NOWAIT)
iter.flags |= IOMAP_NOWAIT;
while ((ret = iomap_iter(&iter, ops)) > 0) while ((ret = iomap_iter(&iter, ops)) > 0)
iter.processed = iomap_write_iter(&iter, i); iter.processed = iomap_write_iter(&iter, i);
if (iter.pos == iocb->ki_pos) if (iter.pos == iocb->ki_pos)
...@@ -1329,7 +1366,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc, ...@@ -1329,7 +1366,7 @@ iomap_writepage_map(struct iomap_writepage_ctx *wpc,
struct writeback_control *wbc, struct inode *inode, struct writeback_control *wbc, struct inode *inode,
struct folio *folio, u64 end_pos) struct folio *folio, u64 end_pos)
{ {
struct iomap_page *iop = iomap_page_create(inode, folio); struct iomap_page *iop = iomap_page_create(inode, folio, 0);
struct iomap_ioend *ioend, *next; struct iomap_ioend *ioend, *next;
unsigned len = i_blocksize(inode); unsigned len = i_blocksize(inode);
unsigned nblocks = i_blocks_per_folio(inode, folio); unsigned nblocks = i_blocks_per_folio(inode, folio);
......
...@@ -1663,7 +1663,9 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count) ...@@ -1663,7 +1663,9 @@ int generic_write_checks_count(struct kiocb *iocb, loff_t *count)
if (iocb->ki_flags & IOCB_APPEND) if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode); iocb->ki_pos = i_size_read(inode);
if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT)) if ((iocb->ki_flags & IOCB_NOWAIT) &&
!((iocb->ki_flags & IOCB_DIRECT) ||
(file->f_mode & FMODE_BUF_WASYNC)))
return -EINVAL; return -EINVAL;
return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count); return generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, count);
......
...@@ -410,7 +410,7 @@ xfs_file_write_checks( ...@@ -410,7 +410,7 @@ xfs_file_write_checks(
spin_unlock(&ip->i_flags_lock); spin_unlock(&ip->i_flags_lock);
out: out:
return file_modified(file); return kiocb_modified(iocb);
} }
static int static int
...@@ -700,12 +700,11 @@ xfs_file_buffered_write( ...@@ -700,12 +700,11 @@ xfs_file_buffered_write(
bool cleared_space = false; bool cleared_space = false;
unsigned int iolock; unsigned int iolock;
if (iocb->ki_flags & IOCB_NOWAIT)
return -EOPNOTSUPP;
write_retry: write_retry:
iolock = XFS_IOLOCK_EXCL; iolock = XFS_IOLOCK_EXCL;
xfs_ilock(ip, iolock); ret = xfs_ilock_iocb(iocb, iolock);
if (ret)
return ret;
ret = xfs_file_write_checks(iocb, from, &iolock); ret = xfs_file_write_checks(iocb, from, &iolock);
if (ret) if (ret)
...@@ -1165,7 +1164,7 @@ xfs_file_open( ...@@ -1165,7 +1164,7 @@ xfs_file_open(
{ {
if (xfs_is_shutdown(XFS_M(inode->i_sb))) if (xfs_is_shutdown(XFS_M(inode->i_sb)))
return -EIO; return -EIO;
file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
return generic_file_open(inode, file); return generic_file_open(inode, file);
} }
......
...@@ -664,7 +664,7 @@ xfs_ilock_for_iomap( ...@@ -664,7 +664,7 @@ xfs_ilock_for_iomap(
unsigned flags, unsigned flags,
unsigned *lockmode) unsigned *lockmode)
{ {
unsigned mode = XFS_ILOCK_SHARED; unsigned int mode = *lockmode;
bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO); bool is_write = flags & (IOMAP_WRITE | IOMAP_ZERO);
/* /*
...@@ -742,7 +742,7 @@ xfs_direct_write_iomap_begin( ...@@ -742,7 +742,7 @@ xfs_direct_write_iomap_begin(
int nimaps = 1, error = 0; int nimaps = 1, error = 0;
bool shared = false; bool shared = false;
u16 iomap_flags = 0; u16 iomap_flags = 0;
unsigned lockmode; unsigned int lockmode = XFS_ILOCK_SHARED;
ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO)); ASSERT(flags & (IOMAP_WRITE | IOMAP_ZERO));
...@@ -886,6 +886,7 @@ xfs_buffered_write_iomap_begin( ...@@ -886,6 +886,7 @@ xfs_buffered_write_iomap_begin(
bool eof = false, cow_eof = false, shared = false; bool eof = false, cow_eof = false, shared = false;
int allocfork = XFS_DATA_FORK; int allocfork = XFS_DATA_FORK;
int error = 0; int error = 0;
unsigned int lockmode = XFS_ILOCK_EXCL;
if (xfs_is_shutdown(mp)) if (xfs_is_shutdown(mp))
return -EIO; return -EIO;
...@@ -897,7 +898,9 @@ xfs_buffered_write_iomap_begin( ...@@ -897,7 +898,9 @@ xfs_buffered_write_iomap_begin(
ASSERT(!XFS_IS_REALTIME_INODE(ip)); ASSERT(!XFS_IS_REALTIME_INODE(ip));
xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_ilock_for_iomap(ip, flags, &lockmode);
if (error)
return error;
if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) ||
XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) {
...@@ -1172,7 +1175,7 @@ xfs_read_iomap_begin( ...@@ -1172,7 +1175,7 @@ xfs_read_iomap_begin(
xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length);
int nimaps = 1, error = 0; int nimaps = 1, error = 0;
bool shared = false; bool shared = false;
unsigned lockmode; unsigned int lockmode = XFS_ILOCK_SHARED;
ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO))); ASSERT(!(flags & (IOMAP_WRITE | IOMAP_ZERO)));
......
...@@ -180,6 +180,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ...@@ -180,6 +180,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
/* File supports async buffered reads */ /* File supports async buffered reads */
#define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000) #define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000)
/* File supports async nowait buffered writes */
#define FMODE_BUF_WASYNC ((__force fmode_t)0x80000000)
/* /*
* Attribute flags. These should be or-ed together to figure out what * Attribute flags. These should be or-ed together to figure out what
* has been changed! * has been changed!
...@@ -2515,6 +2518,7 @@ static inline void file_accessed(struct file *file) ...@@ -2515,6 +2518,7 @@ static inline void file_accessed(struct file *file)
} }
extern int file_modified(struct file *file); extern int file_modified(struct file *file);
int kiocb_modified(struct kiocb *iocb);
int sync_inode_metadata(struct inode *inode, int wait); int sync_inode_metadata(struct inode *inode, int wait);
......
...@@ -364,7 +364,14 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); ...@@ -364,7 +364,14 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
void wb_update_bandwidth(struct bdi_writeback *wb); void wb_update_bandwidth(struct bdi_writeback *wb);
/* Invoke balance dirty pages in async mode. */
#define BDP_ASYNC 0x0001
void balance_dirty_pages_ratelimited(struct address_space *mapping); void balance_dirty_pages_ratelimited(struct address_space *mapping);
int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
unsigned int flags);
bool wb_over_bg_thresh(struct bdi_writeback *wb); bool wb_over_bg_thresh(struct bdi_writeback *wb);
typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
......
...@@ -630,6 +630,31 @@ TRACE_EVENT(io_uring_task_work_run, ...@@ -630,6 +630,31 @@ TRACE_EVENT(io_uring_task_work_run,
__entry->tctx, __entry->count, __entry->loops) __entry->tctx, __entry->count, __entry->loops)
); );
/*
 * io_uring_short_write - emitted when a write transferred fewer bytes
 *                        than were requested
 *
 * @ctx:	ring context pointer supplied by the caller
 * @fpos:	file position supplied by the caller for the write
 * @wanted:	number of bytes the write was expected to transfer
 * @got:	number of bytes actually transferred
 *
 * All three counters are u64, so they are printed unsigned (%llu);
 * a signed conversion would show values above LLONG_MAX as negative.
 */
TRACE_EVENT(io_uring_short_write,

	TP_PROTO(void *ctx, u64 fpos, u64 wanted, u64 got),

	TP_ARGS(ctx, fpos, wanted, got),

	TP_STRUCT__entry(
		__field(void *,	ctx)
		__field(u64,	fpos)
		__field(u64,	wanted)
		__field(u64,	got)
	),

	TP_fast_assign(
		__entry->ctx	= ctx;
		__entry->fpos	= fpos;
		__entry->wanted	= wanted;
		__entry->got	= got;
	),

	TP_printk("ring %p, fpos %llu, wanted %llu, got %llu",
		  __entry->ctx, __entry->fpos,
		  __entry->wanted, __entry->got)
);
#endif /* _TRACE_IO_URING_H */ #endif /* _TRACE_IO_URING_H */
/* This part must be outside protection */ /* This part must be outside protection */
......
...@@ -641,7 +641,7 @@ static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter) ...@@ -641,7 +641,7 @@ static inline int io_iter_do_read(struct io_rw *rw, struct iov_iter *iter)
return -EINVAL; return -EINVAL;
} }
static bool need_read_all(struct io_kiocb *req) static bool need_complete_io(struct io_kiocb *req)
{ {
return req->flags & REQ_F_ISREG || return req->flags & REQ_F_ISREG ||
S_ISBLK(file_inode(req->file)->i_mode); S_ISBLK(file_inode(req->file)->i_mode);
...@@ -775,7 +775,7 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags) ...@@ -775,7 +775,7 @@ int io_read(struct io_kiocb *req, unsigned int issue_flags)
kfree(iovec); kfree(iovec);
return IOU_ISSUE_SKIP_COMPLETE; return IOU_ISSUE_SKIP_COMPLETE;
} else if (ret == req->cqe.res || ret <= 0 || !force_nonblock || } else if (ret == req->cqe.res || ret <= 0 || !force_nonblock ||
(req->flags & REQ_F_NOWAIT) || !need_read_all(req)) { (req->flags & REQ_F_NOWAIT) || !need_complete_io(req)) {
/* read all, failed, already did sync or don't want to retry */ /* read all, failed, already did sync or don't want to retry */
goto done; goto done;
} }
...@@ -870,8 +870,9 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ...@@ -870,8 +870,9 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!io_file_supports_nowait(req))) if (unlikely(!io_file_supports_nowait(req)))
goto copy_iov; goto copy_iov;
/* file path doesn't support NOWAIT for non-direct_IO */ /* File path supports NOWAIT for non-direct_IO only for block devices. */
if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT) && if (!(kiocb->ki_flags & IOCB_DIRECT) &&
!(kiocb->ki_filp->f_mode & FMODE_BUF_WASYNC) &&
(req->flags & REQ_F_ISREG)) (req->flags & REQ_F_ISREG))
goto copy_iov; goto copy_iov;
...@@ -928,13 +929,41 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags) ...@@ -928,13 +929,41 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
/* IOPOLL retry should happen for io-wq threads */ /* IOPOLL retry should happen for io-wq threads */
if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL)) if (ret2 == -EAGAIN && (req->ctx->flags & IORING_SETUP_IOPOLL))
goto copy_iov; goto copy_iov;
if (ret2 != req->cqe.res && ret2 >= 0 && need_complete_io(req)) {
struct io_async_rw *rw;
trace_io_uring_short_write(req->ctx, kiocb->ki_pos - ret2,
req->cqe.res, ret2);
/* This is a partial write. The file pos has already been
* updated, setup the async struct to complete the request
* in the worker. Also update bytes_done to account for
* the bytes already written.
*/
iov_iter_save_state(&s->iter, &s->iter_state);
ret = io_setup_async_rw(req, iovec, s, true);
rw = req->async_data;
if (rw)
rw->bytes_done += ret2;
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
return ret ? ret : -EAGAIN;
}
done: done:
ret = kiocb_done(req, ret2, issue_flags); ret = kiocb_done(req, ret2, issue_flags);
} else { } else {
copy_iov: copy_iov:
iov_iter_restore(&s->iter, &s->iter_state); iov_iter_restore(&s->iter, &s->iter_state);
ret = io_setup_async_rw(req, iovec, s, false); ret = io_setup_async_rw(req, iovec, s, false);
return ret ?: -EAGAIN; if (!ret) {
if (kiocb->ki_flags & IOCB_WRITE)
kiocb_end_write(req);
return -EAGAIN;
}
return ret;
} }
/* it's reportedly faster than delegating the null check to kfree() */ /* it's reportedly faster than delegating the null check to kfree() */
if (iovec) if (iovec)
......
...@@ -1988,6 +1988,10 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index, ...@@ -1988,6 +1988,10 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
gfp |= __GFP_WRITE; gfp |= __GFP_WRITE;
if (fgp_flags & FGP_NOFS) if (fgp_flags & FGP_NOFS)
gfp &= ~__GFP_FS; gfp &= ~__GFP_FS;
if (fgp_flags & FGP_NOWAIT) {
gfp &= ~GFP_KERNEL;
gfp |= GFP_NOWAIT | __GFP_NOWARN;
}
folio = filemap_alloc_folio(gfp, 0); folio = filemap_alloc_folio(gfp, 0);
if (!folio) if (!folio)
......
...@@ -1554,8 +1554,8 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) ...@@ -1554,8 +1554,8 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
* If we're over `background_thresh' then the writeback threads are woken to * If we're over `background_thresh' then the writeback threads are woken to
* perform some writeout. * perform some writeout.
*/ */
static void balance_dirty_pages(struct bdi_writeback *wb, static int balance_dirty_pages(struct bdi_writeback *wb,
unsigned long pages_dirtied) unsigned long pages_dirtied, unsigned int flags)
{ {
struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
...@@ -1575,6 +1575,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, ...@@ -1575,6 +1575,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
struct backing_dev_info *bdi = wb->bdi; struct backing_dev_info *bdi = wb->bdi;
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT; bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies; unsigned long start_time = jiffies;
int ret = 0;
for (;;) { for (;;) {
unsigned long now = jiffies; unsigned long now = jiffies;
...@@ -1627,6 +1628,19 @@ static void balance_dirty_pages(struct bdi_writeback *wb, ...@@ -1627,6 +1628,19 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
} }
} }
/*
* In laptop mode, we wait until hitting the higher threshold
* before starting background writeout, and then write out all
* the way down to the lower threshold. So slow writers cause
* minimal disk activity.
*
* In normal mode, we start background writeout at the lower
* background_thresh, to keep the amount of dirty memory low.
*/
if (!laptop_mode && nr_reclaimable > gdtc->bg_thresh &&
!writeback_in_progress(wb))
wb_start_background_writeback(wb);
/* /*
* Throttle it only when the background writeback cannot * Throttle it only when the background writeback cannot
* catch-up. This avoids (excessively) small writeouts * catch-up. This avoids (excessively) small writeouts
...@@ -1657,6 +1671,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, ...@@ -1657,6 +1671,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
break; break;
} }
/* Start writeback even when in laptop mode */
if (unlikely(!writeback_in_progress(wb))) if (unlikely(!writeback_in_progress(wb)))
wb_start_background_writeback(wb); wb_start_background_writeback(wb);
...@@ -1715,8 +1730,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb, ...@@ -1715,8 +1730,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
sdtc = mdtc; sdtc = mdtc;
} }
if (dirty_exceeded && !wb->dirty_exceeded) if (dirty_exceeded != wb->dirty_exceeded)
wb->dirty_exceeded = 1; wb->dirty_exceeded = dirty_exceeded;
if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) +
BANDWIDTH_INTERVAL)) BANDWIDTH_INTERVAL))
...@@ -1789,6 +1804,10 @@ static void balance_dirty_pages(struct bdi_writeback *wb, ...@@ -1789,6 +1804,10 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
period, period,
pause, pause,
start_time); start_time);
if (flags & BDP_ASYNC) {
ret = -EAGAIN;
break;
}
__set_current_state(TASK_KILLABLE); __set_current_state(TASK_KILLABLE);
wb->dirty_sleep = now; wb->dirty_sleep = now;
io_schedule_timeout(pause); io_schedule_timeout(pause);
...@@ -1820,26 +1839,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, ...@@ -1820,26 +1839,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
if (fatal_signal_pending(current)) if (fatal_signal_pending(current))
break; break;
} }
return ret;
if (!dirty_exceeded && wb->dirty_exceeded)
wb->dirty_exceeded = 0;
if (writeback_in_progress(wb))
return;
/*
* In laptop mode, we wait until hitting the higher threshold before
* starting background writeout, and then write out all the way down
* to the lower threshold. So slow writers cause minimal disk activity.
*
* In normal mode, we start background writeout at the lower
* background_thresh, to keep the amount of dirty memory low.
*/
if (laptop_mode)
return;
if (nr_reclaimable > gdtc->bg_thresh)
wb_start_background_writeback(wb);
} }
static DEFINE_PER_CPU(int, bdp_ratelimits); static DEFINE_PER_CPU(int, bdp_ratelimits);
...@@ -1861,27 +1861,34 @@ static DEFINE_PER_CPU(int, bdp_ratelimits); ...@@ -1861,27 +1861,34 @@ static DEFINE_PER_CPU(int, bdp_ratelimits);
DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
/** /**
* balance_dirty_pages_ratelimited - balance dirty memory state * balance_dirty_pages_ratelimited_flags - Balance dirty memory state.
* @mapping: address_space which was dirtied * @mapping: address_space which was dirtied.
* @flags: BDP flags.
* *
* Processes which are dirtying memory should call in here once for each page * Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's * which was newly dirtied. The function will periodically check the system's
* dirty state and will initiate writeback if needed. * dirty state and will initiate writeback if needed.
* *
* Once we're over the dirty memory limit we decrease the ratelimiting * See balance_dirty_pages_ratelimited() for details.
* by a lot, to prevent individual processes from overshooting the limit *
* by (ratelimit_pages) each. * Return: If @flags contains BDP_ASYNC, it may return -EAGAIN to
* indicate that memory is out of balance and the caller must wait
* for I/O to complete. Otherwise, it will return 0 to indicate
* that either memory was already in balance, or it was able to sleep
* until the amount of dirty memory returned to balance.
*/ */
void balance_dirty_pages_ratelimited(struct address_space *mapping) int balance_dirty_pages_ratelimited_flags(struct address_space *mapping,
unsigned int flags)
{ {
struct inode *inode = mapping->host; struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode); struct backing_dev_info *bdi = inode_to_bdi(inode);
struct bdi_writeback *wb = NULL; struct bdi_writeback *wb = NULL;
int ratelimit; int ratelimit;
int ret = 0;
int *p; int *p;
if (!(bdi->capabilities & BDI_CAP_WRITEBACK)) if (!(bdi->capabilities & BDI_CAP_WRITEBACK))
return; return ret;
if (inode_cgwb_enabled(inode)) if (inode_cgwb_enabled(inode))
wb = wb_get_create_current(bdi, GFP_KERNEL); wb = wb_get_create_current(bdi, GFP_KERNEL);
...@@ -1921,9 +1928,27 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) ...@@ -1921,9 +1928,27 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
preempt_enable(); preempt_enable();
if (unlikely(current->nr_dirtied >= ratelimit)) if (unlikely(current->nr_dirtied >= ratelimit))
balance_dirty_pages(wb, current->nr_dirtied); ret = balance_dirty_pages(wb, current->nr_dirtied, flags);
wb_put(wb); wb_put(wb);
return ret;
}
/**
* balance_dirty_pages_ratelimited - balance dirty memory state.
* @mapping: address_space which was dirtied.
*
* Processes which are dirtying memory should call in here once for each page
* which was newly dirtied. The function will periodically check the system's
* dirty state and will initiate writeback if needed.
*
* Once we're over the dirty memory limit we decrease the ratelimiting
* by a lot, to prevent individual processes from overshooting the limit
* by (ratelimit_pages) each.
*
* This is a thin wrapper: it calls balance_dirty_pages_ratelimited_flags()
* with @flags set to 0 (so BDP_ASYNC is not requested) and, being void,
* does not hand that function's int return value back to the caller.
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
/* No BDP flags are passed; the int result is dropped here. */
balance_dirty_pages_ratelimited_flags(mapping, 0);
} }
EXPORT_SYMBOL(balance_dirty_pages_ratelimited); EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment