Commit adfc3ded authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-6.12/io_uring-discard-20240913' of git://git.kernel.dk/linux

Pull io_uring async discard support from Jens Axboe:
 "Sitting on top of both the 6.12 block and io_uring core branches,
  here's support for async discard through io_uring.

  This allows applications to issue async discards, rather than rely on
  the blocking sync ioctl discards we already have. The sync support is
  difficult to use outside of idle/cleanup periods.

  On a real (but slow) device, testing shows the following results when
  compared to sync discard:

	qd64 sync discard: 21K IOPS, lat avg 3 msec (max 21 msec)
	qd64 async discard: 76K IOPS, lat avg 845 usec (max 2.2 msec)

	qd64 sync discard: 14K IOPS, lat avg 5 msec (max 25 msec)
	qd64 async discard: 56K IOPS, lat avg 1153 usec (max 3.6 msec)

  and synthetic null_blk testing with the same queue depth and block
  size settings as above shows:

	Type    Trim size       IOPS    Lat avg (usec)  Lat Max (usec)
	==============================================================
	sync    4k               144K       444            20314
	async   4k              1353K        47              595
	sync    1M                56K      1136            21031
	async   1M                94K       680              760"

* tag 'for-6.12/io_uring-discard-20240913' of git://git.kernel.dk/linux:
  block: implement async io_uring discard cmd
  block: introduce blk_validate_byte_range()
  filemap: introduce filemap_invalidate_pages
  io_uring/cmd: give inline space in request to cmds
  io_uring/cmd: expose iowq to cmds
parents 26bb0d3f 50c52250
...@@ -609,6 +609,7 @@ blk_mode_t file_to_blk_mode(struct file *file); ...@@ -609,6 +609,7 @@ blk_mode_t file_to_blk_mode(struct file *file);
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
loff_t lstart, loff_t lend); loff_t lstart, loff_t lend);
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
extern const struct address_space_operations def_blk_aops; extern const struct address_space_operations def_blk_aops;
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/iomap.h> #include <linux/iomap.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/io_uring/cmd.h>
#include "blk.h" #include "blk.h"
static inline struct inode *bdev_file_inode(struct file *file) static inline struct inode *bdev_file_inode(struct file *file)
...@@ -865,6 +866,7 @@ const struct file_operations def_blk_fops = { ...@@ -865,6 +866,7 @@ const struct file_operations def_blk_fops = {
.splice_read = filemap_splice_read, .splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write, .splice_write = iter_file_splice_write,
.fallocate = blkdev_fallocate, .fallocate = blkdev_fallocate,
.uring_cmd = blkdev_uring_cmd,
.fop_flags = FOP_BUFFER_RASYNC, .fop_flags = FOP_BUFFER_RASYNC,
}; };
......
...@@ -11,6 +11,9 @@ ...@@ -11,6 +11,9 @@
#include <linux/blktrace_api.h> #include <linux/blktrace_api.h>
#include <linux/pr.h> #include <linux/pr.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <linux/pagemap.h>
#include <linux/io_uring/cmd.h>
#include <uapi/linux/blkdev.h>
#include "blk.h" #include "blk.h"
static int blkpg_do_ioctl(struct block_device *bdev, static int blkpg_do_ioctl(struct block_device *bdev,
...@@ -92,41 +95,54 @@ static int compat_blkpg_ioctl(struct block_device *bdev, ...@@ -92,41 +95,54 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
} }
#endif #endif
/*
 * Validate that the byte range [start, start + len) lies entirely within
 * the block device and is aligned to its logical block size, i.e. that
 * it translates cleanly into logical block addresses.
 */
static int blk_validate_byte_range(struct block_device *bdev,
				   uint64_t start, uint64_t len)
{
	unsigned int lbs_mask = bdev_logical_block_size(bdev) - 1;
	uint64_t end;

	/* Zero-length ranges are rejected outright. */
	if (!len)
		return -EINVAL;
	/* Both offset and length must be logical-block aligned. */
	if ((start | len) & lbs_mask)
		return -EINVAL;
	/* The range must not wrap nor extend past the device. */
	if (check_add_overflow(start, len, &end))
		return -EINVAL;
	if (end > bdev_nr_bytes(bdev))
		return -EINVAL;

	return 0;
}
static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
unsigned long arg) unsigned long arg)
{ {
unsigned int bs_mask = bdev_logical_block_size(bdev) - 1; uint64_t range[2], start, len;
uint64_t range[2], start, len, end;
struct bio *prev = NULL, *bio; struct bio *prev = NULL, *bio;
sector_t sector, nr_sects; sector_t sector, nr_sects;
struct blk_plug plug; struct blk_plug plug;
int err; int err;
if (!(mode & BLK_OPEN_WRITE))
return -EBADF;
if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
if (bdev_read_only(bdev))
return -EPERM;
if (copy_from_user(range, (void __user *)arg, sizeof(range))) if (copy_from_user(range, (void __user *)arg, sizeof(range)))
return -EFAULT; return -EFAULT;
start = range[0]; start = range[0];
len = range[1]; len = range[1];
if (!len) if (!bdev_max_discard_sectors(bdev))
return -EINVAL; return -EOPNOTSUPP;
if ((start | len) & bs_mask)
return -EINVAL;
if (check_add_overflow(start, len, &end) || if (!(mode & BLK_OPEN_WRITE))
end > bdev_nr_bytes(bdev)) return -EBADF;
return -EINVAL; if (bdev_read_only(bdev))
return -EPERM;
err = blk_validate_byte_range(bdev, start, len);
if (err)
return err;
filemap_invalidate_lock(bdev->bd_mapping); filemap_invalidate_lock(bdev->bd_mapping);
err = truncate_bdev_range(bdev, mode, start, end - 1); err = truncate_bdev_range(bdev, mode, start, start + len - 1);
if (err) if (err)
goto fail; goto fail;
...@@ -735,3 +751,112 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) ...@@ -735,3 +751,112 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return ret; return ret;
} }
#endif #endif
/*
 * Per-command state for block-device io_uring commands, held inline in
 * the io_uring_cmd pdu area (accessed via io_uring_cmd_to_pdu()).
 */
struct blk_iou_cmd {
	int res;	/* completion result; first bio error is kept */
	bool nowait;	/* issued with IO_URING_F_NONBLOCK set */
};
/* Task-work completion: post the result, or punt a nowait -EAGAIN to retry. */
static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);

	/*
	 * A nonblocking attempt that couldn't make progress is re-issued
	 * from a blocking context instead of completing with -EAGAIN.
	 */
	if (bic->res == -EAGAIN && bic->nowait)
		io_uring_cmd_issue_blocking(cmd);
	else
		io_uring_cmd_done(cmd, bic->res, 0, issue_flags);
}
/* Bio end_io: record the first error, then defer completion to task work. */
static void bio_cmd_bio_end_io(struct bio *bio)
{
	struct io_uring_cmd *cmd = bio->bi_private;
	struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);

	/* Keep the first error seen; don't overwrite an earlier one. */
	if (unlikely(bio->bi_status) && !bic->res)
		bic->res = blk_status_to_errno(bio->bi_status);

	/* Completion is posted from task context, not from here. */
	io_uring_cmd_do_in_task_lazy(cmd, blk_cmd_complete);
	bio_put(bio);
}
/*
 * Issue an async discard of [start, start + len) on @bdev on behalf of an
 * io_uring command.  Returns -EIOCBQUEUED once bios are in flight
 * (completion is reported via bio_cmd_bio_end_io()), or a negative errno
 * on immediate failure.  With @nowait, any operation that would block
 * returns -EAGAIN so the caller can retry from a blocking context.
 */
static int blkdev_cmd_discard(struct io_uring_cmd *cmd,
			      struct block_device *bdev,
			      uint64_t start, uint64_t len, bool nowait)
{
	struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
	gfp_t gfp = nowait ? GFP_NOWAIT : GFP_KERNEL;
	sector_t sector = start >> SECTOR_SHIFT;
	sector_t nr_sects = len >> SECTOR_SHIFT;
	struct bio *prev = NULL, *bio;
	int err;

	if (!bdev_max_discard_sectors(bdev))
		return -EOPNOTSUPP;
	if (!(file_to_blk_mode(cmd->file) & BLK_OPEN_WRITE))
		return -EBADF;
	if (bdev_read_only(bdev))
		return -EPERM;
	err = blk_validate_byte_range(bdev, start, len);
	if (err)
		return err;

	/* Invalidate the page cache over the range before discarding. */
	err = filemap_invalidate_pages(bdev->bd_mapping, start,
				       start + len - 1, nowait);
	if (err)
		return err;

	/* Cover the range with a chain of discard bios. */
	while (true) {
		bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, gfp);
		if (!bio)
			break;
		if (nowait) {
			/*
			 * Don't allow multi-bio non-blocking submissions as
			 * subsequent bios may fail but we won't get a direct
			 * indication of that. Normally, the caller should
			 * retry from a blocking context.
			 */
			if (unlikely(nr_sects)) {
				bio_put(bio);
				return -EAGAIN;
			}
			bio->bi_opf |= REQ_NOWAIT;
		}

		prev = bio_chain_and_submit(prev, bio);
	}
	/* Not even one bio allocated: ask for a blocking retry. */
	if (unlikely(!prev))
		return -EAGAIN;
	/* Allocation stopped short of the full range: flag for retry. */
	if (unlikely(nr_sects))
		bic->res = -EAGAIN;

	prev->bi_private = cmd;
	prev->bi_end_io = bio_cmd_bio_end_io;
	submit_bio(prev);
	return -EIOCBQUEUED;
}
/*
 * Entry point for IORING_OP_URING_CMD on block device files: validate the
 * SQE, then dispatch on the command opcode.
 */
int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
	struct block_device *bdev = I_BDEV(cmd->file->f_mapping->host);
	struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
	const struct io_uring_sqe *sqe = cmd->sqe;
	uint64_t off, nbytes;

	/* Reject SQEs with any field set that block commands don't use. */
	if (unlikely(sqe->ioprio || sqe->__pad1 || sqe->len ||
		     sqe->rw_flags || sqe->file_index))
		return -EINVAL;

	bic->res = 0;
	bic->nowait = issue_flags & IO_URING_F_NONBLOCK;

	/* Range is passed in addr (start) and addr3 (length). */
	off = READ_ONCE(sqe->addr);
	nbytes = READ_ONCE(sqe->addr3);

	if (cmd->cmd_op == BLOCK_URING_CMD_DISCARD)
		return blkdev_cmd_discard(cmd, bdev, off, nbytes, bic->nowait);

	return -EINVAL;
}
...@@ -23,6 +23,15 @@ static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe) ...@@ -23,6 +23,15 @@ static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
return sqe->cmd; return sqe->cmd;
} }
/* Compile-time check that a pdu type fits in io_uring_cmd's inline storage. */
static inline void io_uring_cmd_private_sz_check(size_t cmd_sz)
{
	BUILD_BUG_ON(cmd_sz > sizeof_field(struct io_uring_cmd, pdu));
}

/*
 * Cast the inline pdu area of @cmd to @pdu_type.  The comma-expression
 * first runs the size check, so an oversized type fails the build.
 */
#define io_uring_cmd_to_pdu(cmd, pdu_type) ( \
	io_uring_cmd_private_sz_check(sizeof(pdu_type)), \
	((pdu_type *)&(cmd)->pdu) \
)
#if defined(CONFIG_IO_URING) #if defined(CONFIG_IO_URING)
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd); struct iov_iter *iter, void *ioucmd);
...@@ -48,6 +57,9 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd, ...@@ -48,6 +57,9 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
unsigned int issue_flags); unsigned int issue_flags);
/* Execute the request from a blocking context */
void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd);
#else #else
static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
struct iov_iter *iter, void *ioucmd) struct iov_iter *iter, void *ioucmd)
...@@ -67,6 +79,9 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd, ...@@ -67,6 +79,9 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
unsigned int issue_flags) unsigned int issue_flags)
{ {
} }
static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
{
}
#endif #endif
/* /*
......
...@@ -32,6 +32,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping, ...@@ -32,6 +32,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end); pgoff_t start, pgoff_t end);
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count); int kiocb_invalidate_pages(struct kiocb *iocb, size_t count);
void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count); void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count);
int filemap_invalidate_pages(struct address_space *mapping,
loff_t pos, loff_t end, bool nowait);
int write_inode_now(struct inode *, int sync); int write_inode_now(struct inode *, int sync);
int filemap_fdatawrite(struct address_space *); int filemap_fdatawrite(struct address_space *);
......
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
#ifndef _UAPI_LINUX_BLKDEV_H
#define _UAPI_LINUX_BLKDEV_H

#include <linux/ioctl.h>
#include <linux/types.h>

/*
 * io_uring block file commands, see IORING_OP_URING_CMD.
 * It's a different number space from ioctl(), reuse the block's code 0x12.
 */
/* Async discard of a byte range: sqe->addr = start, sqe->addr3 = length. */
#define BLOCK_URING_CMD_DISCARD			_IO(0x12, 0)

#endif
...@@ -533,6 +533,17 @@ static void io_queue_iowq(struct io_kiocb *req) ...@@ -533,6 +533,17 @@ static void io_queue_iowq(struct io_kiocb *req)
io_queue_linked_timeout(link); io_queue_linked_timeout(link);
} }
/* Task-work callback: hand the request off to the io-wq worker pool. */
static void io_req_queue_iowq_tw(struct io_kiocb *req, struct io_tw_state *ts)
{
	io_queue_iowq(req);
}

/*
 * Queue @req for execution from a blocking (io-wq) context.  Goes via
 * task work rather than calling io_queue_iowq() directly — presumably so
 * the queueing runs with the owning task's context; confirm against
 * io_queue_iowq()'s requirements.
 */
void io_req_queue_iowq(struct io_kiocb *req)
{
	req->io_task_work.func = io_req_queue_iowq_tw;
	io_req_task_work_add(req);
}
static __cold void io_queue_deferred(struct io_ring_ctx *ctx) static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
{ {
while (!list_empty(&ctx->defer_list)) { while (!list_empty(&ctx->defer_list)) {
......
...@@ -94,6 +94,7 @@ int io_uring_alloc_task_context(struct task_struct *task, ...@@ -94,6 +94,7 @@ int io_uring_alloc_task_context(struct task_struct *task,
int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file, int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
int start, int end); int start, int end);
void io_req_queue_iowq(struct io_kiocb *req);
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts); int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr); int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
......
...@@ -277,6 +277,13 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw, ...@@ -277,6 +277,13 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
} }
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed); EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
/* Re-issue @ioucmd from a blocking context via the io-wq worker pool. */
void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
{
	io_req_queue_iowq(cmd_to_io_kiocb(ioucmd));
}
static inline int io_uring_cmd_getsockopt(struct socket *sock, static inline int io_uring_cmd_getsockopt(struct socket *sock,
struct io_uring_cmd *cmd, struct io_uring_cmd *cmd,
unsigned int issue_flags) unsigned int issue_flags)
......
...@@ -2712,14 +2712,12 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count) ...@@ -2712,14 +2712,12 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
} }
EXPORT_SYMBOL_GPL(kiocb_write_and_wait); EXPORT_SYMBOL_GPL(kiocb_write_and_wait);
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) int filemap_invalidate_pages(struct address_space *mapping,
loff_t pos, loff_t end, bool nowait)
{ {
struct address_space *mapping = iocb->ki_filp->f_mapping;
loff_t pos = iocb->ki_pos;
loff_t end = pos + count - 1;
int ret; int ret;
if (iocb->ki_flags & IOCB_NOWAIT) { if (nowait) {
/* we could block if there are any pages in the range */ /* we could block if there are any pages in the range */
if (filemap_range_has_page(mapping, pos, end)) if (filemap_range_has_page(mapping, pos, end))
return -EAGAIN; return -EAGAIN;
...@@ -2738,6 +2736,15 @@ int kiocb_invalidate_pages(struct kiocb *iocb, size_t count) ...@@ -2738,6 +2736,15 @@ int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
end >> PAGE_SHIFT); end >> PAGE_SHIFT);
} }
/*
 * kiocb front-end for filemap_invalidate_pages(): invalidate the page
 * cache over the @count bytes at the kiocb's current position, honouring
 * IOCB_NOWAIT.
 */
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
{
	loff_t pos = iocb->ki_pos;
	bool nowait = iocb->ki_flags & IOCB_NOWAIT;

	return filemap_invalidate_pages(iocb->ki_filp->f_mapping,
					pos, pos + count - 1, nowait);
}
EXPORT_SYMBOL_GPL(kiocb_invalidate_pages); EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);
/** /**
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment