Commit 9feb1af9 authored by Linus Torvalds

Merge tag 'for-linus-20191205' of git://git.kernel.dk/linux-block

Pull more block and io_uring updates from Jens Axboe:
 "I wasn't expecting this to be so big, and if I was, I would have used
  separate branches for this. Going forward I'll be doing separate
  branches for the current tree, just like for the next kernel version
  tree. In any case, this contains:

   - Series from Christoph that fixes an inherent race condition with
     zoned devices and revalidation.

   - null_blk zone size fix (Damien)

   - Fix for a regression in this merge window that caused busy spins by
     sending empty disk uevents (Eric)

   - Fix for a regression in this merge window for bfq stats (Hou)

   - Fix for io_uring creds allocation failure handling (me)

   - io_uring -ERESTARTSYS send/recvmsg fix (me)

   - Series that fixes the need for applications to retain state across
     async request punts for io_uring. This one is a bit larger than I
     would have hoped, but I think it's important we get this fixed for
     5.5.

   - connect(2) improvement for io_uring, handling EINPROGRESS instead
     of requiring applications to poll for it (me)

   - Have io_uring use a hash for poll requests instead of an rbtree.
     This turned out to work much better in practice, so I think we
     should make the switch now. For some workloads, even with a fair
     amount of cancellations, the insertion sort is just too expensive.
     (me)

   - Various little io_uring fixes (me, Jackie, Pavel, LimingWu)

   - Fix for brd unaligned IO, and a warning for the future (Ming)

   - Fix for a bio integrity data leak (Justin)

   - bvec_iter_advance() improvement (Pavel)

   - Xen blkback page unmap fix (SeongJae)

  The major items in here are all well tested, and on the liburing side
  we continue to add regression and feature test cases. We're up to 50
  topic cases now, each with anywhere from 1 to more than 10 cases"

* tag 'for-linus-20191205' of git://git.kernel.dk/linux-block: (33 commits)
  block: fix memleak of bio integrity data
  io_uring: fix a typo in a comment
  bfq-iosched: Ensure bio->bi_blkg is valid before using it
  io_uring: hook all linked requests via link_list
  io_uring: fix error handling in io_queue_link_head
  io_uring: use hash table for poll command lookups
  io-wq: clear node->next on list deletion
  io_uring: ensure deferred timeouts copy necessary data
  io_uring: allow IO_SQE_* flags on IORING_OP_TIMEOUT
  null_blk: remove unused variable warning on !CONFIG_BLK_DEV_ZONED
  brd: warn on un-aligned buffer
  brd: remove max_hw_sectors queue limit
  xen/blkback: Avoid unmapping unmapped grant pages
  io_uring: handle connect -EINPROGRESS like -EAGAIN
  block: set the zone size in blk_revalidate_disk_zones atomically
  block: don't handle bio based drivers in blk_revalidate_disk_zones
  block: allocate the zone bitmaps lazily
  block: replace seq_zones_bitmap with conv_zones_bitmap
  block: simplify blkdev_nr_zones
  block: remove the empty line at the end of blk-zoned.c
  ...
parents 0aecba61 85394299
...@@ -351,6 +351,9 @@ void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq) ...@@ -351,6 +351,9 @@ void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq)
{ {
struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg); struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg);
if (!bfqg)
return;
blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq)); blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq));
blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1); blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1);
} }
......
...@@ -87,7 +87,7 @@ EXPORT_SYMBOL(bio_integrity_alloc); ...@@ -87,7 +87,7 @@ EXPORT_SYMBOL(bio_integrity_alloc);
* Description: Used to free the integrity portion of a bio. Usually * Description: Used to free the integrity portion of a bio. Usually
* called from bio_free(). * called from bio_free().
*/ */
static void bio_integrity_free(struct bio *bio) void bio_integrity_free(struct bio *bio)
{ {
struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_set *bs = bio->bi_pool; struct bio_set *bs = bio->bi_pool;
......
...@@ -233,6 +233,9 @@ struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, ...@@ -233,6 +233,9 @@ struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
void bio_uninit(struct bio *bio) void bio_uninit(struct bio *bio)
{ {
bio_disassociate_blkg(bio); bio_disassociate_blkg(bio);
if (bio_integrity(bio))
bio_integrity_free(bio);
} }
EXPORT_SYMBOL(bio_uninit); EXPORT_SYMBOL(bio_uninit);
......
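
The two hunks above are the memleak fix: bio_integrity_free() is made non-static and called from bio_uninit(), so the integrity payload is released in the common uninit path rather than relying on bio_free() being the only teardown route. A minimal, self-contained sketch of that ownership rule, using hypothetical names rather than the kernel types:

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical stand-ins for bio / bio_integrity_payload. */
    struct demo_integrity { char *buf; };
    struct demo_bio { struct demo_integrity *integrity; };

    static void demo_bio_uninit(struct demo_bio *bio)
    {
        /* Free optional attached state in the one common teardown
         * path, so every caller releases it no matter how the bio
         * object itself is recycled or freed. */
        if (bio->integrity) {
            free(bio->integrity->buf);
            free(bio->integrity);
            bio->integrity = NULL;
        }
    }

    int main(void)
    {
        struct demo_bio bio = { .integrity = malloc(sizeof(struct demo_integrity)) };
        bio.integrity->buf = malloc(64);
        demo_bio_uninit(&bio);   /* payload freed on every teardown path */
        printf("after uninit, integrity=%p\n", (void *)bio.integrity);
        return 0;
    }
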
...@@ -70,30 +70,20 @@ void __blk_req_zone_write_unlock(struct request *rq) ...@@ -70,30 +70,20 @@ void __blk_req_zone_write_unlock(struct request *rq)
} }
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
sector_t nr_sectors)
{
sector_t zone_sectors = blk_queue_zone_sectors(q);
return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
}
/** /**
* blkdev_nr_zones - Get number of zones * blkdev_nr_zones - Get number of zones
* @bdev: Target block device * @disk: Target gendisk
* *
* Description: * Return the total number of zones of a zoned block device. For a block
* Return the total number of zones of a zoned block device. * device without zone capabilities, the number of zones is always 0.
* For a regular block device, the number of zones is always 0.
*/ */
unsigned int blkdev_nr_zones(struct block_device *bdev) unsigned int blkdev_nr_zones(struct gendisk *disk)
{ {
struct request_queue *q = bdev_get_queue(bdev); sector_t zone_sectors = blk_queue_zone_sectors(disk->queue);
if (!blk_queue_is_zoned(q)) if (!blk_queue_is_zoned(disk->queue))
return 0; return 0;
return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
return __blkdev_nr_zones(q, get_capacity(bdev->bd_disk));
} }
EXPORT_SYMBOL_GPL(blkdev_nr_zones); EXPORT_SYMBOL_GPL(blkdev_nr_zones);
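
blkdev_nr_zones() now takes the gendisk and computes the zone count directly from the capacity and the zone size; because zone_sectors is required to be a power of two, the round-up divide is an add-and-shift. A small standalone illustration of that arithmetic (the numbers are arbitrary examples, not from the patch):

    #include <stdio.h>
    #include <stdint.h>

    /* ilog2 for a power-of-two value: position of the single set bit. */
    static unsigned int ilog2_pow2(uint64_t v)
    {
        unsigned int r = 0;
        while (v >>= 1)
            r++;
        return r;
    }

    int main(void)
    {
        uint64_t zone_sectors = 524288;    /* 256 MiB in 512-byte sectors */
        uint64_t capacity     = 10000000;  /* example capacity in sectors */

        /* Round-up division as in blkdev_nr_zones(): a smaller last
         * zone still counts as a zone. */
        uint64_t nr_zones = (capacity + zone_sectors - 1) >> ilog2_pow2(zone_sectors);

        printf("%llu zones (19 full + 1 partial)\n", (unsigned long long)nr_zones);
        return 0;
    }
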
...@@ -342,16 +332,18 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node, ...@@ -342,16 +332,18 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node,
void blk_queue_free_zone_bitmaps(struct request_queue *q) void blk_queue_free_zone_bitmaps(struct request_queue *q)
{ {
kfree(q->seq_zones_bitmap); kfree(q->conv_zones_bitmap);
q->seq_zones_bitmap = NULL; q->conv_zones_bitmap = NULL;
kfree(q->seq_zones_wlock); kfree(q->seq_zones_wlock);
q->seq_zones_wlock = NULL; q->seq_zones_wlock = NULL;
} }
struct blk_revalidate_zone_args { struct blk_revalidate_zone_args {
struct gendisk *disk; struct gendisk *disk;
unsigned long *seq_zones_bitmap; unsigned long *conv_zones_bitmap;
unsigned long *seq_zones_wlock; unsigned long *seq_zones_wlock;
unsigned int nr_zones;
sector_t zone_sectors;
sector_t sector; sector_t sector;
}; };
...@@ -364,25 +356,33 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, ...@@ -364,25 +356,33 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
struct blk_revalidate_zone_args *args = data; struct blk_revalidate_zone_args *args = data;
struct gendisk *disk = args->disk; struct gendisk *disk = args->disk;
struct request_queue *q = disk->queue; struct request_queue *q = disk->queue;
sector_t zone_sectors = blk_queue_zone_sectors(q);
sector_t capacity = get_capacity(disk); sector_t capacity = get_capacity(disk);
/* /*
* All zones must have the same size, with the exception on an eventual * All zones must have the same size, with the exception on an eventual
* smaller last zone. * smaller last zone.
*/ */
if (zone->start + zone_sectors < capacity && if (zone->start == 0) {
zone->len != zone_sectors) { if (zone->len == 0 || !is_power_of_2(zone->len)) {
pr_warn("%s: Invalid zoned device with non constant zone size\n", pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
disk->disk_name); disk->disk_name, zone->len);
return false; return -ENODEV;
} }
if (zone->start + zone->len >= capacity && args->zone_sectors = zone->len;
zone->len > zone_sectors) { args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
pr_warn("%s: Invalid zoned device with larger last zone size\n", } else if (zone->start + args->zone_sectors < capacity) {
disk->disk_name); if (zone->len != args->zone_sectors) {
return -ENODEV; pr_warn("%s: Invalid zoned device with non constant zone size\n",
disk->disk_name);
return -ENODEV;
}
} else {
if (zone->len > args->zone_sectors) {
pr_warn("%s: Invalid zoned device with larger last zone size\n",
disk->disk_name);
return -ENODEV;
}
} }
/* Check for holes in the zone report */ /* Check for holes in the zone report */
...@@ -395,8 +395,22 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, ...@@ -395,8 +395,22 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
/* Check zone type */ /* Check zone type */
switch (zone->type) { switch (zone->type) {
case BLK_ZONE_TYPE_CONVENTIONAL: case BLK_ZONE_TYPE_CONVENTIONAL:
if (!args->conv_zones_bitmap) {
args->conv_zones_bitmap =
blk_alloc_zone_bitmap(q->node, args->nr_zones);
if (!args->conv_zones_bitmap)
return -ENOMEM;
}
set_bit(idx, args->conv_zones_bitmap);
break;
case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_REQ:
case BLK_ZONE_TYPE_SEQWRITE_PREF: case BLK_ZONE_TYPE_SEQWRITE_PREF:
if (!args->seq_zones_wlock) {
args->seq_zones_wlock =
blk_alloc_zone_bitmap(q->node, args->nr_zones);
if (!args->seq_zones_wlock)
return -ENOMEM;
}
break; break;
default: default:
pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
...@@ -404,78 +418,54 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, ...@@ -404,78 +418,54 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
return -ENODEV; return -ENODEV;
} }
if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
set_bit(idx, args->seq_zones_bitmap);
args->sector += zone->len; args->sector += zone->len;
return 0; return 0;
} }
static int blk_update_zone_info(struct gendisk *disk, unsigned int nr_zones,
struct blk_revalidate_zone_args *args)
{
/*
* Ensure that all memory allocations in this context are done as
* if GFP_NOIO was specified.
*/
unsigned int noio_flag = memalloc_noio_save();
struct request_queue *q = disk->queue;
int ret;
args->seq_zones_wlock = blk_alloc_zone_bitmap(q->node, nr_zones);
if (!args->seq_zones_wlock)
return -ENOMEM;
args->seq_zones_bitmap = blk_alloc_zone_bitmap(q->node, nr_zones);
if (!args->seq_zones_bitmap)
return -ENOMEM;
ret = disk->fops->report_zones(disk, 0, nr_zones,
blk_revalidate_zone_cb, args);
memalloc_noio_restore(noio_flag);
return ret;
}
/** /**
* blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
* @disk: Target disk * @disk: Target disk
* *
* Helper function for low-level device drivers to (re) allocate and initialize * Helper function for low-level device drivers to (re) allocate and initialize
* a disk request queue zone bitmaps. This functions should normally be called * a disk request queue zone bitmaps. This functions should normally be called
* within the disk ->revalidate method. For BIO based queues, no zone bitmap * within the disk ->revalidate method for blk-mq based drivers. For BIO based
* is allocated. * drivers only q->nr_zones needs to be updated so that the sysfs exposed value
* is correct.
*/ */
int blk_revalidate_disk_zones(struct gendisk *disk) int blk_revalidate_disk_zones(struct gendisk *disk)
{ {
struct request_queue *q = disk->queue; struct request_queue *q = disk->queue;
unsigned int nr_zones = __blkdev_nr_zones(q, get_capacity(disk)); struct blk_revalidate_zone_args args = {
struct blk_revalidate_zone_args args = { .disk = disk }; .disk = disk,
int ret = 0; };
unsigned int noio_flag;
int ret;
if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
return -EIO; return -EIO;
if (WARN_ON_ONCE(!queue_is_mq(q)))
return -EIO;
/* /*
* BIO based queues do not use a scheduler so only q->nr_zones * Ensure that all memory allocations in this context are done as if
* needs to be updated so that the sysfs exposed value is correct. * GFP_NOIO was specified.
*/ */
if (!queue_is_mq(q)) { noio_flag = memalloc_noio_save();
q->nr_zones = nr_zones; ret = disk->fops->report_zones(disk, 0, UINT_MAX,
return 0; blk_revalidate_zone_cb, &args);
} memalloc_noio_restore(noio_flag);
if (nr_zones)
ret = blk_update_zone_info(disk, nr_zones, &args);
/* /*
* Install the new bitmaps, making sure the queue is stopped and * Install the new bitmaps and update nr_zones only once the queue is
* all I/Os are completed (i.e. a scheduler is not referencing the * stopped and all I/Os are completed (i.e. a scheduler is not
* bitmaps). * referencing the bitmaps).
*/ */
blk_mq_freeze_queue(q); blk_mq_freeze_queue(q);
if (ret >= 0) { if (ret >= 0) {
q->nr_zones = nr_zones; blk_queue_chunk_sectors(q, args.zone_sectors);
q->nr_zones = args.nr_zones;
swap(q->seq_zones_wlock, args.seq_zones_wlock); swap(q->seq_zones_wlock, args.seq_zones_wlock);
swap(q->seq_zones_bitmap, args.seq_zones_bitmap); swap(q->conv_zones_bitmap, args.conv_zones_bitmap);
ret = 0; ret = 0;
} else { } else {
pr_warn("%s: failed to revalidate zones\n", disk->disk_name); pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
...@@ -484,8 +474,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk) ...@@ -484,8 +474,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
blk_mq_unfreeze_queue(q); blk_mq_unfreeze_queue(q);
kfree(args.seq_zones_wlock); kfree(args.seq_zones_wlock);
kfree(args.seq_zones_bitmap); kfree(args.conv_zones_bitmap);
return ret; return ret;
} }
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
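
The reworked blk_revalidate_disk_zones() makes a single report_zones() pass (wrapped in memalloc_noio_save()/restore() so allocations in this path behave as GFP_NOIO and cannot recurse into I/O submission), derives zone_sectors and nr_zones from the first reported zone, allocates the conventional and write-lock bitmaps lazily, and only swaps the new state in under a frozen queue. The geometry rule it enforces is: the zone size must be a power of two, every zone except the last must have exactly that size, and the last zone may only be smaller. A self-contained sketch of just that validation rule over an example zone list (hypothetical types, not the kernel callback):

    #include <stdio.h>
    #include <stdbool.h>
    #include <stdint.h>

    struct zone { uint64_t start, len; };

    static bool is_pow2(uint64_t v) { return v && !(v & (v - 1)); }

    /* Mirrors the size checks in blk_revalidate_zone_cb(): constant,
     * power-of-two zone size, except that the last zone may be smaller. */
    static bool validate_zones(const struct zone *z, int nr, uint64_t capacity)
    {
        uint64_t zone_sectors = 0;

        for (int i = 0; i < nr; i++) {
            if (i == 0) {
                if (!is_pow2(z[i].len))
                    return false;
                zone_sectors = z[i].len;
            } else if (z[i].start + zone_sectors < capacity) {
                if (z[i].len != zone_sectors)    /* interior zone */
                    return false;
            } else if (z[i].len > zone_sectors) { /* last zone */
                return false;
            }
        }
        return true;
    }

    int main(void)
    {
        struct zone z[] = { { 0, 1024 }, { 1024, 1024 }, { 2048, 512 } };
        printf("valid: %d\n", validate_zones(z, 3, 2560));
        return 0;
    }
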
...@@ -121,6 +121,7 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, ...@@ -121,6 +121,7 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
#ifdef CONFIG_BLK_DEV_INTEGRITY #ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void); void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *); bool __bio_integrity_endio(struct bio *);
void bio_integrity_free(struct bio *bio);
static inline bool bio_integrity_endio(struct bio *bio) static inline bool bio_integrity_endio(struct bio *bio)
{ {
if (bio_integrity(bio)) if (bio_integrity(bio))
...@@ -166,6 +167,9 @@ static inline bool bio_integrity_endio(struct bio *bio) ...@@ -166,6 +167,9 @@ static inline bool bio_integrity_endio(struct bio *bio)
{ {
return true; return true;
} }
static inline void bio_integrity_free(struct bio *bio)
{
}
#endif /* CONFIG_BLK_DEV_INTEGRITY */ #endif /* CONFIG_BLK_DEV_INTEGRITY */
unsigned long blk_rq_timeout(unsigned long timeout); unsigned long blk_rq_timeout(unsigned long timeout);
......
...@@ -512,7 +512,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, ...@@ -512,7 +512,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKGETZONESZ: case BLKGETZONESZ:
return put_uint(arg, bdev_zone_sectors(bdev)); return put_uint(arg, bdev_zone_sectors(bdev));
case BLKGETNRZONES: case BLKGETNRZONES:
return put_uint(arg, blkdev_nr_zones(bdev)); return put_uint(arg, blkdev_nr_zones(bdev->bd_disk));
case HDIO_GETGEO: case HDIO_GETGEO:
return blkdev_getgeo(bdev, argp); return blkdev_getgeo(bdev, argp);
case BLKRAGET: case BLKRAGET:
......
...@@ -297,6 +297,10 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) ...@@ -297,6 +297,10 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
unsigned int len = bvec.bv_len; unsigned int len = bvec.bv_len;
int err; int err;
/* Don't support un-aligned buffer */
WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
(len & (SECTOR_SIZE - 1)));
err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset, err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
bio_op(bio), sector); bio_op(bio), sector);
if (err) if (err)
...@@ -382,7 +386,6 @@ static struct brd_device *brd_alloc(int i) ...@@ -382,7 +386,6 @@ static struct brd_device *brd_alloc(int i)
goto out_free_dev; goto out_free_dev;
blk_queue_make_request(brd->brd_queue, brd_make_request); blk_queue_make_request(brd->brd_queue, brd_make_request);
blk_queue_max_hw_sectors(brd->brd_queue, 1024);
/* This is so fdisk will align partitions on 4k, because of /* This is so fdisk will align partitions on 4k, because of
* direct_access API needing 4k alignment, returning a PFN * direct_access API needing 4k alignment, returning a PFN
......
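
The brd change warns, rather than silently misbehaving, when a bvec's offset or length is not a multiple of the 512-byte sector size, and drops the 1024-sector max_hw_sectors limit. Since the sector size is a power of two, the alignment test is a single mask; a tiny standalone equivalent of the check (the combined OR form below is equivalent to testing offset and length separately, as the patch does):

    #include <stdio.h>

    #define SECTOR_SIZE 512u

    /* offset/length are sector-aligned iff the low 9 bits are clear. */
    static int sector_aligned(unsigned int off, unsigned int len)
    {
        return ((off | len) & (SECTOR_SIZE - 1)) == 0;
    }

    int main(void)
    {
        printf("%d %d\n", sector_aligned(0, 4096),     /* 1: aligned   */
                          sector_aligned(256, 4096));  /* 0: unaligned */
        return 0;
    }
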
...@@ -1559,14 +1559,13 @@ static int init_driver_queues(struct nullb *nullb) ...@@ -1559,14 +1559,13 @@ static int init_driver_queues(struct nullb *nullb)
static int null_gendisk_register(struct nullb *nullb) static int null_gendisk_register(struct nullb *nullb)
{ {
sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT;
struct gendisk *disk; struct gendisk *disk;
sector_t size;
disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node); disk = nullb->disk = alloc_disk_node(1, nullb->dev->home_node);
if (!disk) if (!disk)
return -ENOMEM; return -ENOMEM;
size = (sector_t)nullb->dev->size * 1024 * 1024ULL; set_capacity(disk, size);
set_capacity(disk, size >> 9);
disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->major = null_major; disk->major = null_major;
...@@ -1576,12 +1575,19 @@ static int null_gendisk_register(struct nullb *nullb) ...@@ -1576,12 +1575,19 @@ static int null_gendisk_register(struct nullb *nullb)
disk->queue = nullb->q; disk->queue = nullb->q;
strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
#ifdef CONFIG_BLK_DEV_ZONED
if (nullb->dev->zoned) { if (nullb->dev->zoned) {
int ret = blk_revalidate_disk_zones(disk); if (queue_is_mq(nullb->q)) {
int ret = blk_revalidate_disk_zones(disk);
if (ret != 0) if (ret)
return ret; return ret;
} else {
blk_queue_chunk_sectors(nullb->q,
nullb->dev->zone_size_sects);
nullb->q->nr_zones = blkdev_nr_zones(disk);
}
} }
#endif
add_disk(disk); add_disk(disk);
return 0; return 0;
...@@ -1607,7 +1613,7 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) ...@@ -1607,7 +1613,7 @@ static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
return blk_mq_alloc_tag_set(set); return blk_mq_alloc_tag_set(set);
} }
static void null_validate_conf(struct nullb_device *dev) static int null_validate_conf(struct nullb_device *dev)
{ {
dev->blocksize = round_down(dev->blocksize, 512); dev->blocksize = round_down(dev->blocksize, 512);
dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
...@@ -1634,6 +1640,14 @@ static void null_validate_conf(struct nullb_device *dev) ...@@ -1634,6 +1640,14 @@ static void null_validate_conf(struct nullb_device *dev)
/* can not stop a queue */ /* can not stop a queue */
if (dev->queue_mode == NULL_Q_BIO) if (dev->queue_mode == NULL_Q_BIO)
dev->mbps = 0; dev->mbps = 0;
if (dev->zoned &&
(!dev->zone_size || !is_power_of_2(dev->zone_size))) {
pr_err("zone_size must be power-of-two\n");
return -EINVAL;
}
return 0;
} }
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
...@@ -1666,7 +1680,9 @@ static int null_add_dev(struct nullb_device *dev) ...@@ -1666,7 +1680,9 @@ static int null_add_dev(struct nullb_device *dev)
struct nullb *nullb; struct nullb *nullb;
int rv; int rv;
null_validate_conf(dev); rv = null_validate_conf(dev);
if (rv)
return rv;
nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node); nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node);
if (!nullb) { if (!nullb) {
...@@ -1731,7 +1747,6 @@ static int null_add_dev(struct nullb_device *dev) ...@@ -1731,7 +1747,6 @@ static int null_add_dev(struct nullb_device *dev)
if (rv) if (rv)
goto out_cleanup_blk_queue; goto out_cleanup_blk_queue;
blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects);
nullb->q->limits.zoned = BLK_ZONED_HM; nullb->q->limits.zoned = BLK_ZONED_HM;
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, nullb->q); blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, nullb->q);
blk_queue_required_elevator_features(nullb->q, blk_queue_required_elevator_features(nullb->q,
...@@ -1792,11 +1807,6 @@ static int __init null_init(void) ...@@ -1792,11 +1807,6 @@ static int __init null_init(void)
g_bs = PAGE_SIZE; g_bs = PAGE_SIZE;
} }
if (!is_power_of_2(g_zone_size)) {
pr_err("zone_size must be power-of-two\n");
return -EINVAL;
}
if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) { if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) {
pr_err("invalid home_node value\n"); pr_err("invalid home_node value\n");
g_home_node = NUMA_NO_NODE; g_home_node = NUMA_NO_NODE;
......
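
The zone-size check moves from the module-wide null_init() into null_validate_conf(), so it now applies to every device configuration rather than only to the g_zone_size module parameter, and a bad value fails null_add_dev() with -EINVAL. is_power_of_2() is the usual single-set-bit test; a standalone equivalent:

    #include <stdio.h>

    /* Equivalent of the kernel's is_power_of_2(): exactly one bit set. */
    static int is_power_of_2(unsigned long n)
    {
        return n != 0 && (n & (n - 1)) == 0;
    }

    int main(void)
    {
        unsigned long sizes[] = { 0, 1, 4, 96, 256 };

        for (int i = 0; i < 5; i++)
            printf("zone_size=%lu -> %s\n", sizes[i],
                   is_power_of_2(sizes[i]) ? "ok" : "-EINVAL");
        return 0;
    }
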
...@@ -936,6 +936,8 @@ static int xen_blkbk_map(struct xen_blkif_ring *ring, ...@@ -936,6 +936,8 @@ static int xen_blkbk_map(struct xen_blkif_ring *ring,
out_of_memory: out_of_memory:
pr_alert("%s: out of memory\n", __func__); pr_alert("%s: out of memory\n", __func__);
put_free_pages(ring, pages_to_gnt, segs_to_map); put_free_pages(ring, pages_to_gnt, segs_to_map);
for (i = last_map; i < num; i++)
pages[i]->handle = BLKBACK_INVALID_HANDLE;
return -ENOMEM; return -ENOMEM;
} }
......
...@@ -1954,12 +1954,14 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, ...@@ -1954,12 +1954,14 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
/* /*
* For a zoned target, the number of zones should be updated for the * For a zoned target, the number of zones should be updated for the
* correct value to be exposed in sysfs queue/nr_zones. For a BIO based * correct value to be exposed in sysfs queue/nr_zones. For a BIO based
* target, this is all that is needed. For a request based target, the * target, this is all that is needed.
* queue zone bitmaps must also be updated.
* Use blk_revalidate_disk_zones() to handle this.
*/ */
if (blk_queue_is_zoned(q)) #ifdef CONFIG_BLK_DEV_ZONED
blk_revalidate_disk_zones(t->md->disk); if (blk_queue_is_zoned(q)) {
WARN_ON_ONCE(queue_is_mq(q));
q->nr_zones = blkdev_nr_zones(t->md->disk);
}
#endif
/* Allow reads to exceed readahead limits */ /* Allow reads to exceed readahead limits */
q->backing_dev_info->io_pages = limits->max_sectors >> (PAGE_SHIFT - 9); q->backing_dev_info->io_pages = limits->max_sectors >> (PAGE_SHIFT - 9);
......
...@@ -727,7 +727,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path) ...@@ -727,7 +727,7 @@ static int dmz_get_zoned_device(struct dm_target *ti, char *path)
dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors); dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors);
dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks); dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks);
dev->nr_zones = blkdev_nr_zones(dev->bdev); dev->nr_zones = blkdev_nr_zones(dev->bdev->bd_disk);
dmz->dev = dev; dmz->dev = dev;
......
...@@ -412,8 +412,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) ...@@ -412,8 +412,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf)
goto err; goto err;
/* The drive satisfies the kernel restrictions: set it up */ /* The drive satisfies the kernel restrictions: set it up */
blk_queue_chunk_sectors(sdkp->disk->queue,
logical_to_sectors(sdkp->device, zone_blocks));
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, sdkp->disk->queue); blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, sdkp->disk->queue);
blk_queue_required_elevator_features(sdkp->disk->queue, blk_queue_required_elevator_features(sdkp->disk->queue,
ELEVATOR_F_ZBD_SEQ_WRITE); ELEVATOR_F_ZBD_SEQ_WRITE);
......
...@@ -1531,7 +1531,7 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) ...@@ -1531,7 +1531,7 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate)
ret = blk_add_partitions(disk, bdev); ret = blk_add_partitions(disk, bdev);
if (ret == -EAGAIN) if (ret == -EAGAIN)
goto rescan; goto rescan;
} else { } else if (invalidate) {
/* /*
* Tell userspace that the media / partition table may have * Tell userspace that the media / partition table may have
* changed. * changed.
......
...@@ -111,7 +111,7 @@ struct io_wq { ...@@ -111,7 +111,7 @@ struct io_wq {
struct task_struct *manager; struct task_struct *manager;
struct user_struct *user; struct user_struct *user;
struct cred *creds; const struct cred *creds;
struct mm_struct *mm; struct mm_struct *mm;
refcount_t refs; refcount_t refs;
struct completion done; struct completion done;
......
...@@ -52,6 +52,7 @@ static inline void wq_node_del(struct io_wq_work_list *list, ...@@ -52,6 +52,7 @@ static inline void wq_node_del(struct io_wq_work_list *list,
list->last = prev; list->last = prev;
if (prev) if (prev)
prev->next = node->next; prev->next = node->next;
node->next = NULL;
} }
#define wq_list_for_each(pos, prv, head) \ #define wq_list_for_each(pos, prv, head) \
...@@ -87,7 +88,7 @@ typedef void (put_work_fn)(struct io_wq_work *); ...@@ -87,7 +88,7 @@ typedef void (put_work_fn)(struct io_wq_work *);
struct io_wq_data { struct io_wq_data {
struct mm_struct *mm; struct mm_struct *mm;
struct user_struct *user; struct user_struct *user;
struct cred *creds; const struct cred *creds;
get_work_fn *get_work; get_work_fn *get_work;
put_work_fn *put_work; put_work_fn *put_work;
...@@ -118,10 +119,6 @@ static inline void io_wq_worker_sleeping(struct task_struct *tsk) ...@@ -118,10 +119,6 @@ static inline void io_wq_worker_sleeping(struct task_struct *tsk)
static inline void io_wq_worker_running(struct task_struct *tsk) static inline void io_wq_worker_running(struct task_struct *tsk)
{ {
} }
#endif #endif /* CONFIG_IO_WQ */
static inline bool io_wq_current_is_worker(void) #endif /* INTERNAL_IO_WQ_H */
{
return in_task() && (current->flags & PF_IO_WORKER);
}
#endif
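
The io-wq changes above make the creds pointer const and, in wq_node_del(), clear node->next once the node is unlinked, so the removed entry no longer carries a stale link back into the list. A minimal singly-linked-list sketch of the same idea (hypothetical structures, not the io_wq_work_list API):

    #include <stdio.h>
    #include <stddef.h>

    struct node { int v; struct node *next; };

    /* Unlink pos (whose predecessor is prev, or NULL for the head) and
     * clear its next pointer so callers inspecting the removed node do
     * not follow a dangling link into the list. */
    static void list_del(struct node **head, struct node *prev, struct node *pos)
    {
        if (prev)
            prev->next = pos->next;
        else
            *head = pos->next;
        pos->next = NULL;
    }

    int main(void)
    {
        struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
        struct node *head = &a;

        list_del(&head, &a, &b);
        printf("b.next=%p (cleared), a.next->v=%d\n", (void *)b.next, a.next->v);
        return 0;
    }
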
...@@ -145,7 +145,7 @@ struct io_rings { ...@@ -145,7 +145,7 @@ struct io_rings {
/* /*
* Number of completion events lost because the queue was full; * Number of completion events lost because the queue was full;
* this should be avoided by the application by making sure * this should be avoided by the application by making sure
* there are not more requests pending thatn there is space in * there are not more requests pending than there is space in
* the completion queue. * the completion queue.
* *
* Written by the kernel, shouldn't be modified by the * Written by the kernel, shouldn't be modified by the
...@@ -238,7 +238,7 @@ struct io_ring_ctx { ...@@ -238,7 +238,7 @@ struct io_ring_ctx {
struct user_struct *user; struct user_struct *user;
struct cred *creds; const struct cred *creds;
/* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */ /* 0 is for ctx quiesce/reinit/free, 1 is for sqo_thread started */
struct completion *completions; struct completion *completions;
...@@ -275,7 +275,8 @@ struct io_ring_ctx { ...@@ -275,7 +275,8 @@ struct io_ring_ctx {
* manipulate the list, hence no extra locking is needed there. * manipulate the list, hence no extra locking is needed there.
*/ */
struct list_head poll_list; struct list_head poll_list;
struct rb_root cancel_tree; struct hlist_head *cancel_hash;
unsigned cancel_hash_bits;
spinlock_t inflight_lock; spinlock_t inflight_lock;
struct list_head inflight_list; struct list_head inflight_list;
...@@ -303,9 +304,32 @@ struct io_timeout_data { ...@@ -303,9 +304,32 @@ struct io_timeout_data {
u32 seq_offset; u32 seq_offset;
}; };
struct io_timeout { struct io_async_connect {
struct file *file; struct sockaddr_storage address;
struct io_timeout_data *data; };
struct io_async_msghdr {
struct iovec fast_iov[UIO_FASTIOV];
struct iovec *iov;
struct sockaddr __user *uaddr;
struct msghdr msg;
};
struct io_async_rw {
struct iovec fast_iov[UIO_FASTIOV];
struct iovec *iov;
ssize_t nr_segs;
ssize_t size;
};
struct io_async_ctx {
struct io_uring_sqe sqe;
union {
struct io_async_rw rw;
struct io_async_msghdr msg;
struct io_async_connect connect;
struct io_timeout_data timeout;
};
}; };
/* /*
...@@ -319,10 +343,10 @@ struct io_kiocb { ...@@ -319,10 +343,10 @@ struct io_kiocb {
struct file *file; struct file *file;
struct kiocb rw; struct kiocb rw;
struct io_poll_iocb poll; struct io_poll_iocb poll;
struct io_timeout timeout;
}; };
const struct io_uring_sqe *sqe; const struct io_uring_sqe *sqe;
struct io_async_ctx *io;
struct file *ring_file; struct file *ring_file;
int ring_fd; int ring_fd;
bool has_user; bool has_user;
...@@ -332,7 +356,7 @@ struct io_kiocb { ...@@ -332,7 +356,7 @@ struct io_kiocb {
struct io_ring_ctx *ctx; struct io_ring_ctx *ctx;
union { union {
struct list_head list; struct list_head list;
struct rb_node rb_node; struct hlist_node hash_node;
}; };
struct list_head link_list; struct list_head link_list;
unsigned int flags; unsigned int flags;
...@@ -353,7 +377,6 @@ struct io_kiocb { ...@@ -353,7 +377,6 @@ struct io_kiocb {
#define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */ #define REQ_F_TIMEOUT_NOSEQ 8192 /* no timeout sequence */
#define REQ_F_INFLIGHT 16384 /* on inflight list */ #define REQ_F_INFLIGHT 16384 /* on inflight list */
#define REQ_F_COMP_LOCKED 32768 /* completion under lock */ #define REQ_F_COMP_LOCKED 32768 /* completion under lock */
#define REQ_F_FREE_SQE 65536 /* free sqe if not async queued */
u64 user_data; u64 user_data;
u32 result; u32 result;
u32 sequence; u32 sequence;
...@@ -422,6 +445,7 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref) ...@@ -422,6 +445,7 @@ static void io_ring_ctx_ref_free(struct percpu_ref *ref)
static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
{ {
struct io_ring_ctx *ctx; struct io_ring_ctx *ctx;
int hash_bits;
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
if (!ctx) if (!ctx)
...@@ -435,6 +459,21 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ...@@ -435,6 +459,21 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
if (!ctx->completions) if (!ctx->completions)
goto err; goto err;
/*
* Use 5 bits less than the max cq entries, that should give us around
* 32 entries per hash list if totally full and uniformly spread.
*/
hash_bits = ilog2(p->cq_entries);
hash_bits -= 5;
if (hash_bits <= 0)
hash_bits = 1;
ctx->cancel_hash_bits = hash_bits;
ctx->cancel_hash = kmalloc((1U << hash_bits) * sizeof(struct hlist_head),
GFP_KERNEL);
if (!ctx->cancel_hash)
goto err;
__hash_init(ctx->cancel_hash, 1U << hash_bits);
if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
PERCPU_REF_ALLOW_REINIT, GFP_KERNEL)) PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
goto err; goto err;
...@@ -448,7 +487,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ...@@ -448,7 +487,6 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
init_waitqueue_head(&ctx->wait); init_waitqueue_head(&ctx->wait);
spin_lock_init(&ctx->completion_lock); spin_lock_init(&ctx->completion_lock);
INIT_LIST_HEAD(&ctx->poll_list); INIT_LIST_HEAD(&ctx->poll_list);
ctx->cancel_tree = RB_ROOT;
INIT_LIST_HEAD(&ctx->defer_list); INIT_LIST_HEAD(&ctx->defer_list);
INIT_LIST_HEAD(&ctx->timeout_list); INIT_LIST_HEAD(&ctx->timeout_list);
init_waitqueue_head(&ctx->inflight_wait); init_waitqueue_head(&ctx->inflight_wait);
...@@ -459,6 +497,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) ...@@ -459,6 +497,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
if (ctx->fallback_req) if (ctx->fallback_req)
kmem_cache_free(req_cachep, ctx->fallback_req); kmem_cache_free(req_cachep, ctx->fallback_req);
kfree(ctx->completions); kfree(ctx->completions);
kfree(ctx->cancel_hash);
kfree(ctx); kfree(ctx);
return NULL; return NULL;
} }
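
Poll requests are now cancelled through a hash table keyed on user_data (via hash_long()) instead of an ordered rbtree, because the rbtree's insertion sort was the expensive part even for workloads with a fair amount of cancellations. The table is sized for roughly 32 entries per bucket at worst case: hash_bits = ilog2(cq_entries) - 5, clamped to at least 1. A standalone illustration of just the sizing arithmetic:

    #include <stdio.h>

    static unsigned int ilog2_u(unsigned int v)
    {
        unsigned int r = 0;
        while (v >>= 1)
            r++;
        return r;
    }

    int main(void)
    {
        unsigned int cq_entries[] = { 16, 256, 4096, 32768 };

        for (int i = 0; i < 4; i++) {
            int hash_bits = (int)ilog2_u(cq_entries[i]) - 5;
            if (hash_bits <= 0)
                hash_bits = 1;
            unsigned int buckets = 1u << hash_bits;
            printf("cq_entries=%-6u buckets=%-5u worst-case chain=%u\n",
                   cq_entries[i], buckets, cq_entries[i] / buckets);
        }
        return 0;
    }
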
...@@ -592,7 +631,7 @@ static void io_kill_timeout(struct io_kiocb *req) ...@@ -592,7 +631,7 @@ static void io_kill_timeout(struct io_kiocb *req)
{ {
int ret; int ret;
ret = hrtimer_try_to_cancel(&req->timeout.data->timer); ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
if (ret != -1) { if (ret != -1) {
atomic_inc(&req->ctx->cq_timeouts); atomic_inc(&req->ctx->cq_timeouts);
list_del_init(&req->list); list_del_init(&req->list);
...@@ -806,6 +845,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx, ...@@ -806,6 +845,7 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
} }
got_it: got_it:
req->io = NULL;
req->ring_file = NULL; req->ring_file = NULL;
req->file = NULL; req->file = NULL;
req->ctx = ctx; req->ctx = ctx;
...@@ -836,8 +876,8 @@ static void __io_free_req(struct io_kiocb *req) ...@@ -836,8 +876,8 @@ static void __io_free_req(struct io_kiocb *req)
{ {
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
if (req->flags & REQ_F_FREE_SQE) if (req->io)
kfree(req->sqe); kfree(req->io);
if (req->file && !(req->flags & REQ_F_FIXED_FILE)) if (req->file && !(req->flags & REQ_F_FIXED_FILE))
fput(req->file); fput(req->file);
if (req->flags & REQ_F_INFLIGHT) { if (req->flags & REQ_F_INFLIGHT) {
...@@ -849,8 +889,6 @@ static void __io_free_req(struct io_kiocb *req) ...@@ -849,8 +889,6 @@ static void __io_free_req(struct io_kiocb *req)
wake_up(&ctx->inflight_wait); wake_up(&ctx->inflight_wait);
spin_unlock_irqrestore(&ctx->inflight_lock, flags); spin_unlock_irqrestore(&ctx->inflight_lock, flags);
} }
if (req->flags & REQ_F_TIMEOUT)
kfree(req->timeout.data);
percpu_ref_put(&ctx->refs); percpu_ref_put(&ctx->refs);
if (likely(!io_is_fallback_req(req))) if (likely(!io_is_fallback_req(req)))
kmem_cache_free(req_cachep, req); kmem_cache_free(req_cachep, req);
...@@ -863,7 +901,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) ...@@ -863,7 +901,7 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
int ret; int ret;
ret = hrtimer_try_to_cancel(&req->timeout.data->timer); ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
if (ret != -1) { if (ret != -1) {
io_cqring_fill_event(req, -ECANCELED); io_cqring_fill_event(req, -ECANCELED);
io_commit_cqring(ctx); io_commit_cqring(ctx);
...@@ -878,7 +916,6 @@ static bool io_link_cancel_timeout(struct io_kiocb *req) ...@@ -878,7 +916,6 @@ static bool io_link_cancel_timeout(struct io_kiocb *req)
static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
{ {
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *nxt;
bool wake_ev = false; bool wake_ev = false;
/* Already got next link */ /* Already got next link */
...@@ -890,24 +927,21 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) ...@@ -890,24 +927,21 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
* potentially happen if the chain is messed up, check to be on the * potentially happen if the chain is messed up, check to be on the
* safe side. * safe side.
*/ */
nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list); while (!list_empty(&req->link_list)) {
while (nxt) { struct io_kiocb *nxt = list_first_entry(&req->link_list,
list_del_init(&nxt->list); struct io_kiocb, link_list);
if ((req->flags & REQ_F_LINK_TIMEOUT) && if (unlikely((req->flags & REQ_F_LINK_TIMEOUT) &&
(nxt->flags & REQ_F_TIMEOUT)) { (nxt->flags & REQ_F_TIMEOUT))) {
list_del_init(&nxt->link_list);
wake_ev |= io_link_cancel_timeout(nxt); wake_ev |= io_link_cancel_timeout(nxt);
nxt = list_first_entry_or_null(&req->link_list,
struct io_kiocb, list);
req->flags &= ~REQ_F_LINK_TIMEOUT; req->flags &= ~REQ_F_LINK_TIMEOUT;
continue; continue;
} }
if (!list_empty(&req->link_list)) {
INIT_LIST_HEAD(&nxt->link_list);
list_splice(&req->link_list, &nxt->link_list);
nxt->flags |= REQ_F_LINK;
}
list_del_init(&req->link_list);
if (!list_empty(&nxt->link_list))
nxt->flags |= REQ_F_LINK;
*nxtptr = nxt; *nxtptr = nxt;
break; break;
} }
...@@ -923,15 +957,15 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr) ...@@ -923,15 +957,15 @@ static void io_req_link_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
static void io_fail_links(struct io_kiocb *req) static void io_fail_links(struct io_kiocb *req)
{ {
struct io_ring_ctx *ctx = req->ctx; struct io_ring_ctx *ctx = req->ctx;
struct io_kiocb *link;
unsigned long flags; unsigned long flags;
spin_lock_irqsave(&ctx->completion_lock, flags); spin_lock_irqsave(&ctx->completion_lock, flags);
while (!list_empty(&req->link_list)) { while (!list_empty(&req->link_list)) {
link = list_first_entry(&req->link_list, struct io_kiocb, list); struct io_kiocb *link = list_first_entry(&req->link_list,
list_del_init(&link->list); struct io_kiocb, link_list);
list_del_init(&link->link_list);
trace_io_uring_fail_link(req, link); trace_io_uring_fail_link(req, link);
if ((req->flags & REQ_F_LINK_TIMEOUT) && if ((req->flags & REQ_F_LINK_TIMEOUT) &&
...@@ -1079,9 +1113,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, ...@@ -1079,9 +1113,9 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events,
* completions for those, only batch free for fixed * completions for those, only batch free for fixed
* file and non-linked commands. * file and non-linked commands.
*/ */
if (((req->flags & if (((req->flags & (REQ_F_FIXED_FILE|REQ_F_LINK)) ==
(REQ_F_FIXED_FILE|REQ_F_LINK|REQ_F_FREE_SQE)) == REQ_F_FIXED_FILE) && !io_is_fallback_req(req) &&
REQ_F_FIXED_FILE) && !io_is_fallback_req(req)) { !req->io) {
reqs[to_free++] = req; reqs[to_free++] = req;
if (to_free == ARRAY_SIZE(reqs)) if (to_free == ARRAY_SIZE(reqs))
io_free_req_many(ctx, reqs, &to_free); io_free_req_many(ctx, reqs, &to_free);
...@@ -1410,15 +1444,6 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock) ...@@ -1410,15 +1444,6 @@ static int io_prep_rw(struct io_kiocb *req, bool force_nonblock)
if (S_ISREG(file_inode(req->file)->i_mode)) if (S_ISREG(file_inode(req->file)->i_mode))
req->flags |= REQ_F_ISREG; req->flags |= REQ_F_ISREG;
/*
* If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
* we know to async punt it even if it was opened O_NONBLOCK
*/
if (force_nonblock && !io_file_supports_async(req->file)) {
req->flags |= REQ_F_MUST_PUNT;
return -EAGAIN;
}
kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_pos = READ_ONCE(sqe->off);
kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_flags = iocb_flags(kiocb->ki_filp);
kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp));
...@@ -1587,6 +1612,16 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req, ...@@ -1587,6 +1612,16 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
return io_import_fixed(req->ctx, rw, sqe, iter); return io_import_fixed(req->ctx, rw, sqe, iter);
} }
if (req->io) {
struct io_async_rw *iorw = &req->io->rw;
*iovec = iorw->iov;
iov_iter_init(iter, rw, *iovec, iorw->nr_segs, iorw->size);
if (iorw->iov == iorw->fast_iov)
*iovec = NULL;
return iorw->size;
}
if (!req->has_user) if (!req->has_user)
return -EFAULT; return -EFAULT;
...@@ -1657,6 +1692,50 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb, ...@@ -1657,6 +1692,50 @@ static ssize_t loop_rw_iter(int rw, struct file *file, struct kiocb *kiocb,
return ret; return ret;
} }
static void io_req_map_io(struct io_kiocb *req, ssize_t io_size,
struct iovec *iovec, struct iovec *fast_iov,
struct iov_iter *iter)
{
req->io->rw.nr_segs = iter->nr_segs;
req->io->rw.size = io_size;
req->io->rw.iov = iovec;
if (!req->io->rw.iov) {
req->io->rw.iov = req->io->rw.fast_iov;
memcpy(req->io->rw.iov, fast_iov,
sizeof(struct iovec) * iter->nr_segs);
}
}
static int io_setup_async_io(struct io_kiocb *req, ssize_t io_size,
struct iovec *iovec, struct iovec *fast_iov,
struct iov_iter *iter)
{
req->io = kmalloc(sizeof(*req->io), GFP_KERNEL);
if (req->io) {
io_req_map_io(req, io_size, iovec, fast_iov, iter);
memcpy(&req->io->sqe, req->sqe, sizeof(req->io->sqe));
req->sqe = &req->io->sqe;
return 0;
}
return -ENOMEM;
}
static int io_read_prep(struct io_kiocb *req, struct iovec **iovec,
struct iov_iter *iter, bool force_nonblock)
{
ssize_t ret;
ret = io_prep_rw(req, force_nonblock);
if (ret)
return ret;
if (unlikely(!(req->file->f_mode & FMODE_READ)))
return -EBADF;
return io_import_iovec(READ, req, iovec, iter);
}
static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock) bool force_nonblock)
{ {
...@@ -1665,23 +1744,31 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, ...@@ -1665,23 +1744,31 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
struct iov_iter iter; struct iov_iter iter;
struct file *file; struct file *file;
size_t iov_count; size_t iov_count;
ssize_t read_size, ret; ssize_t io_size, ret;
ret = io_prep_rw(req, force_nonblock);
if (ret)
return ret;
file = kiocb->ki_filp;
if (unlikely(!(file->f_mode & FMODE_READ))) if (!req->io) {
return -EBADF; ret = io_read_prep(req, &iovec, &iter, force_nonblock);
if (ret < 0)
ret = io_import_iovec(READ, req, &iovec, &iter); return ret;
if (ret < 0) } else {
return ret; ret = io_import_iovec(READ, req, &iovec, &iter);
if (ret < 0)
return ret;
}
read_size = ret; file = req->file;
io_size = ret;
if (req->flags & REQ_F_LINK) if (req->flags & REQ_F_LINK)
req->result = read_size; req->result = io_size;
/*
* If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
* we know to async punt it even if it was opened O_NONBLOCK
*/
if (force_nonblock && !io_file_supports_async(file)) {
req->flags |= REQ_F_MUST_PUNT;
goto copy_iov;
}
iov_count = iov_iter_count(&iter); iov_count = iov_iter_count(&iter);
ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count); ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_count);
...@@ -1703,18 +1790,40 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt, ...@@ -1703,18 +1790,40 @@ static int io_read(struct io_kiocb *req, struct io_kiocb **nxt,
*/ */
if (force_nonblock && !(req->flags & REQ_F_NOWAIT) && if (force_nonblock && !(req->flags & REQ_F_NOWAIT) &&
(req->flags & REQ_F_ISREG) && (req->flags & REQ_F_ISREG) &&
ret2 > 0 && ret2 < read_size) ret2 > 0 && ret2 < io_size)
ret2 = -EAGAIN; ret2 = -EAGAIN;
/* Catch -EAGAIN return for forced non-blocking submission */ /* Catch -EAGAIN return for forced non-blocking submission */
if (!force_nonblock || ret2 != -EAGAIN) if (!force_nonblock || ret2 != -EAGAIN) {
kiocb_done(kiocb, ret2, nxt, req->in_async); kiocb_done(kiocb, ret2, nxt, req->in_async);
else } else {
ret = -EAGAIN; copy_iov:
ret = io_setup_async_io(req, io_size, iovec,
inline_vecs, &iter);
if (ret)
goto out_free;
return -EAGAIN;
}
} }
out_free:
kfree(iovec); kfree(iovec);
return ret; return ret;
} }
static int io_write_prep(struct io_kiocb *req, struct iovec **iovec,
struct iov_iter *iter, bool force_nonblock)
{
ssize_t ret;
ret = io_prep_rw(req, force_nonblock);
if (ret)
return ret;
if (unlikely(!(req->file->f_mode & FMODE_WRITE)))
return -EBADF;
return io_import_iovec(WRITE, req, iovec, iter);
}
static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
bool force_nonblock) bool force_nonblock)
{ {
...@@ -1723,29 +1832,36 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, ...@@ -1723,29 +1832,36 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
struct iov_iter iter; struct iov_iter iter;
struct file *file; struct file *file;
size_t iov_count; size_t iov_count;
ssize_t ret; ssize_t ret, io_size;
ret = io_prep_rw(req, force_nonblock); if (!req->io) {
if (ret) ret = io_write_prep(req, &iovec, &iter, force_nonblock);
return ret; if (ret < 0)
return ret;
} else {
ret = io_import_iovec(WRITE, req, &iovec, &iter);
if (ret < 0)
return ret;
}
file = kiocb->ki_filp; file = kiocb->ki_filp;
if (unlikely(!(file->f_mode & FMODE_WRITE))) io_size = ret;
return -EBADF;
ret = io_import_iovec(WRITE, req, &iovec, &iter);
if (ret < 0)
return ret;
if (req->flags & REQ_F_LINK) if (req->flags & REQ_F_LINK)
req->result = ret; req->result = io_size;
iov_count = iov_iter_count(&iter); /*
* If the file doesn't support async, mark it as REQ_F_MUST_PUNT so
* we know to async punt it even if it was opened O_NONBLOCK
*/
if (force_nonblock && !io_file_supports_async(req->file)) {
req->flags |= REQ_F_MUST_PUNT;
goto copy_iov;
}
ret = -EAGAIN;
if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT))
goto out_free; goto copy_iov;
iov_count = iov_iter_count(&iter);
ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count); ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, iov_count);
if (!ret) { if (!ret) {
ssize_t ret2; ssize_t ret2;
...@@ -1769,10 +1885,16 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt, ...@@ -1769,10 +1885,16 @@ static int io_write(struct io_kiocb *req, struct io_kiocb **nxt,
ret2 = call_write_iter(file, kiocb, &iter); ret2 = call_write_iter(file, kiocb, &iter);
else else
ret2 = loop_rw_iter(WRITE, file, kiocb, &iter); ret2 = loop_rw_iter(WRITE, file, kiocb, &iter);
if (!force_nonblock || ret2 != -EAGAIN) if (!force_nonblock || ret2 != -EAGAIN) {
kiocb_done(kiocb, ret2, nxt, req->in_async); kiocb_done(kiocb, ret2, nxt, req->in_async);
else } else {
ret = -EAGAIN; copy_iov:
ret = io_setup_async_io(req, io_size, iovec,
inline_vecs, &iter);
if (ret)
goto out_free;
return -EAGAIN;
}
} }
out_free: out_free:
kfree(iovec); kfree(iovec);
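
The common thread of the read/write (and sendmsg/recvmsg/connect) changes is req->io: before a request is punted to async context, the kernel now copies the sqe plus whatever per-opcode state still describes user memory (the iovec for reads/writes, the msghdr for sendmsg/recvmsg, the sockaddr for connect) into a kernel-owned io_async_ctx, so the application no longer has to keep that state alive until the punted request runs. A small userspace sketch of the same "own your inputs before deferring" rule, with hypothetical names loosely modelled on io_async_rw / io_req_map_io():

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/uio.h>

    #define FAST_IOV 8

    /* Deferred-work state that owns a copy of the caller's iovec array. */
    struct async_rw {
        struct iovec fast_iov[FAST_IOV];
        struct iovec *iov;
        int nr_segs;
    };

    static int async_rw_save(struct async_rw *a, const struct iovec *iov, int nr)
    {
        a->nr_segs = nr;
        if (nr <= FAST_IOV) {
            a->iov = a->fast_iov;                /* small: inline copy */
        } else {
            a->iov = calloc(nr, sizeof(*iov));   /* large: heap copy */
            if (!a->iov)
                return -1;
        }
        memcpy(a->iov, iov, nr * sizeof(*iov));
        return 0;
    }

    int main(void)
    {
        char buf[16];
        struct iovec caller[2] = { { buf, 8 }, { buf + 8, 8 } };
        struct async_rw a;

        if (async_rw_save(&a, caller, 2) == 0)
            printf("saved %d segments, first len=%zu\n",
                   a.nr_segs, a.iov[0].iov_len);
        /* caller[] may now go out of scope before the work runs. */
        if (a.iov != a.fast_iov)
            free(a.iov);
        return 0;
    }
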
...@@ -1888,12 +2010,25 @@ static int io_sync_file_range(struct io_kiocb *req, ...@@ -1888,12 +2010,25 @@ static int io_sync_file_range(struct io_kiocb *req,
return 0; return 0;
} }
static int io_sendmsg_prep(struct io_kiocb *req, struct io_async_ctx *io)
{
#if defined(CONFIG_NET) #if defined(CONFIG_NET)
static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, const struct io_uring_sqe *sqe = req->sqe;
struct io_kiocb **nxt, bool force_nonblock, struct user_msghdr __user *msg;
long (*fn)(struct socket *, struct user_msghdr __user *, unsigned flags;
unsigned int))
flags = READ_ONCE(sqe->msg_flags);
msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr);
return sendmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.iov);
#else
return 0;
#endif
}
static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_kiocb **nxt, bool force_nonblock)
{ {
#if defined(CONFIG_NET)
struct socket *sock; struct socket *sock;
int ret; int ret;
...@@ -1902,7 +2037,9 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, ...@@ -1902,7 +2037,9 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
sock = sock_from_file(req->file, &ret); sock = sock_from_file(req->file, &ret);
if (sock) { if (sock) {
struct user_msghdr __user *msg; struct io_async_ctx io, *copy;
struct sockaddr_storage addr;
struct msghdr *kmsg;
unsigned flags; unsigned flags;
flags = READ_ONCE(sqe->msg_flags); flags = READ_ONCE(sqe->msg_flags);
...@@ -1911,30 +2048,59 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, ...@@ -1911,30 +2048,59 @@ static int io_send_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
else if (force_nonblock) else if (force_nonblock)
flags |= MSG_DONTWAIT; flags |= MSG_DONTWAIT;
msg = (struct user_msghdr __user *) (unsigned long) if (req->io) {
READ_ONCE(sqe->addr); kmsg = &req->io->msg.msg;
kmsg->msg_name = &addr;
} else {
kmsg = &io.msg.msg;
kmsg->msg_name = &addr;
io.msg.iov = io.msg.fast_iov;
ret = io_sendmsg_prep(req, &io);
if (ret)
goto out;
}
ret = fn(sock, msg, flags); ret = __sys_sendmsg_sock(sock, kmsg, flags);
if (force_nonblock && ret == -EAGAIN) if (force_nonblock && ret == -EAGAIN) {
copy = kmalloc(sizeof(*copy), GFP_KERNEL);
if (!copy) {
ret = -ENOMEM;
goto out;
}
memcpy(&copy->msg, &io.msg, sizeof(copy->msg));
req->io = copy;
memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe));
req->sqe = &req->io->sqe;
return ret; return ret;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
} }
out:
io_cqring_add_event(req, ret); io_cqring_add_event(req, ret);
if (ret < 0 && (req->flags & REQ_F_LINK)) if (ret < 0 && (req->flags & REQ_F_LINK))
req->flags |= REQ_F_FAIL_LINK; req->flags |= REQ_F_FAIL_LINK;
io_put_req_find_next(req, nxt); io_put_req_find_next(req, nxt);
return 0; return 0;
} #else
return -EOPNOTSUPP;
#endif #endif
}
static int io_sendmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, static int io_recvmsg_prep(struct io_kiocb *req, struct io_async_ctx *io)
struct io_kiocb **nxt, bool force_nonblock)
{ {
#if defined(CONFIG_NET) #if defined(CONFIG_NET)
return io_send_recvmsg(req, sqe, nxt, force_nonblock, const struct io_uring_sqe *sqe = req->sqe;
__sys_sendmsg_sock); struct user_msghdr __user *msg;
unsigned flags;
flags = READ_ONCE(sqe->msg_flags);
msg = (struct user_msghdr __user *)(unsigned long) READ_ONCE(sqe->addr);
return recvmsg_copy_msghdr(&io->msg.msg, msg, flags, &io->msg.uaddr,
&io->msg.iov);
#else #else
return -EOPNOTSUPP; return 0;
#endif #endif
} }
...@@ -1942,8 +2108,63 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe, ...@@ -1942,8 +2108,63 @@ static int io_recvmsg(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_kiocb **nxt, bool force_nonblock) struct io_kiocb **nxt, bool force_nonblock)
{ {
#if defined(CONFIG_NET) #if defined(CONFIG_NET)
return io_send_recvmsg(req, sqe, nxt, force_nonblock, struct socket *sock;
__sys_recvmsg_sock); int ret;
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
sock = sock_from_file(req->file, &ret);
if (sock) {
struct user_msghdr __user *msg;
struct io_async_ctx io, *copy;
struct sockaddr_storage addr;
struct msghdr *kmsg;
unsigned flags;
flags = READ_ONCE(sqe->msg_flags);
if (flags & MSG_DONTWAIT)
req->flags |= REQ_F_NOWAIT;
else if (force_nonblock)
flags |= MSG_DONTWAIT;
msg = (struct user_msghdr __user *) (unsigned long)
READ_ONCE(sqe->addr);
if (req->io) {
kmsg = &req->io->msg.msg;
kmsg->msg_name = &addr;
} else {
kmsg = &io.msg.msg;
kmsg->msg_name = &addr;
io.msg.iov = io.msg.fast_iov;
ret = io_recvmsg_prep(req, &io);
if (ret)
goto out;
}
ret = __sys_recvmsg_sock(sock, kmsg, msg, io.msg.uaddr, flags);
if (force_nonblock && ret == -EAGAIN) {
copy = kmalloc(sizeof(*copy), GFP_KERNEL);
if (!copy) {
ret = -ENOMEM;
goto out;
}
memcpy(copy, &io, sizeof(*copy));
req->io = copy;
memcpy(&req->io->sqe, req->sqe, sizeof(*req->sqe));
req->sqe = &req->io->sqe;
return ret;
}
if (ret == -ERESTARTSYS)
ret = -EINTR;
}
out:
io_cqring_add_event(req, ret);
if (ret < 0 && (req->flags & REQ_F_LINK))
req->flags |= REQ_F_FAIL_LINK;
io_put_req_find_next(req, nxt);
return 0;
#else #else
return -EOPNOTSUPP; return -EOPNOTSUPP;
#endif #endif
...@@ -1985,11 +2206,26 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe, ...@@ -1985,11 +2206,26 @@ static int io_accept(struct io_kiocb *req, const struct io_uring_sqe *sqe,
#endif #endif
} }
static int io_connect_prep(struct io_kiocb *req, struct io_async_ctx *io)
{
#if defined(CONFIG_NET)
const struct io_uring_sqe *sqe = req->sqe;
struct sockaddr __user *addr;
int addr_len;
addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
addr_len = READ_ONCE(sqe->addr2);
return move_addr_to_kernel(addr, addr_len, &io->connect.address);
#else
return 0;
#endif
}
static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
struct io_kiocb **nxt, bool force_nonblock) struct io_kiocb **nxt, bool force_nonblock)
{ {
#if defined(CONFIG_NET) #if defined(CONFIG_NET)
struct sockaddr __user *addr; struct io_async_ctx __io, *io;
unsigned file_flags; unsigned file_flags;
int addr_len, ret; int addr_len, ret;
...@@ -1998,15 +2234,35 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, ...@@ -1998,15 +2234,35 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags) if (sqe->ioprio || sqe->len || sqe->buf_index || sqe->rw_flags)
return -EINVAL; return -EINVAL;
addr = (struct sockaddr __user *) (unsigned long) READ_ONCE(sqe->addr);
addr_len = READ_ONCE(sqe->addr2); addr_len = READ_ONCE(sqe->addr2);
file_flags = force_nonblock ? O_NONBLOCK : 0; file_flags = force_nonblock ? O_NONBLOCK : 0;
ret = __sys_connect_file(req->file, addr, addr_len, file_flags); if (req->io) {
if (ret == -EAGAIN && force_nonblock) io = req->io;
} else {
ret = io_connect_prep(req, &__io);
if (ret)
goto out;
io = &__io;
}
ret = __sys_connect_file(req->file, &io->connect.address, addr_len,
file_flags);
if ((ret == -EAGAIN || ret == -EINPROGRESS) && force_nonblock) {
io = kmalloc(sizeof(*io), GFP_KERNEL);
if (!io) {
ret = -ENOMEM;
goto out;
}
memcpy(&io->connect, &__io.connect, sizeof(io->connect));
req->io = io;
memcpy(&io->sqe, req->sqe, sizeof(*req->sqe));
req->sqe = &io->sqe;
return -EAGAIN; return -EAGAIN;
}
if (ret == -ERESTARTSYS) if (ret == -ERESTARTSYS)
ret = -EINTR; ret = -EINTR;
out:
if (ret < 0 && (req->flags & REQ_F_LINK)) if (ret < 0 && (req->flags & REQ_F_LINK))
req->flags |= REQ_F_FAIL_LINK; req->flags |= REQ_F_FAIL_LINK;
io_cqring_add_event(req, ret); io_cqring_add_event(req, ret);
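
With this change a non-blocking connect that returns -EINPROGRESS is treated like -EAGAIN: the already-copied sockaddr is kept in req->io and the connect is retried from the async worker, so the application gets a single CQE instead of polling the socket itself. For contrast, this is the classic userspace dance the change makes unnecessary for io_uring users (standard POSIX; the address and timeout below are arbitrary, and an immediate success or failure simply skips the poll):

    #include <stdio.h>
    #include <errno.h>
    #include <string.h>
    #include <poll.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>

    int main(void)
    {
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        fcntl(fd, F_SETFL, O_NONBLOCK);

        struct sockaddr_in sa = { .sin_family = AF_INET, .sin_port = htons(80) };
        sa.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

        if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0 &&
            errno == EINPROGRESS) {
            /* Wait for writability, then read the final status. */
            struct pollfd p = { .fd = fd, .events = POLLOUT };
            poll(&p, 1, 5000);

            int err = 0;
            socklen_t len = sizeof(err);
            getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len);
            printf("connect result: %s\n", err ? strerror(err) : "ok");
        }
        close(fd);
        return 0;
    }
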
...@@ -2017,14 +2273,6 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe, ...@@ -2017,14 +2273,6 @@ static int io_connect(struct io_kiocb *req, const struct io_uring_sqe *sqe,
#endif #endif
} }
static inline void io_poll_remove_req(struct io_kiocb *req)
{
if (!RB_EMPTY_NODE(&req->rb_node)) {
rb_erase(&req->rb_node, &req->ctx->cancel_tree);
RB_CLEAR_NODE(&req->rb_node);
}
}
static void io_poll_remove_one(struct io_kiocb *req) static void io_poll_remove_one(struct io_kiocb *req)
{ {
struct io_poll_iocb *poll = &req->poll; struct io_poll_iocb *poll = &req->poll;
...@@ -2036,36 +2284,34 @@ static void io_poll_remove_one(struct io_kiocb *req) ...@@ -2036,36 +2284,34 @@ static void io_poll_remove_one(struct io_kiocb *req)
io_queue_async_work(req); io_queue_async_work(req);
} }
spin_unlock(&poll->head->lock); spin_unlock(&poll->head->lock);
io_poll_remove_req(req); hash_del(&req->hash_node);
} }
static void io_poll_remove_all(struct io_ring_ctx *ctx) static void io_poll_remove_all(struct io_ring_ctx *ctx)
{ {
struct rb_node *node; struct hlist_node *tmp;
struct io_kiocb *req; struct io_kiocb *req;
int i;
spin_lock_irq(&ctx->completion_lock); spin_lock_irq(&ctx->completion_lock);
while ((node = rb_first(&ctx->cancel_tree)) != NULL) { for (i = 0; i < (1U << ctx->cancel_hash_bits); i++) {
req = rb_entry(node, struct io_kiocb, rb_node); struct hlist_head *list;
io_poll_remove_one(req);
list = &ctx->cancel_hash[i];
hlist_for_each_entry_safe(req, tmp, list, hash_node)
io_poll_remove_one(req);
} }
spin_unlock_irq(&ctx->completion_lock); spin_unlock_irq(&ctx->completion_lock);
} }
 static int io_poll_cancel(struct io_ring_ctx *ctx, __u64 sqe_addr)
 {
-	struct rb_node *p, *parent = NULL;
+	struct hlist_head *list;
 	struct io_kiocb *req;
 
-	p = ctx->cancel_tree.rb_node;
-	while (p) {
-		parent = p;
-		req = rb_entry(parent, struct io_kiocb, rb_node);
-		if (sqe_addr < req->user_data) {
-			p = p->rb_left;
-		} else if (sqe_addr > req->user_data) {
-			p = p->rb_right;
-		} else {
+	list = &ctx->cancel_hash[hash_long(sqe_addr, ctx->cancel_hash_bits)];
+	hlist_for_each_entry(req, list, hash_node) {
+		if (sqe_addr == req->user_data) {
 			io_poll_remove_one(req);
 			return 0;
 		}
@@ -2147,7 +2393,7 @@ static void io_poll_complete_work(struct io_wq_work **workptr)
 		spin_unlock_irq(&ctx->completion_lock);
 		return;
 	}
-	io_poll_remove_req(req);
+	hash_del(&req->hash_node);
 	io_poll_complete(req, mask, ret);
 	spin_unlock_irq(&ctx->completion_lock);
@@ -2182,7 +2428,7 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
 	 * for finalizing the request, mark us as having grabbed that already.
 	 */
 	if (mask && spin_trylock_irqsave(&ctx->completion_lock, flags)) {
-		io_poll_remove_req(req);
+		hash_del(&req->hash_node);
 		io_poll_complete(req, mask, 0);
 		req->flags |= REQ_F_COMP_LOCKED;
 		io_put_req(req);
@@ -2220,20 +2466,10 @@ static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head,
 static void io_poll_req_insert(struct io_kiocb *req)
 {
 	struct io_ring_ctx *ctx = req->ctx;
-	struct rb_node **p = &ctx->cancel_tree.rb_node;
-	struct rb_node *parent = NULL;
-	struct io_kiocb *tmp;
-
-	while (*p) {
-		parent = *p;
-		tmp = rb_entry(parent, struct io_kiocb, rb_node);
-		if (req->user_data < tmp->user_data)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-	rb_link_node(&req->rb_node, parent, p);
-	rb_insert_color(&req->rb_node, &ctx->cancel_tree);
+	struct hlist_head *list;
+
+	list = &ctx->cancel_hash[hash_long(req->user_data, ctx->cancel_hash_bits)];
+	hlist_add_head(&req->hash_node, list);
 }
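The poll cancellation bookkeeping above moves from an rbtree keyed on user_data to a fixed-size array of hlist buckets indexed by hash_long(): insertion becomes O(1) and a cancel only walks one bucket instead of paying the insertion-sort cost the merge message mentions. A hedged user-space sketch of the same bucket-per-hash pattern follows; the hash function, bucket count and types are invented for the example and are not the kernel's.

/* Sketch of the hash-bucket layout that replaces the cancel rbtree. */
#include <stddef.h>
#include <stdint.h>

#define CANCEL_HASH_BITS 4                      /* 16 buckets for the example */

struct poll_req {
	uint64_t user_data;                     /* cancellation key            */
	struct poll_req *next;                  /* singly linked bucket chain  */
};

static struct poll_req *cancel_hash[1U << CANCEL_HASH_BITS];

static unsigned bucket_of(uint64_t key)
{
	/* multiplicative hash, keeps the top CANCEL_HASH_BITS bits */
	return (unsigned)((key * 0x9E3779B97F4A7C15ULL) >> (64 - CANCEL_HASH_BITS));
}

static void poll_insert(struct poll_req *req)   /* O(1), no rebalancing        */
{
	unsigned b = bucket_of(req->user_data);

	req->next = cancel_hash[b];
	cancel_hash[b] = req;
}

static struct poll_req *poll_cancel(uint64_t key) /* scan one bucket only      */
{
	struct poll_req *req;

	for (req = cancel_hash[bucket_of(key)]; req; req = req->next)
		if (req->user_data == key)
			return req;
	return NULL;
}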
 static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
@@ -2257,11 +2493,11 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	if (!poll->wait)
 		return -ENOMEM;
 
-	req->sqe = NULL;
+	req->io = NULL;
 	INIT_IO_WORK(&req->work, io_poll_complete_work);
 	events = READ_ONCE(sqe->poll_events);
 	poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP;
-	RB_CLEAR_NODE(&req->rb_node);
+	INIT_HLIST_NODE(&req->hash_node);
 
 	poll->head = NULL;
 	poll->done = false;
@@ -2368,7 +2604,7 @@ static int io_timeout_cancel(struct io_ring_ctx *ctx, __u64 user_data)
 	if (ret == -ENOENT)
 		return ret;
 
-	ret = hrtimer_try_to_cancel(&req->timeout.data->timer);
+	ret = hrtimer_try_to_cancel(&req->io->timeout.timer);
 	if (ret == -1)
 		return -EALREADY;
@@ -2410,7 +2646,8 @@ static int io_timeout_remove(struct io_kiocb *req,
 	return 0;
 }
 
-static int io_timeout_setup(struct io_kiocb *req)
+static int io_timeout_prep(struct io_kiocb *req, struct io_async_ctx *io,
+			   bool is_timeout_link)
 {
 	const struct io_uring_sqe *sqe = req->sqe;
 	struct io_timeout_data *data;
@@ -2420,15 +2657,14 @@ static int io_timeout_setup(struct io_kiocb *req)
 		return -EINVAL;
 	if (sqe->ioprio || sqe->buf_index || sqe->len != 1)
 		return -EINVAL;
+	if (sqe->off && is_timeout_link)
+		return -EINVAL;
 	flags = READ_ONCE(sqe->timeout_flags);
 	if (flags & ~IORING_TIMEOUT_ABS)
 		return -EINVAL;
 
-	data = kzalloc(sizeof(struct io_timeout_data), GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
+	data = &io->timeout;
 	data->req = req;
-	req->timeout.data = data;
 	req->flags |= REQ_F_TIMEOUT;
 
 	if (get_timespec64(&data->ts, u64_to_user_ptr(sqe->addr)))
@@ -2440,6 +2676,7 @@ static int io_timeout_setup(struct io_kiocb *req)
 		data->mode = HRTIMER_MODE_REL;
 
 	hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
+	req->io = io;
 	return 0;
 }
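io_timeout_prep() above no longer kzalloc()s its own io_timeout_data; it fills a caller-supplied io_async_ctx, so the same prep can run inline, from the defer path, or for a linked timeout. The ownership rule is: the caller allocates, prep only validates and fills, and the caller frees on error. A small hedged sketch of that convention, with all names invented for illustration:

/* Sketch of "prep fills caller-owned storage; caller frees on error". */
#include <stdlib.h>

struct timeout_state { long secs; long nsecs; int absolute; };
struct async_ctx     { struct timeout_state timeout; };

static int timeout_prep(struct async_ctx *ctx, long secs, long nsecs, int abs)
{
	if (secs < 0 || nsecs < 0)
		return -1;                      /* validation only, no allocation */
	ctx->timeout.secs = secs;
	ctx->timeout.nsecs = nsecs;
	ctx->timeout.absolute = abs;
	return 0;
}

static struct async_ctx *arm_timeout(long secs, long nsecs)
{
	struct async_ctx *ctx = malloc(sizeof(*ctx));

	if (!ctx)
		return NULL;
	if (timeout_prep(ctx, secs, nsecs, 0)) {
		free(ctx);                      /* caller owns the memory on error */
		return NULL;
	}
	return ctx;                             /* ready to hand to a timer        */
}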
@@ -2448,16 +2685,24 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	unsigned count;
 	struct io_ring_ctx *ctx = req->ctx;
 	struct io_timeout_data *data;
+	struct io_async_ctx *io;
 	struct list_head *entry;
 	unsigned span = 0;
-	int ret;
 
-	ret = io_timeout_setup(req);
-	/* common setup allows flags (like links) set, we don't */
-	if (!ret && sqe->flags)
-		ret = -EINVAL;
-	if (ret)
-		return ret;
+	io = req->io;
+	if (!io) {
+		int ret;
+
+		io = kmalloc(sizeof(*io), GFP_KERNEL);
+		if (!io)
+			return -ENOMEM;
+		ret = io_timeout_prep(req, io, false);
+		if (ret) {
+			kfree(io);
+			return ret;
+		}
+	}
+	data = &req->io->timeout;
 
 	/*
 	 * sqe->off holds how many events that need to occur for this
@@ -2473,7 +2718,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	}
 	req->sequence = ctx->cached_sq_head + count - 1;
-	req->timeout.data->seq_offset = count;
+	data->seq_offset = count;
 
 	/*
 	 * Insertion sort, ensuring the first entry in the list is always
@@ -2484,7 +2729,7 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 		struct io_kiocb *nxt = list_entry(entry, struct io_kiocb, list);
 		unsigned nxt_sq_head;
 		long long tmp, tmp_nxt;
-		u32 nxt_offset = nxt->timeout.data->seq_offset;
+		u32 nxt_offset = nxt->io->timeout.seq_offset;
 
 		if (nxt->flags & REQ_F_TIMEOUT_NOSEQ)
 			continue;
@@ -2517,7 +2762,6 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 	req->sequence -= span;
 add:
 	list_add(&req->list, entry);
-	data = req->timeout.data;
 	data->timer.function = io_timeout_fn;
 	hrtimer_start(&data->timer, timespec64_to_ktime(data->ts), data->mode);
 	spin_unlock_irq(&ctx->completion_lock);
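The "Insertion sort" comment in the hunk above describes keeping the pending timeouts ordered by how many further completions each one still needs (its seq_offset relative to the cached SQ head), so the head of the list is always the next timeout to fire. A stripped-down sketch of that idea on a plain singly linked list, for illustration only:

/* Keep timeouts sorted by the number of completions still required. */
struct pending_timeout {
	unsigned long events_left;              /* completions still required */
	struct pending_timeout *next;
};

static void timeout_insert(struct pending_timeout **head,
			   struct pending_timeout *node)
{
	struct pending_timeout **pp = head;

	/* walk until the next entry needs more events than we do */
	while (*pp && (*pp)->events_left <= node->events_left)
		pp = &(*pp)->next;
	node->next = *pp;
	*pp = node;
}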
@@ -2598,30 +2842,76 @@ static int io_async_cancel(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 	return 0;
 }
 
+static int io_req_defer_prep(struct io_kiocb *req, struct io_async_ctx *io)
+{
+	struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
+	struct iov_iter iter;
+	ssize_t ret;
+
+	memcpy(&io->sqe, req->sqe, sizeof(io->sqe));
+	req->sqe = &io->sqe;
+
+	switch (io->sqe.opcode) {
+	case IORING_OP_READV:
+	case IORING_OP_READ_FIXED:
+		ret = io_read_prep(req, &iovec, &iter, true);
+		break;
+	case IORING_OP_WRITEV:
+	case IORING_OP_WRITE_FIXED:
+		ret = io_write_prep(req, &iovec, &iter, true);
+		break;
+	case IORING_OP_SENDMSG:
+		ret = io_sendmsg_prep(req, io);
+		break;
+	case IORING_OP_RECVMSG:
+		ret = io_recvmsg_prep(req, io);
+		break;
+	case IORING_OP_CONNECT:
+		ret = io_connect_prep(req, io);
+		break;
+	case IORING_OP_TIMEOUT:
+		return io_timeout_prep(req, io, false);
+	case IORING_OP_LINK_TIMEOUT:
+		return io_timeout_prep(req, io, true);
+	default:
+		req->io = io;
+		return 0;
+	}
+
+	if (ret < 0)
+		return ret;
+
+	req->io = io;
+	io_req_map_io(req, ret, iovec, inline_vecs, &iter);
+	return 0;
+}
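io_req_defer_prep() above is the single place that copies the SQE into the async context and runs the per-opcode prep, so a deferred or linked request no longer depends on userspace keeping the SQE, iovec or msghdr stable; that is what IORING_FEAT_SUBMIT_STABLE later advertises. A hedged sketch of the same "copy, then dispatch by opcode" shape, with the opcodes and context layout reduced to stand-ins:

/* Simplified restatement of the copy-then-prep pattern; not the kernel types. */
#include <string.h>

enum opcode { OP_READV, OP_WRITEV, OP_TIMEOUT };

struct sqe        { enum opcode opcode; unsigned flags; };
struct async_copy { struct sqe sqe; char scratch[64]; };

struct request {
	const struct sqe *sqe;       /* initially points into the shared ring */
	struct async_copy *io;
};

static int defer_prep(struct request *req, struct async_copy *io)
{
	memcpy(&io->sqe, req->sqe, sizeof(io->sqe));  /* detach from the ring */
	req->sqe = &io->sqe;
	req->io = io;

	switch (io->sqe.opcode) {
	case OP_READV:
	case OP_WRITEV:
		/* a real implementation would also copy the iovec here */
		return 0;
	case OP_TIMEOUT:
		/* timeout-specific validation would go here */
		return 0;
	default:
		return 0;                /* nothing extra to stash */
	}
}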
 static int io_req_defer(struct io_kiocb *req)
 {
-	struct io_uring_sqe *sqe_copy;
 	struct io_ring_ctx *ctx = req->ctx;
+	struct io_async_ctx *io;
+	int ret;
 
 	/* Still need defer if there is pending req in defer list. */
 	if (!req_need_defer(req) && list_empty(&ctx->defer_list))
 		return 0;
 
-	sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL);
-	if (!sqe_copy)
+	io = kmalloc(sizeof(*io), GFP_KERNEL);
+	if (!io)
 		return -EAGAIN;
 
+	ret = io_req_defer_prep(req, io);
+	if (ret < 0) {
+		kfree(io);
+		return ret;
+	}
+
 	spin_lock_irq(&ctx->completion_lock);
 	if (!req_need_defer(req) && list_empty(&ctx->defer_list)) {
 		spin_unlock_irq(&ctx->completion_lock);
-		kfree(sqe_copy);
 		return 0;
 	}
 
-	memcpy(sqe_copy, req->sqe, sizeof(*sqe_copy));
-	req->flags |= REQ_F_FREE_SQE;
-	req->sqe = sqe_copy;
-
 	trace_io_uring_defer(ctx, req, req->user_data);
 	list_add_tail(&req->list, &ctx->defer_list);
 	spin_unlock_irq(&ctx->completion_lock);
@@ -2876,10 +3166,11 @@ static enum hrtimer_restart io_link_timeout_fn(struct hrtimer *timer)
 	 * We don't expect the list to be empty, that will only happen if we
 	 * race with the completion of the linked work.
 	 */
-	if (!list_empty(&req->list)) {
-		prev = list_entry(req->list.prev, struct io_kiocb, link_list);
+	if (!list_empty(&req->link_list)) {
+		prev = list_entry(req->link_list.prev, struct io_kiocb,
+				  link_list);
 		if (refcount_inc_not_zero(&prev->refs)) {
-			list_del_init(&req->list);
+			list_del_init(&req->link_list);
 			prev->flags &= ~REQ_F_LINK_TIMEOUT;
 		} else
 			prev = NULL;
@@ -2909,8 +3200,8 @@ static void io_queue_linked_timeout(struct io_kiocb *req)
 	 * we got a chance to setup the timer
 	 */
 	spin_lock_irq(&ctx->completion_lock);
-	if (!list_empty(&req->list)) {
-		struct io_timeout_data *data = req->timeout.data;
+	if (!list_empty(&req->link_list)) {
+		struct io_timeout_data *data = &req->io->timeout;
 
 		data->timer.function = io_link_timeout_fn;
 		hrtimer_start(&data->timer, timespec64_to_ktime(data->ts),
@@ -2929,7 +3220,8 @@ static struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req)
 	if (!(req->flags & REQ_F_LINK))
 		return NULL;
 
-	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb, list);
+	nxt = list_first_entry_or_null(&req->link_list, struct io_kiocb,
+					link_list);
 	if (!nxt || nxt->sqe->opcode != IORING_OP_LINK_TIMEOUT)
 		return NULL;
@@ -2953,15 +3245,6 @@ static void __io_queue_sqe(struct io_kiocb *req)
 	 */
 	if (ret == -EAGAIN && (!(req->flags & REQ_F_NOWAIT) ||
 	    (req->flags & REQ_F_MUST_PUNT))) {
-		struct io_uring_sqe *sqe_copy;
-
-		sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
-		if (!sqe_copy)
-			goto err;
-
-		req->sqe = sqe_copy;
-		req->flags |= REQ_F_FREE_SQE;
-
 		if (req->work.flags & IO_WQ_WORK_NEEDS_FILES) {
 			ret = io_grab_files(req);
 			if (ret)
@@ -3030,7 +3313,7 @@ static inline void io_queue_link_head(struct io_kiocb *req)
 #define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK)
 
-static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
+static bool io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 			  struct io_kiocb **link)
 {
 	struct io_ring_ctx *ctx = req->ctx;
@@ -3049,7 +3332,7 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 err_req:
 		io_cqring_add_event(req, ret);
 		io_double_put_req(req);
-		return;
+		return false;
 	}
 
 	/*
@@ -3061,32 +3344,25 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 	 */
 	if (*link) {
 		struct io_kiocb *prev = *link;
-		struct io_uring_sqe *sqe_copy;
+		struct io_async_ctx *io;
 
 		if (req->sqe->flags & IOSQE_IO_DRAIN)
 			(*link)->flags |= REQ_F_DRAIN_LINK | REQ_F_IO_DRAIN;
 
-		if (READ_ONCE(req->sqe->opcode) == IORING_OP_LINK_TIMEOUT) {
-			ret = io_timeout_setup(req);
-			/* common setup allows offset being set, we don't */
-			if (!ret && req->sqe->off)
-				ret = -EINVAL;
-			if (ret) {
-				prev->flags |= REQ_F_FAIL_LINK;
-				goto err_req;
-			}
-		}
-
-		sqe_copy = kmemdup(req->sqe, sizeof(*sqe_copy), GFP_KERNEL);
-		if (!sqe_copy) {
+		io = kmalloc(sizeof(*io), GFP_KERNEL);
+		if (!io) {
 			ret = -EAGAIN;
 			goto err_req;
 		}
 
-		req->sqe = sqe_copy;
-		req->flags |= REQ_F_FREE_SQE;
+		ret = io_req_defer_prep(req, io);
+		if (ret) {
+			kfree(io);
+			prev->flags |= REQ_F_FAIL_LINK;
+			goto err_req;
+		}
 		trace_io_uring_link(ctx, req, prev);
-		list_add_tail(&req->list, &prev->link_list);
+		list_add_tail(&req->link_list, &prev->link_list);
 	} else if (req->sqe->flags & IOSQE_IO_LINK) {
 		req->flags |= REQ_F_LINK;
 
@@ -3095,6 +3371,8 @@ static void io_submit_sqe(struct io_kiocb *req, struct io_submit_state *state,
 	} else {
 		io_queue_sqe(req);
 	}
+
+	return true;
 }
 /*
@@ -3113,7 +3391,7 @@ static void io_submit_state_end(struct io_submit_state *state)
  * Start submission side cache.
  */
 static void io_submit_state_start(struct io_submit_state *state,
-				  struct io_ring_ctx *ctx, unsigned max_ios)
+				  unsigned int max_ios)
 {
 	blk_start_plug(&state->plug);
 	state->free_reqs = 0;
@@ -3197,7 +3475,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		return -EBUSY;
 
 	if (nr > IO_PLUG_THRESHOLD) {
-		io_submit_state_start(&state, ctx, nr);
+		io_submit_state_start(&state, nr);
 		statep = &state;
 	}
@@ -3224,6 +3502,7 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 			}
 		}
 
+		submitted++;
 		sqe_flags = req->sqe->flags;
 
 		req->ring_file = ring_file;
@@ -3233,9 +3512,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
 		req->needs_fixed_file = async;
 		trace_io_uring_submit_sqe(ctx, req->sqe->user_data,
 					  true, async);
-		io_submit_sqe(req, statep, &link);
-		submitted++;
+		if (!io_submit_sqe(req, statep, &link))
+			break;
 
 		/*
 		 * If previous wasn't linked and we have a linked command,
 		 * that's the end of the chain. Submit the previous link.
@@ -4363,6 +4641,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
 	free_uid(ctx->user);
 	put_cred(ctx->creds);
 	kfree(ctx->completions);
+	kfree(ctx->cancel_hash);
 	kmem_cache_free(req_cachep, ctx->fallback_req);
 	kfree(ctx);
 }
@@ -4759,7 +5038,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
 	ctx->compat = in_compat_syscall();
 	ctx->account_mem = account_mem;
 	ctx->user = user;
-	ctx->creds = prepare_creds();
+	ctx->creds = get_current_cred();
 
 	ret = io_allocate_scq_urings(ctx, p);
 	if (ret)
@@ -4794,7 +5073,8 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
 	if (ret < 0)
 		goto err;
 
-	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP;
+	p->features = IORING_FEAT_SINGLE_MMAP | IORING_FEAT_NODROP |
+			IORING_FEAT_SUBMIT_STABLE;
 
 	trace_io_uring_create(ret, ctx, p->sq_entries, p->cq_entries, p->flags);
 	return ret;
 err:
...
@@ -357,8 +357,7 @@ typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx,
 #define BLK_ALL_ZONES  ((unsigned int)-1)
 int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 			unsigned int nr_zones, report_zones_cb cb, void *data);
-extern unsigned int blkdev_nr_zones(struct block_device *bdev);
+unsigned int blkdev_nr_zones(struct gendisk *disk);
 extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
 			    sector_t sectors, sector_t nr_sectors,
 			    gfp_t gfp_mask);
@@ -371,12 +370,7 @@ extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
 #else /* CONFIG_BLK_DEV_ZONED */
 
-static inline unsigned int blkdev_nr_zones(struct block_device *bdev)
-{
-	return 0;
-}
-
-static inline int blk_revalidate_disk_zones(struct gendisk *disk)
+static inline unsigned int blkdev_nr_zones(struct gendisk *disk)
 {
 	return 0;
 }
@@ -504,9 +498,9 @@ struct request_queue {
 	/*
 	 * Zoned block device information for request dispatch control.
 	 * nr_zones is the total number of zones of the device. This is always
-	 * 0 for regular block devices. seq_zones_bitmap is a bitmap of nr_zones
-	 * bits which indicates if a zone is conventional (bit clear) or
-	 * sequential (bit set). seq_zones_wlock is a bitmap of nr_zones
+	 * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones
+	 * bits which indicates if a zone is conventional (bit set) or
+	 * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones
 	 * bits which indicates if a zone is write locked, that is, if a write
 	 * request targeting the zone was dispatched. All three fields are
 	 * initialized by the low level device driver (e.g. scsi/sd.c).
@@ -519,7 +513,7 @@ struct request_queue {
 	 * blk_mq_unfreeze_queue().
 	 */
 	unsigned int		nr_zones;
-	unsigned long		*seq_zones_bitmap;
+	unsigned long		*conv_zones_bitmap;
 	unsigned long		*seq_zones_wlock;
 #endif /* CONFIG_BLK_DEV_ZONED */
@@ -724,9 +718,11 @@ static inline unsigned int blk_queue_zone_no(struct request_queue *q,
 static inline bool blk_queue_zone_is_seq(struct request_queue *q,
 					 sector_t sector)
 {
-	if (!blk_queue_is_zoned(q) || !q->seq_zones_bitmap)
+	if (!blk_queue_is_zoned(q))
 		return false;
-	return test_bit(blk_queue_zone_no(q, sector), q->seq_zones_bitmap);
+	if (!q->conv_zones_bitmap)
+		return true;
+	return !test_bit(blk_queue_zone_no(q, sector), q->conv_zones_bitmap);
 }
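With the lazily allocated conv_zones_bitmap above, a missing bitmap now means "no conventional zones", so every zone of a zoned queue is treated as sequential, and the bit sense is inverted compared to the old seq_zones_bitmap. A stand-alone restatement of that decision logic, with the queue fields reduced to plain parameters purely for illustration:

/* Same three-way decision as the new blk_queue_zone_is_seq() above. */
#include <stdbool.h>

static bool zone_is_seq(bool queue_is_zoned,
			const unsigned long *conv_zones_bitmap,
			unsigned int zone_no)
{
	if (!queue_is_zoned)
		return false;            /* regular device: no zones at all      */
	if (!conv_zones_bitmap)
		return true;             /* bitmap never allocated: all sequential */
	/* bit set means "conventional", so sequential is the inverted test */
	return !(conv_zones_bitmap[zone_no / (8 * sizeof(long))] &
		 (1UL << (zone_no % (8 * sizeof(long)))));
}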
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline unsigned int blk_queue_nr_zones(struct request_queue *q)
...
@@ -87,26 +87,24 @@ struct bvec_iter_all {
 static inline bool bvec_iter_advance(const struct bio_vec *bv,
 		struct bvec_iter *iter, unsigned bytes)
 {
+	unsigned int idx = iter->bi_idx;
+
 	if (WARN_ONCE(bytes > iter->bi_size,
 		     "Attempted to advance past end of bvec iter\n")) {
 		iter->bi_size = 0;
 		return false;
 	}
 
-	while (bytes) {
-		const struct bio_vec *cur = bv + iter->bi_idx;
-		unsigned len = min3(bytes, iter->bi_size,
-				    cur->bv_len - iter->bi_bvec_done);
-
-		bytes -= len;
-		iter->bi_size -= len;
-		iter->bi_bvec_done += len;
+	iter->bi_size -= bytes;
+	bytes += iter->bi_bvec_done;
 
-		if (iter->bi_bvec_done == cur->bv_len) {
-			iter->bi_bvec_done = 0;
-			iter->bi_idx++;
-		}
+	while (bytes && bytes >= bv[idx].bv_len) {
+		bytes -= bv[idx].bv_len;
+		idx++;
 	}
 
+	iter->bi_idx = idx;
+	iter->bi_bvec_done = bytes;
 	return true;
 }
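The rewritten bvec_iter_advance() above does the whole advance with one subtraction from bi_size and a loop that only steps over whole bvecs, instead of nibbling per segment. A small user-space sketch of the same iterator-advance idea over an array of fixed-length segments; the types are simplified stand-ins, not the kernel's:

/* Advance an iterator by whole segments; remainder becomes the new offset. */
#include <stdbool.h>
#include <stdio.h>

struct seg  { unsigned len; };
struct iter { unsigned idx; unsigned done; unsigned size; };

static bool iter_advance(const struct seg *v, struct iter *it, unsigned bytes)
{
	unsigned idx = it->idx;

	if (bytes > it->size)
		return false;                   /* would run past the end   */

	it->size -= bytes;                      /* total remaining shrinks once */
	bytes += it->done;                      /* fold in the partial segment  */

	while (bytes && bytes >= v[idx].len) {  /* step whole segments only     */
		bytes -= v[idx].len;
		idx++;
	}

	it->idx = idx;                          /* leftover is the new offset   */
	it->done = bytes;
	return true;
}

int main(void)
{
	struct seg v[] = { { 512 }, { 1024 }, { 256 } };
	struct iter it = { 0, 0, 1792 };

	iter_advance(v, &it, 600);              /* lands 88 bytes into segment 1 */
	printf("idx=%u done=%u size=%u\n", it.idx, it.done, it.size);
	return 0;
}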
...
@@ -378,12 +378,19 @@ extern int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg,
 extern int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg,
 			  unsigned int vlen, unsigned int flags,
 			  bool forbid_cmsg_compat);
-extern long __sys_sendmsg_sock(struct socket *sock,
-			       struct user_msghdr __user *msg,
+extern long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg,
 			       unsigned int flags);
-extern long __sys_recvmsg_sock(struct socket *sock,
-			       struct user_msghdr __user *msg,
+extern long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
+			       struct user_msghdr __user *umsg,
+			       struct sockaddr __user *uaddr,
 			       unsigned int flags);
+extern int sendmsg_copy_msghdr(struct msghdr *msg,
+			       struct user_msghdr __user *umsg, unsigned flags,
+			       struct iovec **iov);
+extern int recvmsg_copy_msghdr(struct msghdr *msg,
+			       struct user_msghdr __user *umsg, unsigned flags,
+			       struct sockaddr __user **uaddr,
+			       struct iovec **iov);
 
 /* helpers which do the actual work for syscalls */
 extern int __sys_recvfrom(int fd, void __user *ubuf, size_t size,
@@ -399,9 +406,8 @@ extern int __sys_accept4(int fd, struct sockaddr __user *upeer_sockaddr,
 			 int __user *upeer_addrlen, int flags);
 extern int __sys_socket(int family, int type, int protocol);
 extern int __sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen);
-extern int __sys_connect_file(struct file *file,
-			      struct sockaddr __user *uservaddr, int addrlen,
-			      int file_flags);
+extern int __sys_connect_file(struct file *file, struct sockaddr_storage *addr,
+			      int addrlen, int file_flags);
 extern int __sys_connect(int fd, struct sockaddr __user *uservaddr,
 			 int addrlen);
 extern int __sys_listen(int fd, int backlog);
...
@@ -157,6 +157,7 @@ struct io_uring_params {
  */
 #define IORING_FEAT_SINGLE_MMAP		(1U << 0)
 #define IORING_FEAT_NODROP		(1U << 1)
+#define IORING_FEAT_SUBMIT_STABLE	(1U << 2)
 
 /*
  * io_uring_register(2) opcodes and arguments
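Userspace can test the features field filled in by io_uring_setup(2) to learn whether this kernel copies submission state at submit time. A hedged sketch of that probe using the raw syscall follows; it assumes kernel headers new enough to provide IORING_FEAT_SUBMIT_STABLE and __NR_io_uring_setup (liburing would normally wrap all of this):

/* Probe for IORING_FEAT_SUBMIT_STABLE; error handling kept minimal. */
#include <linux/io_uring.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	struct io_uring_params p;
	int fd;

	memset(&p, 0, sizeof(p));
	fd = syscall(__NR_io_uring_setup, 8, &p);   /* 8-entry ring */
	if (fd < 0) {
		perror("io_uring_setup");
		return 1;
	}

	if (p.features & IORING_FEAT_SUBMIT_STABLE)
		printf("submit-time data is copied; iovecs/msghdrs may be "
		       "reused as soon as submit returns\n");

	close(fd);
	return 0;
}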
...
@@ -1826,26 +1826,22 @@ SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
  * include the -EINPROGRESS status for such sockets.
  */
-int __sys_connect_file(struct file *file, struct sockaddr __user *uservaddr,
+int __sys_connect_file(struct file *file, struct sockaddr_storage *address,
 		       int addrlen, int file_flags)
 {
 	struct socket *sock;
-	struct sockaddr_storage address;
 	int err;
 
 	sock = sock_from_file(file, &err);
 	if (!sock)
 		goto out;
 
-	err = move_addr_to_kernel(uservaddr, addrlen, &address);
-	if (err < 0)
-		goto out;
-
 	err =
-	    security_socket_connect(sock, (struct sockaddr *)&address, addrlen);
+	    security_socket_connect(sock, (struct sockaddr *)address, addrlen);
 	if (err)
 		goto out;
 
-	err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
+	err = sock->ops->connect(sock, (struct sockaddr *)address, addrlen,
 				 sock->file->f_flags | file_flags);
 out:
 	return err;
@@ -1858,7 +1854,11 @@ int __sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen)
 	f = fdget(fd);
 	if (f.file) {
-		ret = __sys_connect_file(f.file, uservaddr, addrlen, 0);
+		struct sockaddr_storage address;
+
+		ret = move_addr_to_kernel(uservaddr, addrlen, &address);
+		if (!ret)
+			ret = __sys_connect_file(f.file, &address, addrlen, 0);
 		if (f.flags)
 			fput(f.file);
 	}
@@ -2346,9 +2346,9 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys,
 	return err;
 }
 
-static int sendmsg_copy_msghdr(struct msghdr *msg,
-			       struct user_msghdr __user *umsg, unsigned flags,
-			       struct iovec **iov)
+int sendmsg_copy_msghdr(struct msghdr *msg,
+			struct user_msghdr __user *umsg, unsigned flags,
+			struct iovec **iov)
 {
 	int err;
@@ -2390,27 +2390,14 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
 /*
  * BSD sendmsg interface
  */
-long __sys_sendmsg_sock(struct socket *sock, struct user_msghdr __user *umsg,
+long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg,
 			unsigned int flags)
 {
-	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
-	struct sockaddr_storage address;
-	struct msghdr msg = { .msg_name = &address };
-	ssize_t err;
-
-	err = sendmsg_copy_msghdr(&msg, umsg, flags, &iov);
-	if (err)
-		return err;
-
 	/* disallow ancillary data requests from this path */
-	if (msg.msg_control || msg.msg_controllen) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (msg->msg_control || msg->msg_controllen)
+		return -EINVAL;
 
-	err = ____sys_sendmsg(sock, &msg, flags, NULL, 0);
-out:
-	kfree(iov);
-	return err;
+	return ____sys_sendmsg(sock, msg, flags, NULL, 0);
 }
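With the change above, __sys_sendmsg_sock() takes an already-built struct msghdr, and the copy from the user_msghdr is hoisted into the now-exported sendmsg_copy_msghdr(); that lets io_uring copy the header once at prep time and retry later without touching user memory again. The fragment below sketches that split from the point of view of a hypothetical in-kernel caller; the caller itself is invented for the example (only the two helpers are real) and it assumes the usual net/socket.c includes:

/* Hypothetical caller: copy the user msghdr once, then (re)send from the copy. */
static long example_send_once(struct socket *sock,
			      struct user_msghdr __user *umsg,
			      unsigned int flags)
{
	struct sockaddr_storage address;
	struct msghdr msg = { .msg_name = &address };
	struct iovec inline_vecs[UIO_FASTIOV], *iov = inline_vecs;
	long ret;

	/* single copy_from_user pass; after this, umsg may change under us */
	ret = sendmsg_copy_msghdr(&msg, umsg, flags, &iov);
	if (ret)
		return ret;

	/* could be called again later (e.g. from a worker) with the same msg */
	ret = __sys_sendmsg_sock(sock, &msg, flags);

	kfree(iov);	/* NULL when the inline iovec was used; kfree(NULL) is a no-op */
	return ret;
}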
 long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
@@ -2516,10 +2503,10 @@ SYSCALL_DEFINE4(sendmmsg, int, fd, struct mmsghdr __user *, mmsg,
 	return __sys_sendmmsg(fd, mmsg, vlen, flags, true);
 }
 
-static int recvmsg_copy_msghdr(struct msghdr *msg,
-			       struct user_msghdr __user *umsg, unsigned flags,
-			       struct sockaddr __user **uaddr,
-			       struct iovec **iov)
+int recvmsg_copy_msghdr(struct msghdr *msg,
+			struct user_msghdr __user *umsg, unsigned flags,
+			struct sockaddr __user **uaddr,
+			struct iovec **iov)
 {
 	ssize_t err;
@@ -2609,28 +2596,15 @@ static int ___sys_recvmsg(struct socket *sock, struct user_msghdr __user *msg,
  * BSD recvmsg interface
  */
-long __sys_recvmsg_sock(struct socket *sock, struct user_msghdr __user *umsg,
-			unsigned int flags)
+long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg,
+			struct user_msghdr __user *umsg,
+			struct sockaddr __user *uaddr, unsigned int flags)
 {
-	struct iovec iovstack[UIO_FASTIOV], *iov = iovstack;
-	struct sockaddr_storage address;
-	struct msghdr msg = { .msg_name = &address };
-	struct sockaddr __user *uaddr;
-	ssize_t err;
-
-	err = recvmsg_copy_msghdr(&msg, umsg, flags, &uaddr, &iov);
-	if (err)
-		return err;
-
 	/* disallow ancillary data requests from this path */
-	if (msg.msg_control || msg.msg_controllen) {
-		err = -EINVAL;
-		goto out;
-	}
+	if (msg->msg_control || msg->msg_controllen)
+		return -EINVAL;
 
-	err = ____sys_recvmsg(sock, &msg, umsg, uaddr, flags, 0);
-out:
-	kfree(iov);
-	return err;
+	return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0);
 }
 
 long __sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned int flags,
...