Commit ac8f7a02 authored by Jens Axboe

Merge branch 'for-5.10/block' into for-5.10/drivers

* for-5.10/block: (140 commits)
  bdi: replace BDI_CAP_NO_{WRITEBACK,ACCT_DIRTY} with a single flag
  bdi: invert BDI_CAP_NO_ACCT_WB
  bdi: replace BDI_CAP_STABLE_WRITES with a queue and a sb flag
  mm: use SWP_SYNCHRONOUS_IO more intelligently
  bdi: remove BDI_CAP_SYNCHRONOUS_IO
  bdi: remove BDI_CAP_CGROUP_WRITEBACK
  block: lift setting the readahead size into the block layer
  md: update the optimal I/O size on reshape
  bdi: initialize ->ra_pages and ->io_pages in bdi_init
  aoe: set an optimal I/O size
  bcache: inherit the optimal I/O size
  drbd: remove dead code in device_to_statistics
  fs: remove the unused SB_I_MULTIROOT flag
  block: mark blkdev_get static
  PM: mm: cleanup swsusp_swap_check
  mm: split swap_type_of
  PM: rewrite is_hibernate_resume_dev to not require an inode
  mm: cleanup claim_swapfile
  ocfs2: cleanup o2hb_region_dev_store
  dasd: cleanup dasd_scan_partitions
  ...
parents 805c6d3c f56753ac
...@@ -488,9 +488,6 @@ getgeo: no ...@@ -488,9 +488,6 @@ getgeo: no
swap_slot_free_notify: no (see below) swap_slot_free_notify: no (see below)
======================= =================== ======================= ===================
unlock_native_capacity and revalidate_disk are called only from
check_disk_change().
swap_slot_free_notify is called with swap_lock and sometimes the page lock swap_slot_free_notify is called with swap_lock and sometimes the page lock
held. held.
......
...@@ -161,8 +161,6 @@ config BLK_WBT_MQ ...@@ -161,8 +161,6 @@ config BLK_WBT_MQ
depends on BLK_WBT depends on BLK_WBT
help help
Enable writeback throttling by default on multiqueue devices. Enable writeback throttling by default on multiqueue devices.
Multiqueue currently doesn't have support for IO scheduling,
enabling this option is recommended.
config BLK_DEBUG_FS config BLK_DEBUG_FS
bool "Block layer debugging information in debugfs" bool "Block layer debugging information in debugfs"
......
...@@ -4640,6 +4640,9 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) ...@@ -4640,6 +4640,9 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
{ {
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
if (!atomic_read(&hctx->elevator_queued))
return false;
/* /*
* Avoiding lock: a race on bfqd->busy_queues should cause at * Avoiding lock: a race on bfqd->busy_queues should cause at
* most a call to dispatch for nothing * most a call to dispatch for nothing
...@@ -5554,6 +5557,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, ...@@ -5554,6 +5557,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
rq = list_first_entry(list, struct request, queuelist); rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist); list_del_init(&rq->queuelist);
bfq_insert_request(hctx, rq, at_head); bfq_insert_request(hctx, rq, at_head);
atomic_inc(&hctx->elevator_queued);
} }
} }
...@@ -5921,6 +5925,7 @@ static void bfq_finish_requeue_request(struct request *rq) ...@@ -5921,6 +5925,7 @@ static void bfq_finish_requeue_request(struct request *rq)
bfq_completed_request(bfqq, bfqd); bfq_completed_request(bfqq, bfqd);
bfq_finish_requeue_request_body(bfqq); bfq_finish_requeue_request_body(bfqq);
atomic_dec(&rq->mq_hctx->elevator_queued);
spin_unlock_irqrestore(&bfqd->lock, flags); spin_unlock_irqrestore(&bfqd->lock, flags);
} else { } else {
...@@ -6360,8 +6365,8 @@ static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) ...@@ -6360,8 +6365,8 @@ static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
struct blk_mq_tags *tags = hctx->sched_tags; struct blk_mq_tags *tags = hctx->sched_tags;
unsigned int min_shallow; unsigned int min_shallow;
min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags);
sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow);
} }
static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
......
...@@ -119,6 +119,8 @@ static void blkg_async_bio_workfn(struct work_struct *work) ...@@ -119,6 +119,8 @@ static void blkg_async_bio_workfn(struct work_struct *work)
async_bio_work); async_bio_work);
struct bio_list bios = BIO_EMPTY_LIST; struct bio_list bios = BIO_EMPTY_LIST;
struct bio *bio; struct bio *bio;
struct blk_plug plug;
bool need_plug = false;
/* as long as there are pending bios, @blkg can't go away */ /* as long as there are pending bios, @blkg can't go away */
spin_lock_bh(&blkg->async_bio_lock); spin_lock_bh(&blkg->async_bio_lock);
...@@ -126,8 +128,15 @@ static void blkg_async_bio_workfn(struct work_struct *work) ...@@ -126,8 +128,15 @@ static void blkg_async_bio_workfn(struct work_struct *work)
bio_list_init(&blkg->async_bios); bio_list_init(&blkg->async_bios);
spin_unlock_bh(&blkg->async_bio_lock); spin_unlock_bh(&blkg->async_bio_lock);
/* start plug only when bio_list contains at least 2 bios */
if (bios.head && bios.head->bi_next) {
need_plug = true;
blk_start_plug(&plug);
}
while ((bio = bio_list_pop(&bios))) while ((bio = bio_list_pop(&bios)))
submit_bio(bio); submit_bio(bio);
if (need_plug)
blk_finish_plug(&plug);
} }
/** /**
...@@ -1613,16 +1622,24 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) ...@@ -1613,16 +1622,24 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{ {
unsigned long pflags; unsigned long pflags;
bool clamp;
u64 now = ktime_to_ns(ktime_get()); u64 now = ktime_to_ns(ktime_get());
u64 exp; u64 exp;
u64 delay_nsec = 0; u64 delay_nsec = 0;
int tok; int tok;
while (blkg->parent) { while (blkg->parent) {
if (atomic_read(&blkg->use_delay)) { int use_delay = atomic_read(&blkg->use_delay);
if (use_delay) {
u64 this_delay;
blkcg_scale_delay(blkg, now); blkcg_scale_delay(blkg, now);
delay_nsec = max_t(u64, delay_nsec, this_delay = atomic64_read(&blkg->delay_nsec);
atomic64_read(&blkg->delay_nsec)); if (this_delay > delay_nsec) {
delay_nsec = this_delay;
clamp = use_delay > 0;
}
} }
blkg = blkg->parent; blkg = blkg->parent;
} }
...@@ -1634,10 +1651,13 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) ...@@ -1634,10 +1651,13 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
* Let's not sleep for all eternity if we've amassed a huge delay. * Let's not sleep for all eternity if we've amassed a huge delay.
* Swapping or metadata IO can accumulate 10's of seconds worth of * Swapping or metadata IO can accumulate 10's of seconds worth of
* delay, and we want userspace to be able to do _something_ so cap the * delay, and we want userspace to be able to do _something_ so cap the
* delays at 1 second. If there's 10's of seconds worth of delay then * delays at 0.25s. If there's 10's of seconds worth of delay then the
* the tasks will be delayed for 1 second for every syscall. * tasks will be delayed for 0.25 second for every syscall. If
* blkcg_set_delay() was used as indicated by negative use_delay, the
* caller is responsible for regulating the range.
*/ */
delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); if (clamp)
delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
if (use_memdelay) if (use_memdelay)
psi_memstall_enter(&pflags); psi_memstall_enter(&pflags);
......
...@@ -116,8 +116,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq) ...@@ -116,8 +116,8 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->__sector = (sector_t) -1; rq->__sector = (sector_t) -1;
INIT_HLIST_NODE(&rq->hash); INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node); RB_CLEAR_NODE(&rq->rb_node);
rq->tag = -1; rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = -1; rq->internal_tag = BLK_MQ_NO_TAG;
rq->start_time_ns = ktime_get_ns(); rq->start_time_ns = ktime_get_ns();
rq->part = NULL; rq->part = NULL;
refcount_set(&rq->ref, 1); refcount_set(&rq->ref, 1);
...@@ -538,11 +538,10 @@ struct request_queue *blk_alloc_queue(int node_id) ...@@ -538,11 +538,10 @@ struct request_queue *blk_alloc_queue(int node_id)
if (!q->stats) if (!q->stats)
goto fail_stats; goto fail_stats;
q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES;
q->backing_dev_info->io_pages = VM_READAHEAD_PAGES;
q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
q->node = node_id; q->node = node_id;
atomic_set(&q->nr_active_requests_shared_sbitmap, 0);
timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
laptop_mode_timer_fn, 0); laptop_mode_timer_fn, 0);
timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
...@@ -643,162 +642,6 @@ void blk_put_request(struct request *req) ...@@ -643,162 +642,6 @@ void blk_put_request(struct request *req)
} }
EXPORT_SYMBOL(blk_put_request); EXPORT_SYMBOL(blk_put_request);
static void blk_account_io_merge_bio(struct request *req)
{
if (!blk_do_io_stat(req))
return;
part_stat_lock();
part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
part_stat_unlock();
}
bool bio_attempt_back_merge(struct request *req, struct bio *bio,
unsigned int nr_segs)
{
const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
if (!ll_back_merge_fn(req, bio, nr_segs))
return false;
trace_block_bio_backmerge(req->q, req, bio);
rq_qos_merge(req->q, req, bio);
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
blk_rq_set_mixed_merge(req);
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
bio_crypt_free_ctx(bio);
blk_account_io_merge_bio(req);
return true;
}
bool bio_attempt_front_merge(struct request *req, struct bio *bio,
unsigned int nr_segs)
{
const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
if (!ll_front_merge_fn(req, bio, nr_segs))
return false;
trace_block_bio_frontmerge(req->q, req, bio);
rq_qos_merge(req->q, req, bio);
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
blk_rq_set_mixed_merge(req);
bio->bi_next = req->bio;
req->bio = bio;
req->__sector = bio->bi_iter.bi_sector;
req->__data_len += bio->bi_iter.bi_size;
bio_crypt_do_front_merge(req, bio);
blk_account_io_merge_bio(req);
return true;
}
bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
struct bio *bio)
{
unsigned short segments = blk_rq_nr_discard_segments(req);
if (segments >= queue_max_discard_segments(q))
goto no_merge;
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req, blk_rq_pos(req)))
goto no_merge;
rq_qos_merge(q, req, bio);
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
req->nr_phys_segments = segments + 1;
blk_account_io_merge_bio(req);
return true;
no_merge:
req_set_nomerge(q, req);
return false;
}
/**
* blk_attempt_plug_merge - try to merge with %current's plugged list
* @q: request_queue new bio is being queued at
* @bio: new bio being queued
* @nr_segs: number of segments in @bio
* @same_queue_rq: pointer to &struct request that gets filled in when
* another request associated with @q is found on the plug list
* (optional, may be %NULL)
*
* Determine whether @bio being queued on @q can be merged with a request
* on %current's plugged list. Returns %true if merge was successful,
* otherwise %false.
*
* Plugging coalesces IOs from the same issuer for the same purpose without
* going through @q->queue_lock. As such it's more of an issuing mechanism
* than scheduling, and the request, while may have elvpriv data, is not
* added on the elevator at this point. In addition, we don't have
* reliable access to the elevator outside queue lock. Only check basic
* merging parameters without querying the elevator.
*
* Caller must ensure !blk_queue_nomerges(q) beforehand.
*/
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **same_queue_rq)
{
struct blk_plug *plug;
struct request *rq;
struct list_head *plug_list;
plug = blk_mq_plug(q, bio);
if (!plug)
return false;
plug_list = &plug->mq_list;
list_for_each_entry_reverse(rq, plug_list, queuelist) {
bool merged = false;
if (rq->q == q && same_queue_rq) {
/*
* Only blk-mq multiple hardware queues case checks the
* rq in the same queue, there should be only one such
* rq in a queue
**/
*same_queue_rq = rq;
}
if (rq->q != q || !blk_rq_merge_ok(rq, bio))
continue;
switch (blk_try_merge(rq, bio)) {
case ELEVATOR_BACK_MERGE:
merged = bio_attempt_back_merge(rq, bio, nr_segs);
break;
case ELEVATOR_FRONT_MERGE:
merged = bio_attempt_front_merge(rq, bio, nr_segs);
break;
case ELEVATOR_DISCARD_MERGE:
merged = bio_attempt_discard_merge(q, rq, bio);
break;
default:
break;
}
if (merged)
return true;
}
return false;
}
static void handle_bad_sector(struct bio *bio, sector_t maxsector) static void handle_bad_sector(struct bio *bio, sector_t maxsector)
{ {
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
...@@ -1301,14 +1144,28 @@ EXPORT_SYMBOL(submit_bio); ...@@ -1301,14 +1144,28 @@ EXPORT_SYMBOL(submit_bio);
* limits when retrying requests on other queues. Those requests need * limits when retrying requests on other queues. Those requests need
* to be checked against the new queue limits again during dispatch. * to be checked against the new queue limits again during dispatch.
*/ */
static int blk_cloned_rq_check_limits(struct request_queue *q, static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q,
struct request *rq) struct request *rq)
{ {
if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, req_op(rq))) { unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
if (blk_rq_sectors(rq) > max_sectors) {
/*
* SCSI device does not have a good way to return if
* Write Same/Zero is actually supported. If a device rejects
* a non-read/write command (discard, write same, etc.) the
* low-level device driver will set the relevant queue limit to
* 0 to prevent blk-lib from issuing more of the offending
* operations. Commands queued prior to the queue limit being
* reset need to be completed with BLK_STS_NOTSUPP to avoid I/O
* errors being propagated to upper layers.
*/
if (max_sectors == 0)
return BLK_STS_NOTSUPP;
printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
__func__, blk_rq_sectors(rq), __func__, blk_rq_sectors(rq), max_sectors);
blk_queue_get_max_sectors(q, req_op(rq))); return BLK_STS_IOERR;
return -EIO;
} }
/* /*
...@@ -1321,10 +1178,10 @@ static int blk_cloned_rq_check_limits(struct request_queue *q, ...@@ -1321,10 +1178,10 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
if (rq->nr_phys_segments > queue_max_segments(q)) { if (rq->nr_phys_segments > queue_max_segments(q)) {
printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n", printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
__func__, rq->nr_phys_segments, queue_max_segments(q)); __func__, rq->nr_phys_segments, queue_max_segments(q));
return -EIO; return BLK_STS_IOERR;
} }
return 0; return BLK_STS_OK;
} }
/** /**
...@@ -1334,8 +1191,11 @@ static int blk_cloned_rq_check_limits(struct request_queue *q, ...@@ -1334,8 +1191,11 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
*/ */
blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
{ {
if (blk_cloned_rq_check_limits(q, rq)) blk_status_t ret;
return BLK_STS_IOERR;
ret = blk_cloned_rq_check_limits(q, rq);
if (ret != BLK_STS_OK)
return ret;
if (rq->rq_disk && if (rq->rq_disk &&
should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
...@@ -1461,10 +1321,9 @@ void blk_account_io_start(struct request *rq) ...@@ -1461,10 +1321,9 @@ void blk_account_io_start(struct request *rq)
part_stat_unlock(); part_stat_unlock();
} }
unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, static unsigned long __part_start_io_acct(struct hd_struct *part,
unsigned int op) unsigned int sectors, unsigned int op)
{ {
struct hd_struct *part = &disk->part0;
const int sgrp = op_stat_group(op); const int sgrp = op_stat_group(op);
unsigned long now = READ_ONCE(jiffies); unsigned long now = READ_ONCE(jiffies);
...@@ -1477,12 +1336,26 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, ...@@ -1477,12 +1336,26 @@ unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
return now; return now;
} }
unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part,
struct bio *bio)
{
*part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector);
return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio));
}
EXPORT_SYMBOL_GPL(part_start_io_acct);
unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
unsigned int op)
{
return __part_start_io_acct(&disk->part0, sectors, op);
}
EXPORT_SYMBOL(disk_start_io_acct); EXPORT_SYMBOL(disk_start_io_acct);
void disk_end_io_acct(struct gendisk *disk, unsigned int op, static void __part_end_io_acct(struct hd_struct *part, unsigned int op,
unsigned long start_time) unsigned long start_time)
{ {
struct hd_struct *part = &disk->part0;
const int sgrp = op_stat_group(op); const int sgrp = op_stat_group(op);
unsigned long now = READ_ONCE(jiffies); unsigned long now = READ_ONCE(jiffies);
unsigned long duration = now - start_time; unsigned long duration = now - start_time;
...@@ -1493,6 +1366,20 @@ void disk_end_io_acct(struct gendisk *disk, unsigned int op, ...@@ -1493,6 +1366,20 @@ void disk_end_io_acct(struct gendisk *disk, unsigned int op,
part_stat_local_dec(part, in_flight[op_is_write(op)]); part_stat_local_dec(part, in_flight[op_is_write(op)]);
part_stat_unlock(); part_stat_unlock();
} }
void part_end_io_acct(struct hd_struct *part, struct bio *bio,
unsigned long start_time)
{
__part_end_io_acct(part, bio_op(bio), start_time);
hd_struct_put(part);
}
EXPORT_SYMBOL_GPL(part_end_io_acct);
void disk_end_io_acct(struct gendisk *disk, unsigned int op,
unsigned long start_time)
{
__part_end_io_acct(&disk->part0, op, start_time);
}
EXPORT_SYMBOL(disk_end_io_acct); EXPORT_SYMBOL(disk_end_io_acct);
/* /*
......
...@@ -408,7 +408,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template ...@@ -408,7 +408,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
bi->tuple_size = template->tuple_size; bi->tuple_size = template->tuple_size;
bi->tag_size = template->tag_size; bi->tag_size = template->tag_size;
disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
#ifdef CONFIG_BLK_INLINE_ENCRYPTION #ifdef CONFIG_BLK_INLINE_ENCRYPTION
if (disk->queue->ksm) { if (disk->queue->ksm) {
...@@ -428,7 +428,7 @@ EXPORT_SYMBOL(blk_integrity_register); ...@@ -428,7 +428,7 @@ EXPORT_SYMBOL(blk_integrity_register);
*/ */
void blk_integrity_unregister(struct gendisk *disk) void blk_integrity_unregister(struct gendisk *disk)
{ {
disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue);
memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity)); memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity));
} }
EXPORT_SYMBOL(blk_integrity_unregister); EXPORT_SYMBOL(blk_integrity_unregister);
......
This diff is collapsed.
...@@ -12,7 +12,8 @@ ...@@ -12,7 +12,8 @@
#include "blk.h" #include "blk.h"
struct bio_map_data { struct bio_map_data {
int is_our_pages; bool is_our_pages : 1;
bool is_null_mapped : 1;
struct iov_iter iter; struct iov_iter iter;
struct iovec iov[]; struct iovec iov[];
}; };
...@@ -108,7 +109,7 @@ static int bio_uncopy_user(struct bio *bio) ...@@ -108,7 +109,7 @@ static int bio_uncopy_user(struct bio *bio)
struct bio_map_data *bmd = bio->bi_private; struct bio_map_data *bmd = bio->bi_private;
int ret = 0; int ret = 0;
if (!bio_flagged(bio, BIO_NULL_MAPPED)) { if (!bmd->is_null_mapped) {
/* /*
* if we're in a workqueue, the request is orphaned, so * if we're in a workqueue, the request is orphaned, so
* don't copy into a random user address space, just free * don't copy into a random user address space, just free
...@@ -126,24 +127,12 @@ static int bio_uncopy_user(struct bio *bio) ...@@ -126,24 +127,12 @@ static int bio_uncopy_user(struct bio *bio)
return ret; return ret;
} }
/** static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
* bio_copy_user_iov - copy user data to bio struct iov_iter *iter, gfp_t gfp_mask)
* @q: destination block queue
* @map_data: pointer to the rq_map_data holding pages (if necessary)
* @iter: iovec iterator
* @gfp_mask: memory allocation flags
*
* Prepares and returns a bio for indirect user io, bouncing data
* to/from kernel pages as necessary. Must be paired with
* call bio_uncopy_user() on io completion.
*/
static struct bio *bio_copy_user_iov(struct request_queue *q,
struct rq_map_data *map_data, struct iov_iter *iter,
gfp_t gfp_mask)
{ {
struct bio_map_data *bmd; struct bio_map_data *bmd;
struct page *page; struct page *page;
struct bio *bio; struct bio *bio, *bounce_bio;
int i = 0, ret; int i = 0, ret;
int nr_pages; int nr_pages;
unsigned int len = iter->count; unsigned int len = iter->count;
...@@ -151,14 +140,15 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, ...@@ -151,14 +140,15 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
bmd = bio_alloc_map_data(iter, gfp_mask); bmd = bio_alloc_map_data(iter, gfp_mask);
if (!bmd) if (!bmd)
return ERR_PTR(-ENOMEM); return -ENOMEM;
/* /*
* We need to do a deep copy of the iov_iter including the iovecs. * We need to do a deep copy of the iov_iter including the iovecs.
* The caller provided iov might point to an on-stack or otherwise * The caller provided iov might point to an on-stack or otherwise
* shortlived one. * shortlived one.
*/ */
bmd->is_our_pages = map_data ? 0 : 1; bmd->is_our_pages = !map_data;
bmd->is_null_mapped = (map_data && map_data->null_mapped);
nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
if (nr_pages > BIO_MAX_PAGES) if (nr_pages > BIO_MAX_PAGES)
...@@ -168,8 +158,7 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, ...@@ -168,8 +158,7 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
bio = bio_kmalloc(gfp_mask, nr_pages); bio = bio_kmalloc(gfp_mask, nr_pages);
if (!bio) if (!bio)
goto out_bmd; goto out_bmd;
bio->bi_opf |= req_op(rq);
ret = 0;
if (map_data) { if (map_data) {
nr_pages = 1 << map_data->page_order; nr_pages = 1 << map_data->page_order;
...@@ -186,7 +175,7 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, ...@@ -186,7 +175,7 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
if (map_data) { if (map_data) {
if (i == map_data->nr_entries * nr_pages) { if (i == map_data->nr_entries * nr_pages) {
ret = -ENOMEM; ret = -ENOMEM;
break; goto cleanup;
} }
page = map_data->pages[i / nr_pages]; page = map_data->pages[i / nr_pages];
...@@ -194,14 +183,14 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, ...@@ -194,14 +183,14 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
i++; i++;
} else { } else {
page = alloc_page(q->bounce_gfp | gfp_mask); page = alloc_page(rq->q->bounce_gfp | gfp_mask);
if (!page) { if (!page) {
ret = -ENOMEM; ret = -ENOMEM;
break; goto cleanup;
} }
} }
if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { if (bio_add_pc_page(rq->q, bio, page, bytes, offset) < bytes) {
if (!map_data) if (!map_data)
__free_page(page); __free_page(page);
break; break;
...@@ -211,9 +200,6 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, ...@@ -211,9 +200,6 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
offset = 0; offset = 0;
} }
if (ret)
goto cleanup;
if (map_data) if (map_data)
map_data->offset += bio->bi_iter.bi_size; map_data->offset += bio->bi_iter.bi_size;
...@@ -233,41 +219,42 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, ...@@ -233,41 +219,42 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
} }
bio->bi_private = bmd; bio->bi_private = bmd;
if (map_data && map_data->null_mapped)
bio_set_flag(bio, BIO_NULL_MAPPED); bounce_bio = bio;
return bio; ret = blk_rq_append_bio(rq, &bounce_bio);
if (ret)
goto cleanup;
/*
* We link the bounce buffer in and could have to traverse it later, so
* we have to get a ref to prevent it from being freed
*/
bio_get(bounce_bio);
return 0;
cleanup: cleanup:
if (!map_data) if (!map_data)
bio_free_pages(bio); bio_free_pages(bio);
bio_put(bio); bio_put(bio);
out_bmd: out_bmd:
kfree(bmd); kfree(bmd);
return ERR_PTR(ret); return ret;
} }
/** static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
* bio_map_user_iov - map user iovec into bio gfp_t gfp_mask)
* @q: the struct request_queue for the bio
* @iter: iovec iterator
* @gfp_mask: memory allocation flags
*
* Map the user space address into a bio suitable for io to a block
* device. Returns an error pointer in case of error.
*/
static struct bio *bio_map_user_iov(struct request_queue *q,
struct iov_iter *iter, gfp_t gfp_mask)
{ {
unsigned int max_sectors = queue_max_hw_sectors(q); unsigned int max_sectors = queue_max_hw_sectors(rq->q);
int j; struct bio *bio, *bounce_bio;
struct bio *bio;
int ret; int ret;
int j;
if (!iov_iter_count(iter)) if (!iov_iter_count(iter))
return ERR_PTR(-EINVAL); return -EINVAL;
bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES));
if (!bio) if (!bio)
return ERR_PTR(-ENOMEM); return -ENOMEM;
bio->bi_opf |= req_op(rq);
while (iov_iter_count(iter)) { while (iov_iter_count(iter)) {
struct page **pages; struct page **pages;
...@@ -283,7 +270,7 @@ static struct bio *bio_map_user_iov(struct request_queue *q, ...@@ -283,7 +270,7 @@ static struct bio *bio_map_user_iov(struct request_queue *q,
npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE);
if (unlikely(offs & queue_dma_alignment(q))) { if (unlikely(offs & queue_dma_alignment(rq->q))) {
ret = -EINVAL; ret = -EINVAL;
j = 0; j = 0;
} else { } else {
...@@ -295,7 +282,7 @@ static struct bio *bio_map_user_iov(struct request_queue *q, ...@@ -295,7 +282,7 @@ static struct bio *bio_map_user_iov(struct request_queue *q,
if (n > bytes) if (n > bytes)
n = bytes; n = bytes;
if (!bio_add_hw_page(q, bio, page, n, offs, if (!bio_add_hw_page(rq->q, bio, page, n, offs,
max_sectors, &same_page)) { max_sectors, &same_page)) {
if (same_page) if (same_page)
put_page(page); put_page(page);
...@@ -319,21 +306,31 @@ static struct bio *bio_map_user_iov(struct request_queue *q, ...@@ -319,21 +306,31 @@ static struct bio *bio_map_user_iov(struct request_queue *q,
break; break;
} }
bio_set_flag(bio, BIO_USER_MAPPED);
/* /*
* subtle -- if bio_map_user_iov() ended up bouncing a bio, * Subtle: if we end up needing to bounce a bio, it would normally
* it would normally disappear when its bi_end_io is run. * disappear when its bi_end_io is run. However, we need the original
* however, we need it for the unmap, so grab an extra * bio for the unmap, so grab an extra reference to it
* reference to it
*/ */
bio_get(bio); bio_get(bio);
return bio;
bounce_bio = bio;
ret = blk_rq_append_bio(rq, &bounce_bio);
if (ret)
goto out_put_orig;
/*
* We link the bounce buffer in and could have to traverse it
* later, so we have to get a ref to prevent it from being freed
*/
bio_get(bounce_bio);
return 0;
out_put_orig:
bio_put(bio);
out_unmap: out_unmap:
bio_release_pages(bio, false); bio_release_pages(bio, false);
bio_put(bio); bio_put(bio);
return ERR_PTR(ret); return ret;
} }
/** /**
...@@ -557,55 +554,6 @@ int blk_rq_append_bio(struct request *rq, struct bio **bio) ...@@ -557,55 +554,6 @@ int blk_rq_append_bio(struct request *rq, struct bio **bio)
} }
EXPORT_SYMBOL(blk_rq_append_bio); EXPORT_SYMBOL(blk_rq_append_bio);
static int __blk_rq_unmap_user(struct bio *bio)
{
int ret = 0;
if (bio) {
if (bio_flagged(bio, BIO_USER_MAPPED))
bio_unmap_user(bio);
else
ret = bio_uncopy_user(bio);
}
return ret;
}
static int __blk_rq_map_user_iov(struct request *rq,
struct rq_map_data *map_data, struct iov_iter *iter,
gfp_t gfp_mask, bool copy)
{
struct request_queue *q = rq->q;
struct bio *bio, *orig_bio;
int ret;
if (copy)
bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
else
bio = bio_map_user_iov(q, iter, gfp_mask);
if (IS_ERR(bio))
return PTR_ERR(bio);
bio->bi_opf &= ~REQ_OP_MASK;
bio->bi_opf |= req_op(rq);
orig_bio = bio;
/*
* We link the bounce buffer in and could have to traverse it
* later so we have to get a ref to prevent it from being freed
*/
ret = blk_rq_append_bio(rq, &bio);
if (ret) {
__blk_rq_unmap_user(orig_bio);
return ret;
}
bio_get(bio);
return 0;
}
/** /**
* blk_rq_map_user_iov - map user data to a request, for passthrough requests * blk_rq_map_user_iov - map user data to a request, for passthrough requests
* @q: request queue where request should be inserted * @q: request queue where request should be inserted
...@@ -649,7 +597,10 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, ...@@ -649,7 +597,10 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
i = *iter; i = *iter;
do { do {
ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy); if (copy)
ret = bio_copy_user_iov(rq, map_data, &i, gfp_mask);
else
ret = bio_map_user_iov(rq, &i, gfp_mask);
if (ret) if (ret)
goto unmap_rq; goto unmap_rq;
if (!bio) if (!bio)
...@@ -700,9 +651,13 @@ int blk_rq_unmap_user(struct bio *bio) ...@@ -700,9 +651,13 @@ int blk_rq_unmap_user(struct bio *bio)
if (unlikely(bio_flagged(bio, BIO_BOUNCED))) if (unlikely(bio_flagged(bio, BIO_BOUNCED)))
mapped_bio = bio->bi_private; mapped_bio = bio->bi_private;
ret2 = __blk_rq_unmap_user(mapped_bio); if (bio->bi_private) {
if (ret2 && !ret) ret2 = bio_uncopy_user(mapped_bio);
ret = ret2; if (ret2 && !ret)
ret = ret2;
} else {
bio_unmap_user(mapped_bio);
}
mapped_bio = bio; mapped_bio = bio;
bio = bio->bi_next; bio = bio->bi_next;
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include <trace/events/block.h> #include <trace/events/block.h>
#include "blk.h" #include "blk.h"
#include "blk-rq-qos.h"
static inline bool bio_will_gap(struct request_queue *q, static inline bool bio_will_gap(struct request_queue *q,
struct request *prev_rq, struct bio *prev, struct bio *next) struct request *prev_rq, struct bio *prev, struct bio *next)
...@@ -895,3 +896,203 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio) ...@@ -895,3 +896,203 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
return ELEVATOR_FRONT_MERGE; return ELEVATOR_FRONT_MERGE;
return ELEVATOR_NO_MERGE; return ELEVATOR_NO_MERGE;
} }
static void blk_account_io_merge_bio(struct request *req)
{
if (!blk_do_io_stat(req))
return;
part_stat_lock();
part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
part_stat_unlock();
}
enum bio_merge_status bio_attempt_back_merge(struct request *req,
struct bio *bio,
unsigned int nr_segs)
{
const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
if (!ll_back_merge_fn(req, bio, nr_segs))
return BIO_MERGE_FAILED;
trace_block_bio_backmerge(req->q, req, bio);
rq_qos_merge(req->q, req, bio);
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
blk_rq_set_mixed_merge(req);
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
bio_crypt_free_ctx(bio);
blk_account_io_merge_bio(req);
return BIO_MERGE_OK;
}
/*
 * Try to prepend @bio to the head of @req.
 *
 * @req:     request to extend
 * @bio:     bio to link in front of req->bio
 * @nr_segs: segment count of @bio, passed to ll_front_merge_fn()
 *
 * Returns BIO_MERGE_FAILED when ll_front_merge_fn() rejects the merge, and
 * BIO_MERGE_OK once @bio has been linked in and accounted.
 */
enum bio_merge_status bio_attempt_front_merge(struct request *req,
struct bio *bio,
unsigned int nr_segs)
{
/* failfast bits carried by the incoming bio */
const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
if (!ll_front_merge_fn(req, bio, nr_segs))
return BIO_MERGE_FAILED;
trace_block_bio_frontmerge(req->q, req, bio);
rq_qos_merge(req->q, req, bio);
/* request and bio disagree on failfast bits: flag the request as mixed */
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
blk_rq_set_mixed_merge(req);
/* the bio becomes the new head, so the request now starts at its sector */
bio->bi_next = req->bio;
req->bio = bio;
req->__sector = bio->bi_iter.bi_sector;
req->__data_len += bio->bi_iter.bi_size;
bio_crypt_do_front_merge(req, bio);
blk_account_io_merge_bio(req);
return BIO_MERGE_OK;
}
/*
 * Try to append the discard @bio to @req as one more discard segment.
 *
 * @q:   queue whose discard segment limit is checked
 * @req: request to extend
 * @bio: bio to link after req->biotail
 *
 * Returns BIO_MERGE_OK on success. When either limit check below fails,
 * req_set_nomerge() is called for @req and BIO_MERGE_FAILED is returned.
 */
enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
struct request *req,
struct bio *bio)
{
unsigned short segments = blk_rq_nr_discard_segments(req);
/* request already holds the maximum number of discard segments */
if (segments >= queue_max_discard_segments(q))
goto no_merge;
/* combined size would exceed the sector limit for this request */
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req, blk_rq_pos(req)))
goto no_merge;
rq_qos_merge(q, req, bio);
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
req->nr_phys_segments = segments + 1;
blk_account_io_merge_bio(req);
return BIO_MERGE_OK;
no_merge:
req_set_nomerge(q, req);
return BIO_MERGE_FAILED;
}
/*
 * Attempt to merge @bio into @rq.
 *
 * @q:                 queue both belong to
 * @rq:                candidate request
 * @bio:               bio to merge
 * @nr_segs:           segment count of @bio
 * @sched_allow_merge: when true, blk_mq_sched_allow_merge() must also agree
 *                     before a back or front merge is tried
 *
 * Returns BIO_MERGE_NONE when @rq is not a merge candidate for @bio,
 * BIO_MERGE_FAILED when blk_mq_sched_allow_merge() vetoes a back/front
 * merge, and otherwise the result of the matching bio_attempt_*_merge()
 * helper.
 */
static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
						   struct request *rq,
						   struct bio *bio,
						   unsigned int nr_segs,
						   bool sched_allow_merge)
{
	enum elv_merge how;

	if (!blk_rq_merge_ok(rq, bio))
		return BIO_MERGE_NONE;

	how = blk_try_merge(rq, bio);

	/* Discard merges never consult blk_mq_sched_allow_merge(). */
	if (how == ELEVATOR_DISCARD_MERGE)
		return bio_attempt_discard_merge(q, rq, bio);

	if (how != ELEVATOR_BACK_MERGE && how != ELEVATOR_FRONT_MERGE)
		return BIO_MERGE_NONE;

	if (sched_allow_merge && !blk_mq_sched_allow_merge(q, rq, bio))
		return BIO_MERGE_FAILED;

	if (how == ELEVATOR_BACK_MERGE)
		return bio_attempt_back_merge(rq, bio, nr_segs);

	return bio_attempt_front_merge(rq, bio, nr_segs);
}
/**
 * blk_attempt_plug_merge - try to merge with %current's plugged list
 * @q: request_queue new bio is being queued at
 * @bio: new bio being queued
 * @nr_segs: number of segments in @bio
 * @same_queue_rq: optional out-pointer (may be %NULL); updated with every
 *                 request for @q that the walk visits, so it ends up
 *                 pointing at the last such request examined
 *
 * Walk the plug list obtained from blk_mq_plug() from tail to head and try
 * to merge @bio into each request queued for @q. Returns %true as soon as
 * one merge succeeds, %false if there is no plug or nothing merged.
 *
 * Plugging coalesces IOs from the same issuer for the same purpose without
 * going through @q->queue_lock, which makes it an issuing mechanism rather
 * than a scheduling one. A plugged request may carry elvpriv data but has
 * not been added to the elevator yet, and the elevator cannot be accessed
 * reliably outside the queue lock. For that reason only the basic merge
 * parameters are checked here and the elevator is not queried.
 *
 * Caller must ensure !blk_queue_nomerges(q) beforehand.
 */
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
		unsigned int nr_segs, struct request **same_queue_rq)
{
	struct blk_plug *plug = blk_mq_plug(q, bio);
	struct request *rq;

	if (!plug)
		return false;

	list_for_each_entry_reverse(rq, &plug->mq_list, queuelist) {
		/* Requests plugged for other queues are of no interest. */
		if (rq->q != q)
			continue;

		/*
		 * Only the blk-mq multiple hardware queues case looks at
		 * the request in the same queue; there should be only one
		 * such request in a queue.
		 */
		if (same_queue_rq)
			*same_queue_rq = rq;

		if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
		    BIO_MERGE_OK)
			return true;
	}

	return false;
}
/*
 * Scan @list from its tail, looking at no more than 8 requests, and try to
 * merge @bio into one of them.
 *
 * Returns true when a merge succeeded. Returns false when a merge attempt
 * failed outright, when the 8-request budget ran out, or when the list held
 * no candidate.
 */
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
			struct bio *bio, unsigned int nr_segs)
{
	struct request *rq;
	int budget = 8;

	list_for_each_entry_reverse(rq, list, queuelist) {
		enum bio_merge_status status;

		if (budget-- == 0)
			break;

		status = blk_attempt_bio_merge(q, rq, bio, nr_segs, true);
		if (status == BIO_MERGE_OK)
			return true;
		if (status == BIO_MERGE_FAILED)
			return false;
		/* BIO_MERGE_NONE: not a candidate, look at the next one */
	}

	return false;
}
EXPORT_SYMBOL_GPL(blk_bio_list_merge);
...@@ -116,6 +116,7 @@ static const char *const blk_queue_flag_name[] = { ...@@ -116,6 +116,7 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(SAME_FORCE),
QUEUE_FLAG_NAME(DEAD), QUEUE_FLAG_NAME(DEAD),
QUEUE_FLAG_NAME(INIT_DONE), QUEUE_FLAG_NAME(INIT_DONE),
QUEUE_FLAG_NAME(STABLE_WRITES),
QUEUE_FLAG_NAME(POLL), QUEUE_FLAG_NAME(POLL),
QUEUE_FLAG_NAME(WC), QUEUE_FLAG_NAME(WC),
QUEUE_FLAG_NAME(FUA), QUEUE_FLAG_NAME(FUA),
...@@ -240,7 +241,7 @@ static const char *const alloc_policy_name[] = { ...@@ -240,7 +241,7 @@ static const char *const alloc_policy_name[] = {
#define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name #define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name
static const char *const hctx_flag_name[] = { static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(SHOULD_MERGE), HCTX_FLAG_NAME(SHOULD_MERGE),
HCTX_FLAG_NAME(TAG_SHARED), HCTX_FLAG_NAME(TAG_QUEUE_SHARED),
HCTX_FLAG_NAME(BLOCKING), HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED), HCTX_FLAG_NAME(NO_SCHED),
HCTX_FLAG_NAME(STACKING), HCTX_FLAG_NAME(STACKING),
...@@ -452,11 +453,11 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m, ...@@ -452,11 +453,11 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
atomic_read(&tags->active_queues)); atomic_read(&tags->active_queues));
seq_puts(m, "\nbitmap_tags:\n"); seq_puts(m, "\nbitmap_tags:\n");
sbitmap_queue_show(&tags->bitmap_tags, m); sbitmap_queue_show(tags->bitmap_tags, m);
if (tags->nr_reserved_tags) { if (tags->nr_reserved_tags) {
seq_puts(m, "\nbreserved_tags:\n"); seq_puts(m, "\nbreserved_tags:\n");
sbitmap_queue_show(&tags->breserved_tags, m); sbitmap_queue_show(tags->breserved_tags, m);
} }
} }
...@@ -487,7 +488,7 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m) ...@@ -487,7 +488,7 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
if (res) if (res)
goto out; goto out;
if (hctx->tags) if (hctx->tags)
sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m);
mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_lock);
out: out:
...@@ -521,7 +522,7 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) ...@@ -521,7 +522,7 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
if (res) if (res)
goto out; goto out;
if (hctx->sched_tags) if (hctx->sched_tags)
sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m);
mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_lock);
out: out:
......
...@@ -18,21 +18,6 @@ ...@@ -18,21 +18,6 @@
#include "blk-mq-tag.h" #include "blk-mq-tag.h"
#include "blk-wbt.h" #include "blk-wbt.h"
/*
 * Release the scheduler data attached to every hardware context of @q.
 *
 * @q:    queue whose hardware contexts are walked
 * @exit: optional callback; invoked for a hardware context before its
 *        sched_data is freed, but only when that sched_data is non-NULL
 *
 * Each context's sched_data is passed to kfree() and then reset to NULL.
 */
void blk_mq_sched_free_hctx_data(struct request_queue *q,
				 void (*exit)(struct blk_mq_hw_ctx *))
{
	struct blk_mq_hw_ctx *hw;
	int idx;

	queue_for_each_hw_ctx(q, hw, idx) {
		if (exit) {
			if (hw->sched_data)
				exit(hw);
		}
		kfree(hw->sched_data);
		hw->sched_data = NULL;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
void blk_mq_sched_assign_ioc(struct request *rq) void blk_mq_sched_assign_ioc(struct request *rq)
{ {
struct request_queue *q = rq->q; struct request_queue *q = rq->q;
...@@ -368,7 +353,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, ...@@ -368,7 +353,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
case ELEVATOR_BACK_MERGE: case ELEVATOR_BACK_MERGE:
if (!blk_mq_sched_allow_merge(q, rq, bio)) if (!blk_mq_sched_allow_merge(q, rq, bio))
return false; return false;
if (!bio_attempt_back_merge(rq, bio, nr_segs)) if (bio_attempt_back_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
return false; return false;
*merged_request = attempt_back_merge(q, rq); *merged_request = attempt_back_merge(q, rq);
if (!*merged_request) if (!*merged_request)
...@@ -377,86 +362,20 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, ...@@ -377,86 +362,20 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
case ELEVATOR_FRONT_MERGE: case ELEVATOR_FRONT_MERGE:
if (!blk_mq_sched_allow_merge(q, rq, bio)) if (!blk_mq_sched_allow_merge(q, rq, bio))
return false; return false;
if (!bio_attempt_front_merge(rq, bio, nr_segs)) if (bio_attempt_front_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
return false; return false;
*merged_request = attempt_front_merge(q, rq); *merged_request = attempt_front_merge(q, rq);
if (!*merged_request) if (!*merged_request)
elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE); elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
return true; return true;
case ELEVATOR_DISCARD_MERGE: case ELEVATOR_DISCARD_MERGE:
return bio_attempt_discard_merge(q, rq, bio); return bio_attempt_discard_merge(q, rq, bio) == BIO_MERGE_OK;
default: default:
return false; return false;
} }
} }
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
/*
 * Iterate list of requests and see if we can merge this bio with any
 * of them.
 *
 * The list is walked in reverse and at most 8 requests are looked at.
 * Requests that fail blk_rq_merge_ok(), or for which blk_try_merge()
 * yields no back/front/discard merge type, are skipped. The function
 * returns after the first request that does yield such a merge type,
 * whether or not that merge attempt succeeded.
 */
bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
struct bio *bio, unsigned int nr_segs)
{
struct request *rq;
/* number of list entries still allowed to be examined */
int checked = 8;
list_for_each_entry_reverse(rq, list, queuelist) {
bool merged = false;
if (!checked--)
break;
if (!blk_rq_merge_ok(rq, bio))
continue;
switch (blk_try_merge(rq, bio)) {
case ELEVATOR_BACK_MERGE:
if (blk_mq_sched_allow_merge(q, rq, bio))
merged = bio_attempt_back_merge(rq, bio,
nr_segs);
break;
case ELEVATOR_FRONT_MERGE:
if (blk_mq_sched_allow_merge(q, rq, bio))
merged = bio_attempt_front_merge(rq, bio,
nr_segs);
break;
case ELEVATOR_DISCARD_MERGE:
merged = bio_attempt_discard_merge(q, rq, bio);
break;
default:
/* no merge type for this request: move on to the next one */
continue;
}
/* a merge type was found: report the outcome and stop scanning */
return merged;
}
return false;
}
EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
/*
 * Try to merge @bio into a request already sitting on the software queue
 * of @ctx that matches the type of @hctx. The scan itself is done by
 * blk_mq_bio_list_merge(), which walks the list in reverse and gives up
 * after a hand-wavy limit of 8 entries so that not too much time is spent
 * looking for merges.
 *
 * Must be called with ctx->lock held. Increments ctx->rq_merged and returns
 * true on a successful merge, returns false otherwise.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_hw_ctx *hctx,
				 struct blk_mq_ctx *ctx, struct bio *bio,
				 unsigned int nr_segs)
{
	bool merged;

	lockdep_assert_held(&ctx->lock);

	merged = blk_mq_bio_list_merge(q, &ctx->rq_lists[hctx->type], bio,
				       nr_segs);
	if (merged)
		ctx->rq_merged++;

	return merged;
}
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs) unsigned int nr_segs)
{ {
...@@ -470,14 +389,24 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, ...@@ -470,14 +389,24 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
return e->type->ops.bio_merge(hctx, bio, nr_segs); return e->type->ops.bio_merge(hctx, bio, nr_segs);
type = hctx->type; type = hctx->type;
if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
!list_empty_careful(&ctx->rq_lists[type])) { list_empty_careful(&ctx->rq_lists[type]))
/* default per sw-queue merge */ return false;
spin_lock(&ctx->lock);
ret = blk_mq_attempt_merge(q, hctx, ctx, bio, nr_segs); /* default per sw-queue merge */
spin_unlock(&ctx->lock); spin_lock(&ctx->lock);
/*
* Reverse check our software queue for entries that we could
* potentially merge with. Currently includes a hand-wavy stop
* count of 8, to not spend too much time checking for merges.
*/
if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
ctx->rq_merged++;
ret = true;
} }
spin_unlock(&ctx->lock);
return ret; return ret;
} }
...@@ -531,7 +460,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, ...@@ -531,7 +460,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
goto run; goto run;
} }
WARN_ON(e && (rq->tag != -1)); WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG));
if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) { if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) {
/* /*
...@@ -616,9 +545,11 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, ...@@ -616,9 +545,11 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx) unsigned int hctx_idx)
{ {
unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
if (hctx->sched_tags) { if (hctx->sched_tags) {
blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
blk_mq_free_rq_map(hctx->sched_tags); blk_mq_free_rq_map(hctx->sched_tags, flags);
hctx->sched_tags = NULL; hctx->sched_tags = NULL;
} }
} }
...@@ -628,10 +559,12 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q, ...@@ -628,10 +559,12 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
unsigned int hctx_idx) unsigned int hctx_idx)
{ {
struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_tag_set *set = q->tag_set;
/* Clear HCTX_SHARED so tags are init'ed */
unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
int ret; int ret;
hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
set->reserved_tags); set->reserved_tags, flags);
if (!hctx->sched_tags) if (!hctx->sched_tags)
return -ENOMEM; return -ENOMEM;
...@@ -649,8 +582,11 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q) ...@@ -649,8 +582,11 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q)
int i; int i;
queue_for_each_hw_ctx(q, hctx, i) { queue_for_each_hw_ctx(q, hctx, i) {
/* Clear HCTX_SHARED so tags are freed */
unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
if (hctx->sched_tags) { if (hctx->sched_tags) {
blk_mq_free_rq_map(hctx->sched_tags); blk_mq_free_rq_map(hctx->sched_tags, flags);
hctx->sched_tags = NULL; hctx->sched_tags = NULL;
} }
} }
......
...@@ -5,9 +5,6 @@ ...@@ -5,9 +5,6 @@
#include "blk-mq.h" #include "blk-mq.h"
#include "blk-mq-tag.h" #include "blk-mq-tag.h"
void blk_mq_sched_free_hctx_data(struct request_queue *q,
void (*exit)(struct blk_mq_hw_ctx *));
void blk_mq_sched_assign_ioc(struct request *rq); void blk_mq_sched_assign_ioc(struct request *rq);
void blk_mq_sched_request_inserted(struct request *rq); void blk_mq_sched_request_inserted(struct request *rq);
......
...@@ -23,9 +23,18 @@ ...@@ -23,9 +23,18 @@
*/ */
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{ {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && if (blk_mq_is_sbitmap_shared(hctx->flags)) {
!test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) struct request_queue *q = hctx->queue;
atomic_inc(&hctx->tags->active_queues); struct blk_mq_tag_set *set = q->tag_set;
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
!test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
atomic_inc(&set->active_queues_shared_sbitmap);
} else {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
!test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
atomic_inc(&hctx->tags->active_queues);
}
return true; return true;
} }
...@@ -35,9 +44,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) ...@@ -35,9 +44,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
*/ */
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{ {
sbitmap_queue_wake_all(&tags->bitmap_tags); sbitmap_queue_wake_all(tags->bitmap_tags);
if (include_reserve) if (include_reserve)
sbitmap_queue_wake_all(&tags->breserved_tags); sbitmap_queue_wake_all(tags->breserved_tags);
} }
/* /*
...@@ -47,11 +56,19 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) ...@@ -47,11 +56,19 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{ {
struct blk_mq_tags *tags = hctx->tags; struct blk_mq_tags *tags = hctx->tags;
struct request_queue *q = hctx->queue;
if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) struct blk_mq_tag_set *set = q->tag_set;
return;
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
atomic_dec(&tags->active_queues); if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
&q->queue_flags))
return;
atomic_dec(&set->active_queues_shared_sbitmap);
} else {
if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return;
atomic_dec(&tags->active_queues);
}
blk_mq_tag_wakeup_all(tags, false); blk_mq_tag_wakeup_all(tags, false);
} }
...@@ -59,7 +76,8 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) ...@@ -59,7 +76,8 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
struct sbitmap_queue *bt) struct sbitmap_queue *bt)
{ {
if (!data->q->elevator && !hctx_may_queue(data->hctx, bt)) if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) &&
!hctx_may_queue(data->hctx, bt))
return BLK_MQ_NO_TAG; return BLK_MQ_NO_TAG;
if (data->shallow_depth) if (data->shallow_depth)
...@@ -82,10 +100,10 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) ...@@ -82,10 +100,10 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
WARN_ON_ONCE(1); WARN_ON_ONCE(1);
return BLK_MQ_NO_TAG; return BLK_MQ_NO_TAG;
} }
bt = &tags->breserved_tags; bt = tags->breserved_tags;
tag_offset = 0; tag_offset = 0;
} else { } else {
bt = &tags->bitmap_tags; bt = tags->bitmap_tags;
tag_offset = tags->nr_reserved_tags; tag_offset = tags->nr_reserved_tags;
} }
...@@ -131,9 +149,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) ...@@ -131,9 +149,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
data->ctx); data->ctx);
tags = blk_mq_tags_from_data(data); tags = blk_mq_tags_from_data(data);
if (data->flags & BLK_MQ_REQ_RESERVED) if (data->flags & BLK_MQ_REQ_RESERVED)
bt = &tags->breserved_tags; bt = tags->breserved_tags;
else else
bt = &tags->bitmap_tags; bt = tags->bitmap_tags;
/* /*
* If destination hw queue is changed, fake wake up on * If destination hw queue is changed, fake wake up on
...@@ -167,10 +185,10 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, ...@@ -167,10 +185,10 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
const int real_tag = tag - tags->nr_reserved_tags; const int real_tag = tag - tags->nr_reserved_tags;
BUG_ON(real_tag >= tags->nr_tags); BUG_ON(real_tag >= tags->nr_tags);
sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu);
} else { } else {
BUG_ON(tag >= tags->nr_reserved_tags); BUG_ON(tag >= tags->nr_reserved_tags);
sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu);
} }
} }
...@@ -197,7 +215,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) ...@@ -197,7 +215,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* We can hit rq == NULL here, because the tagging functions * We can hit rq == NULL here, because the tagging functions
* test and set the bit before assigning ->rqs[]. * test and set the bit before assigning ->rqs[].
*/ */
if (rq && rq->q == hctx->queue) if (rq && rq->q == hctx->queue && rq->mq_hctx == hctx)
return iter_data->fn(hctx, rq, iter_data->data, reserved); return iter_data->fn(hctx, rq, iter_data->data, reserved);
return true; return true;
} }
...@@ -298,9 +316,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags, ...@@ -298,9 +316,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED); WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);
if (tags->nr_reserved_tags) if (tags->nr_reserved_tags)
bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, bt_tags_for_each(tags, tags->breserved_tags, fn, priv,
flags | BT_TAG_ITER_RESERVED); flags | BT_TAG_ITER_RESERVED);
bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags); bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags);
} }
/** /**
...@@ -416,8 +434,8 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, ...@@ -416,8 +434,8 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
continue; continue;
if (tags->nr_reserved_tags) if (tags->nr_reserved_tags)
bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); bt_for_each(hctx, tags->breserved_tags, fn, priv, true);
bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); bt_for_each(hctx, tags->bitmap_tags, fn, priv, false);
} }
blk_queue_exit(q); blk_queue_exit(q);
} }
...@@ -429,30 +447,64 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, ...@@ -429,30 +447,64 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
node); node);
} }
static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
int node, int alloc_policy) int node, int alloc_policy)
{ {
unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) if (bt_alloc(&tags->__bitmap_tags, depth, round_robin, node))
goto free_tags; return -ENOMEM;
if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin, if (bt_alloc(&tags->__breserved_tags, tags->nr_reserved_tags,
node)) round_robin, node))
goto free_bitmap_tags; goto free_bitmap_tags;
return tags; tags->bitmap_tags = &tags->__bitmap_tags;
tags->breserved_tags = &tags->__breserved_tags;
return 0;
free_bitmap_tags: free_bitmap_tags:
sbitmap_queue_free(&tags->bitmap_tags); sbitmap_queue_free(&tags->__bitmap_tags);
free_tags: return -ENOMEM;
kfree(tags); }
return NULL;
/*
 * Allocate the tag-set wide bitmaps and point every hardware queue's tags
 * at them.
 *
 * @set:   tag set that owns __bitmap_tags and __breserved_tags
 * @flags: not referenced by this function
 *
 * The regular bitmap is sized queue_depth minus reserved_tags, the reserved
 * bitmap is sized reserved_tags. Returns 0 on success or -ENOMEM when either
 * allocation fails; the regular bitmap is released again if the reserved
 * one cannot be allocated.
 */
int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags)
{
	int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
	bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
	unsigned int depth = set->queue_depth - set->reserved_tags;
	int node = set->numa_node;
	int i;

	if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node))
		return -ENOMEM;

	if (bt_alloc(&set->__breserved_tags, set->reserved_tags,
		     round_robin, node)) {
		sbitmap_queue_free(&set->__bitmap_tags);
		return -ENOMEM;
	}

	/* Every hardware queue shares the two set-wide bitmaps. */
	for (i = 0; i < set->nr_hw_queues; i++) {
		struct blk_mq_tags *tags = set->tags[i];

		tags->bitmap_tags = &set->__bitmap_tags;
		tags->breserved_tags = &set->__breserved_tags;
	}

	return 0;
}
/*
 * Free the two tag-set wide bitmaps (__bitmap_tags and __breserved_tags)
 * embedded in @set.
 */
void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
{
sbitmap_queue_free(&set->__bitmap_tags);
sbitmap_queue_free(&set->__breserved_tags);
}
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
unsigned int reserved_tags, unsigned int reserved_tags,
int node, int alloc_policy) int node, unsigned int flags)
{ {
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
struct blk_mq_tags *tags; struct blk_mq_tags *tags;
if (total_tags > BLK_MQ_TAG_MAX) { if (total_tags > BLK_MQ_TAG_MAX) {
...@@ -467,13 +519,22 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, ...@@ -467,13 +519,22 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_tags = total_tags; tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags; tags->nr_reserved_tags = reserved_tags;
return blk_mq_init_bitmap_tags(tags, node, alloc_policy); if (flags & BLK_MQ_F_TAG_HCTX_SHARED)
return tags;
if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
kfree(tags);
return NULL;
}
return tags;
} }
void blk_mq_free_tags(struct blk_mq_tags *tags) void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)
{ {
sbitmap_queue_free(&tags->bitmap_tags); if (!(flags & BLK_MQ_F_TAG_HCTX_SHARED)) {
sbitmap_queue_free(&tags->breserved_tags); sbitmap_queue_free(tags->bitmap_tags);
sbitmap_queue_free(tags->breserved_tags);
}
kfree(tags); kfree(tags);
} }
...@@ -492,6 +553,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, ...@@ -492,6 +553,8 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
*/ */
if (tdepth > tags->nr_tags) { if (tdepth > tags->nr_tags) {
struct blk_mq_tag_set *set = hctx->queue->tag_set; struct blk_mq_tag_set *set = hctx->queue->tag_set;
/* Only sched tags can grow, so clear HCTX_SHARED flag */
unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
struct blk_mq_tags *new; struct blk_mq_tags *new;
bool ret; bool ret;
...@@ -506,30 +569,35 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, ...@@ -506,30 +569,35 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
return -EINVAL; return -EINVAL;
new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
tags->nr_reserved_tags); tags->nr_reserved_tags, flags);
if (!new) if (!new)
return -ENOMEM; return -ENOMEM;
ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
if (ret) { if (ret) {
blk_mq_free_rq_map(new); blk_mq_free_rq_map(new, flags);
return -ENOMEM; return -ENOMEM;
} }
blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
blk_mq_free_rq_map(*tagsptr); blk_mq_free_rq_map(*tagsptr, flags);
*tagsptr = new; *tagsptr = new;
} else { } else {
/* /*
* Don't need (or can't) update reserved tags here, they * Don't need (or can't) update reserved tags here, they
* remain static and should never need resizing. * remain static and should never need resizing.
*/ */
sbitmap_queue_resize(&tags->bitmap_tags, sbitmap_queue_resize(tags->bitmap_tags,
tdepth - tags->nr_reserved_tags); tdepth - tags->nr_reserved_tags);
} }
return 0; return 0;
} }
/*
 * Resize the tag-set wide regular bitmap of @set.
 *
 * @size is a total that includes the reserved tags: set->reserved_tags is
 * subtracted before the new depth is handed to sbitmap_queue_resize(). The
 * reserved bitmap is not touched.
 */
void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)
{
sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags);
}
/** /**
* blk_mq_unique_tag() - return a tag that is unique queue-wide * blk_mq_unique_tag() - return a tag that is unique queue-wide
* @rq: request for which to compute a unique tag * @rq: request for which to compute a unique tag
......
...@@ -2,8 +2,6 @@ ...@@ -2,8 +2,6 @@
#ifndef INT_BLK_MQ_TAG_H #ifndef INT_BLK_MQ_TAG_H
#define INT_BLK_MQ_TAG_H #define INT_BLK_MQ_TAG_H
#include "blk-mq.h"
/* /*
* Tag address space map. * Tag address space map.
*/ */
...@@ -13,17 +11,25 @@ struct blk_mq_tags { ...@@ -13,17 +11,25 @@ struct blk_mq_tags {
atomic_t active_queues; atomic_t active_queues;
struct sbitmap_queue bitmap_tags; struct sbitmap_queue *bitmap_tags;
struct sbitmap_queue breserved_tags; struct sbitmap_queue *breserved_tags;
struct sbitmap_queue __bitmap_tags;
struct sbitmap_queue __breserved_tags;
struct request **rqs; struct request **rqs;
struct request **static_rqs; struct request **static_rqs;
struct list_head page_list; struct list_head page_list;
}; };
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
unsigned int reserved_tags,
int node, unsigned int flags);
extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags);
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy); extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set,
extern void blk_mq_free_tags(struct blk_mq_tags *tags); unsigned int flags);
extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set);
extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
...@@ -31,6 +37,9 @@ extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, ...@@ -31,6 +37,9 @@ extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
struct blk_mq_tags **tags, struct blk_mq_tags **tags,
unsigned int depth, bool can_grow); unsigned int depth, bool can_grow);
extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set,
unsigned int size);
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv); void *priv);
...@@ -56,7 +65,7 @@ extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); ...@@ -56,7 +65,7 @@ extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{ {
if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
return false; return false;
return __blk_mq_tag_busy(hctx); return __blk_mq_tag_busy(hctx);
...@@ -64,43 +73,12 @@ static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) ...@@ -64,43 +73,12 @@ static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{ {
if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
return; return;
__blk_mq_tag_idle(hctx); __blk_mq_tag_idle(hctx);
} }
/*
 * Fair-share check for shared tag users: the number of currently active
 * users is tracked and each of them is offered an equal slice of the tag
 * depth.
 *
 * Returns true when @hctx may take another tag from @bt, false when it has
 * already reached its share.
 */
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
				  struct sbitmap_queue *bt)
{
	unsigned int share, active;

	/*
	 * No limit applies without a hardware context, without shared
	 * tags, while this context is not marked active, or when the
	 * bitmap has a depth of one and cannot be divided any further.
	 */
	if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED) ||
	    !test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
	    bt->sb.depth == 1)
		return true;

	active = atomic_read(&hctx->tags->active_queues);
	if (!active)
		return true;

	/* Depth divided by users, rounded up, but never fewer than 4 tags. */
	share = max((bt->sb.depth + active - 1) / active, 4U);

	return atomic_read(&hctx->nr_active) < share;
}
static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
unsigned int tag) unsigned int tag)
{ {
......
...@@ -519,7 +519,7 @@ void blk_mq_free_request(struct request *rq) ...@@ -519,7 +519,7 @@ void blk_mq_free_request(struct request *rq)
ctx->rq_completed[rq_is_sync(rq)]++; ctx->rq_completed[rq_is_sync(rq)]++;
if (rq->rq_flags & RQF_MQ_INFLIGHT) if (rq->rq_flags & RQF_MQ_INFLIGHT)
atomic_dec(&hctx->nr_active); __blk_mq_dec_active_requests(hctx);
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
laptop_io_completion(q->backing_dev_info); laptop_io_completion(q->backing_dev_info);
...@@ -1096,19 +1096,20 @@ static inline unsigned int queued_to_index(unsigned int queued) ...@@ -1096,19 +1096,20 @@ static inline unsigned int queued_to_index(unsigned int queued)
static bool __blk_mq_get_driver_tag(struct request *rq) static bool __blk_mq_get_driver_tag(struct request *rq)
{ {
struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags;
unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
int tag; int tag;
blk_mq_tag_busy(rq->mq_hctx); blk_mq_tag_busy(rq->mq_hctx);
if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
bt = &rq->mq_hctx->tags->breserved_tags; bt = rq->mq_hctx->tags->breserved_tags;
tag_offset = 0; tag_offset = 0;
} else {
if (!hctx_may_queue(rq->mq_hctx, bt))
return false;
} }
if (!hctx_may_queue(rq->mq_hctx, bt))
return false;
tag = __sbitmap_queue_get(bt); tag = __sbitmap_queue_get(bt);
if (tag == BLK_MQ_NO_TAG) if (tag == BLK_MQ_NO_TAG)
return false; return false;
...@@ -1124,10 +1125,10 @@ static bool blk_mq_get_driver_tag(struct request *rq) ...@@ -1124,10 +1125,10 @@ static bool blk_mq_get_driver_tag(struct request *rq)
if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq)) if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
return false; return false;
if ((hctx->flags & BLK_MQ_F_TAG_SHARED) && if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
!(rq->rq_flags & RQF_MQ_INFLIGHT)) { !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
rq->rq_flags |= RQF_MQ_INFLIGHT; rq->rq_flags |= RQF_MQ_INFLIGHT;
atomic_inc(&hctx->nr_active); __blk_mq_inc_active_requests(hctx);
} }
hctx->tags->rqs[rq->tag] = rq; hctx->tags->rqs[rq->tag] = rq;
return true; return true;
...@@ -1145,7 +1146,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, ...@@ -1145,7 +1146,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
struct sbitmap_queue *sbq; struct sbitmap_queue *sbq;
list_del_init(&wait->entry); list_del_init(&wait->entry);
sbq = &hctx->tags->bitmap_tags; sbq = hctx->tags->bitmap_tags;
atomic_dec(&sbq->ws_active); atomic_dec(&sbq->ws_active);
} }
spin_unlock(&hctx->dispatch_wait_lock); spin_unlock(&hctx->dispatch_wait_lock);
...@@ -1163,12 +1164,12 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, ...@@ -1163,12 +1164,12 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
struct request *rq) struct request *rq)
{ {
struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags; struct sbitmap_queue *sbq = hctx->tags->bitmap_tags;
struct wait_queue_head *wq; struct wait_queue_head *wq;
wait_queue_entry_t *wait; wait_queue_entry_t *wait;
bool ret; bool ret;
if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) { if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
blk_mq_sched_mark_restart_hctx(hctx); blk_mq_sched_mark_restart_hctx(hctx);
/* /*
...@@ -1420,7 +1421,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, ...@@ -1420,7 +1421,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
bool needs_restart; bool needs_restart;
/* For non-shared tags, the RESTART check will suffice */ /* For non-shared tags, the RESTART check will suffice */
bool no_tag = prep == PREP_DISPATCH_NO_TAG && bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
(hctx->flags & BLK_MQ_F_TAG_SHARED); (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED);
bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET; bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET;
blk_mq_release_budgets(q, nr_budgets); blk_mq_release_budgets(q, nr_budgets);
...@@ -2296,20 +2297,21 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, ...@@ -2296,20 +2297,21 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
} }
} }
void blk_mq_free_rq_map(struct blk_mq_tags *tags) void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
{ {
kfree(tags->rqs); kfree(tags->rqs);
tags->rqs = NULL; tags->rqs = NULL;
kfree(tags->static_rqs); kfree(tags->static_rqs);
tags->static_rqs = NULL; tags->static_rqs = NULL;
blk_mq_free_tags(tags); blk_mq_free_tags(tags, flags);
} }
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
unsigned int hctx_idx, unsigned int hctx_idx,
unsigned int nr_tags, unsigned int nr_tags,
unsigned int reserved_tags) unsigned int reserved_tags,
unsigned int flags)
{ {
struct blk_mq_tags *tags; struct blk_mq_tags *tags;
int node; int node;
...@@ -2318,8 +2320,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, ...@@ -2318,8 +2320,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
if (node == NUMA_NO_NODE) if (node == NUMA_NO_NODE)
node = set->numa_node; node = set->numa_node;
tags = blk_mq_init_tags(nr_tags, reserved_tags, node, tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
if (!tags) if (!tags)
return NULL; return NULL;
...@@ -2327,7 +2328,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, ...@@ -2327,7 +2328,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node); node);
if (!tags->rqs) { if (!tags->rqs) {
blk_mq_free_tags(tags); blk_mq_free_tags(tags, flags);
return NULL; return NULL;
} }
...@@ -2336,7 +2337,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, ...@@ -2336,7 +2337,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
node); node);
if (!tags->static_rqs) { if (!tags->static_rqs) {
kfree(tags->rqs); kfree(tags->rqs);
blk_mq_free_tags(tags); blk_mq_free_tags(tags, flags);
return NULL; return NULL;
} }
...@@ -2660,6 +2661,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, ...@@ -2660,6 +2661,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
goto free_hctx; goto free_hctx;
atomic_set(&hctx->nr_active, 0); atomic_set(&hctx->nr_active, 0);
atomic_set(&hctx->elevator_queued, 0);
if (node == NUMA_NO_NODE) if (node == NUMA_NO_NODE)
node = set->numa_node; node = set->numa_node;
hctx->numa_node = node; hctx->numa_node = node;
...@@ -2668,7 +2670,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, ...@@ -2668,7 +2670,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
spin_lock_init(&hctx->lock); spin_lock_init(&hctx->lock);
INIT_LIST_HEAD(&hctx->dispatch); INIT_LIST_HEAD(&hctx->dispatch);
hctx->queue = q; hctx->queue = q;
hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
INIT_LIST_HEAD(&hctx->hctx_list); INIT_LIST_HEAD(&hctx->hctx_list);
...@@ -2745,10 +2747,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, ...@@ -2745,10 +2747,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set, static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
int hctx_idx) int hctx_idx)
{ {
unsigned int flags = set->flags;
int ret = 0; int ret = 0;
set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
set->queue_depth, set->reserved_tags); set->queue_depth, set->reserved_tags, flags);
if (!set->tags[hctx_idx]) if (!set->tags[hctx_idx])
return false; return false;
...@@ -2757,7 +2760,7 @@ static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set, ...@@ -2757,7 +2760,7 @@ static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
if (!ret) if (!ret)
return true; return true;
blk_mq_free_rq_map(set->tags[hctx_idx]); blk_mq_free_rq_map(set->tags[hctx_idx], flags);
set->tags[hctx_idx] = NULL; set->tags[hctx_idx] = NULL;
return false; return false;
} }
...@@ -2765,9 +2768,11 @@ static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set, ...@@ -2765,9 +2768,11 @@ static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
unsigned int hctx_idx) unsigned int hctx_idx)
{ {
unsigned int flags = set->flags;
if (set->tags && set->tags[hctx_idx]) { if (set->tags && set->tags[hctx_idx]) {
blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
blk_mq_free_rq_map(set->tags[hctx_idx]); blk_mq_free_rq_map(set->tags[hctx_idx], flags);
set->tags[hctx_idx] = NULL; set->tags[hctx_idx] = NULL;
} }
} }
...@@ -2885,14 +2890,14 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared) ...@@ -2885,14 +2890,14 @@ static void queue_set_hctx_shared(struct request_queue *q, bool shared)
queue_for_each_hw_ctx(q, hctx, i) { queue_for_each_hw_ctx(q, hctx, i) {
if (shared) if (shared)
hctx->flags |= BLK_MQ_F_TAG_SHARED; hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
else else
hctx->flags &= ~BLK_MQ_F_TAG_SHARED; hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
} }
} }
static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
bool shared) bool shared)
{ {
struct request_queue *q; struct request_queue *q;
...@@ -2913,9 +2918,9 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) ...@@ -2913,9 +2918,9 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
list_del(&q->tag_set_list); list_del(&q->tag_set_list);
if (list_is_singular(&set->tag_list)) { if (list_is_singular(&set->tag_list)) {
/* just transitioned to unshared */ /* just transitioned to unshared */
set->flags &= ~BLK_MQ_F_TAG_SHARED; set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
/* update existing queue */ /* update existing queue */
blk_mq_update_tag_set_depth(set, false); blk_mq_update_tag_set_shared(set, false);
} }
mutex_unlock(&set->tag_list_lock); mutex_unlock(&set->tag_list_lock);
INIT_LIST_HEAD(&q->tag_set_list); INIT_LIST_HEAD(&q->tag_set_list);
...@@ -2930,12 +2935,12 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, ...@@ -2930,12 +2935,12 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
* Check to see if we're transitioning to shared (from 1 to 2 queues). * Check to see if we're transitioning to shared (from 1 to 2 queues).
*/ */
if (!list_empty(&set->tag_list) && if (!list_empty(&set->tag_list) &&
!(set->flags & BLK_MQ_F_TAG_SHARED)) { !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
set->flags |= BLK_MQ_F_TAG_SHARED; set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
/* update existing queue */ /* update existing queue */
blk_mq_update_tag_set_depth(set, true); blk_mq_update_tag_set_shared(set, true);
} }
if (set->flags & BLK_MQ_F_TAG_SHARED) if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
queue_set_hctx_shared(q, true); queue_set_hctx_shared(q, true);
list_add_tail(&q->tag_set_list, &set->tag_list); list_add_tail(&q->tag_set_list, &set->tag_list);
...@@ -3438,11 +3443,23 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) ...@@ -3438,11 +3443,23 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (ret) if (ret)
goto out_free_mq_map; goto out_free_mq_map;
if (blk_mq_is_sbitmap_shared(set->flags)) {
atomic_set(&set->active_queues_shared_sbitmap, 0);
if (blk_mq_init_shared_sbitmap(set, set->flags)) {
ret = -ENOMEM;
goto out_free_mq_rq_maps;
}
}
mutex_init(&set->tag_list_lock); mutex_init(&set->tag_list_lock);
INIT_LIST_HEAD(&set->tag_list); INIT_LIST_HEAD(&set->tag_list);
return 0; return 0;
out_free_mq_rq_maps:
for (i = 0; i < set->nr_hw_queues; i++)
blk_mq_free_map_and_requests(set, i);
out_free_mq_map: out_free_mq_map:
for (i = 0; i < set->nr_maps; i++) { for (i = 0; i < set->nr_maps; i++) {
kfree(set->map[i].mq_map); kfree(set->map[i].mq_map);
...@@ -3461,6 +3478,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) ...@@ -3461,6 +3478,9 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
for (i = 0; i < set->nr_hw_queues; i++) for (i = 0; i < set->nr_hw_queues; i++)
blk_mq_free_map_and_requests(set, i); blk_mq_free_map_and_requests(set, i);
if (blk_mq_is_sbitmap_shared(set->flags))
blk_mq_exit_shared_sbitmap(set);
for (j = 0; j < set->nr_maps; j++) { for (j = 0; j < set->nr_maps; j++) {
kfree(set->map[j].mq_map); kfree(set->map[j].mq_map);
set->map[j].mq_map = NULL; set->map[j].mq_map = NULL;
...@@ -3497,6 +3517,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) ...@@ -3497,6 +3517,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
if (!hctx->sched_tags) { if (!hctx->sched_tags) {
ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
false); false);
if (!ret && blk_mq_is_sbitmap_shared(set->flags))
blk_mq_tag_resize_shared_sbitmap(set, nr);
} else { } else {
ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
nr, true); nr, true);
......
...@@ -53,11 +53,12 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, ...@@ -53,11 +53,12 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
*/ */
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx); unsigned int hctx_idx);
void blk_mq_free_rq_map(struct blk_mq_tags *tags); void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags);
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
unsigned int hctx_idx, unsigned int hctx_idx,
unsigned int nr_tags, unsigned int nr_tags,
unsigned int reserved_tags); unsigned int reserved_tags,
unsigned int flags);
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx, unsigned int depth); unsigned int hctx_idx, unsigned int depth);
...@@ -158,6 +159,11 @@ struct blk_mq_alloc_data { ...@@ -158,6 +159,11 @@ struct blk_mq_alloc_data {
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
}; };
/*
 * Report whether @flags has BLK_MQ_F_TAG_HCTX_SHARED set. Callers in this
 * file pass either a tag set's flags or a hardware context's flags.
 */
static inline bool blk_mq_is_sbitmap_shared(unsigned int flags)
{
	return (flags & BLK_MQ_F_TAG_HCTX_SHARED) != 0;
}
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
{ {
if (data->q->elevator) if (data->q->elevator)
...@@ -193,6 +199,28 @@ static inline bool blk_mq_get_dispatch_budget(struct request_queue *q) ...@@ -193,6 +199,28 @@ static inline bool blk_mq_get_dispatch_budget(struct request_queue *q)
return true; return true;
} }
/*
 * Count one more active request for @hctx. With a shared sbitmap the
 * counter is the request queue's nr_active_requests_shared_sbitmap;
 * otherwise it is the hardware context's own nr_active.
 */
static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
{
	if (!blk_mq_is_sbitmap_shared(hctx->flags)) {
		atomic_inc(&hctx->nr_active);
		return;
	}

	atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap);
}
/*
 * Count one fewer active request for @hctx. With a shared sbitmap the
 * counter is the request queue's nr_active_requests_shared_sbitmap;
 * otherwise it is the hardware context's own nr_active.
 */
static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
{
	if (!blk_mq_is_sbitmap_shared(hctx->flags)) {
		atomic_dec(&hctx->nr_active);
		return;
	}

	atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap);
}
/*
 * Read the active-request count that applies to @hctx: the per-hctx
 * nr_active normally, or the request queue's
 * nr_active_requests_shared_sbitmap when the sbitmap is shared.
 */
static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
{
	if (!blk_mq_is_sbitmap_shared(hctx->flags))
		return atomic_read(&hctx->nr_active);

	return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap);
}
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
struct request *rq) struct request *rq)
{ {
...@@ -201,7 +229,7 @@ static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, ...@@ -201,7 +229,7 @@ static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
if (rq->rq_flags & RQF_MQ_INFLIGHT) { if (rq->rq_flags & RQF_MQ_INFLIGHT) {
rq->rq_flags &= ~RQF_MQ_INFLIGHT; rq->rq_flags &= ~RQF_MQ_INFLIGHT;
atomic_dec(&hctx->nr_active); __blk_mq_dec_active_requests(hctx);
} }
} }
...@@ -253,4 +281,46 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q, ...@@ -253,4 +281,46 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
return NULL; return NULL;
} }
/*
 * Fair-share check for users of a shared tag space.
 *
 * The bitmap depth is divided (rounding up) by the number of currently
 * active users, with a floor of four tags. Returns true when @hctx is
 * still below that share, or when no limit applies at all.
 */
static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
				  struct sbitmap_queue *bt)
{
	unsigned int nr_users, fair_share;

	/* No hardware context: nothing to account against. */
	if (!hctx)
		return true;

	/* Tags are not shared between queues: no limit. */
	if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
		return true;

	/* A one-tag bitmap cannot be split any further. */
	if (bt->sb.depth == 1)
		return true;

	if (blk_mq_is_sbitmap_shared(hctx->flags)) {
		struct request_queue *queue = hctx->queue;
		struct blk_mq_tag_set *tag_set = queue->tag_set;

		/*
		 * NOTE(review): the active bit is tested in q->queue_flags
		 * here but in hctx->state in the other branch; confirm that
		 * this bit number cannot alias an unrelated queue flag.
		 */
		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &queue->queue_flags))
			return true;
		nr_users = atomic_read(&tag_set->active_queues_shared_sbitmap);
	} else {
		if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
			return true;
		nr_users = atomic_read(&hctx->tags->active_queues);
	}

	/* Also avoids a division by zero below. */
	if (nr_users == 0)
		return true;

	/* Round-up division, then guarantee each user at least 4 tags. */
	fair_share = (bt->sb.depth + nr_users - 1) / nr_users;
	if (fair_share < 4U)
		fair_share = 4U;

	return __blk_mq_active_requests(hctx) < fair_share;
}
#endif #endif
...@@ -172,15 +172,13 @@ EXPORT_SYMBOL(blk_queue_max_hw_sectors); ...@@ -172,15 +172,13 @@ EXPORT_SYMBOL(blk_queue_max_hw_sectors);
* *
* Description: * Description:
* If a driver doesn't want IOs to cross a given chunk size, it can set * If a driver doesn't want IOs to cross a given chunk size, it can set
* this limit and prevent merging across chunks. Note that the chunk size * this limit and prevent merging across chunks. Note that the block layer
* must currently be a power-of-2 in sectors. Also note that the block * must accept a page worth of data at any offset. So if the crossing of
* layer must accept a page worth of data at any offset. So if the * chunks is a hard limitation in the driver, it must still be prepared
* crossing of chunks is a hard limitation in the driver, it must still be * to split single page bios.
* prepared to split single page bios.
**/ **/
void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors)
{ {
BUG_ON(!is_power_of_2(chunk_sectors));
q->limits.chunk_sectors = chunk_sectors; q->limits.chunk_sectors = chunk_sectors;
} }
EXPORT_SYMBOL(blk_queue_chunk_sectors); EXPORT_SYMBOL(blk_queue_chunk_sectors);
...@@ -374,6 +372,19 @@ void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) ...@@ -374,6 +372,19 @@ void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset)
} }
EXPORT_SYMBOL(blk_queue_alignment_offset); EXPORT_SYMBOL(blk_queue_alignment_offset);
/**
 * blk_queue_update_readahead - refresh read-ahead and I/O sizes of a queue
 * @q:	the request queue whose backing_dev_info is updated
 *
 * Description:
 *    Recomputes ->io_pages from the queue's maximum sectors and ->ra_pages
 *    from the queue's optimal I/O size.
 **/
void blk_queue_update_readahead(struct request_queue *q)
{
	/* The shift by (PAGE_SHIFT - 9) converts 512-byte sectors to pages. */
	q->backing_dev_info->io_pages =
		queue_max_sectors(q) >> (PAGE_SHIFT - 9);

	/*
	 * Reading ahead in large files only pays off when the window is at
	 * least twice the optimal I/O size; never go below the
	 * VM_READAHEAD_PAGES default.
	 */
	q->backing_dev_info->ra_pages =
		max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
}
EXPORT_SYMBOL_GPL(blk_queue_update_readahead);
/** /**
* blk_limits_io_min - set minimum request size for a device * blk_limits_io_min - set minimum request size for a device
* @limits: the queue limits * @limits: the queue limits
...@@ -452,6 +463,8 @@ EXPORT_SYMBOL(blk_limits_io_opt); ...@@ -452,6 +463,8 @@ EXPORT_SYMBOL(blk_limits_io_opt);
void blk_queue_io_opt(struct request_queue *q, unsigned int opt) void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
{ {
blk_limits_io_opt(&q->limits, opt); blk_limits_io_opt(&q->limits, opt);
q->backing_dev_info->ra_pages =
max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
} }
EXPORT_SYMBOL(blk_queue_io_opt); EXPORT_SYMBOL(blk_queue_io_opt);
...@@ -534,6 +547,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, ...@@ -534,6 +547,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->io_min = max(t->io_min, b->io_min); t->io_min = max(t->io_min, b->io_min);
t->io_opt = lcm_not_zero(t->io_opt, b->io_opt); t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
t->chunk_sectors = lcm_not_zero(t->chunk_sectors, b->chunk_sectors);
/* Physical block size a multiple of the logical block size? */ /* Physical block size a multiple of the logical block size? */
if (t->physical_block_size & (t->logical_block_size - 1)) { if (t->physical_block_size & (t->logical_block_size - 1)) {
...@@ -556,6 +570,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, ...@@ -556,6 +570,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
ret = -1; ret = -1;
} }
/* chunk_sectors a multiple of the physical block size? */
if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) {
t->chunk_sectors = 0;
t->misaligned = 1;
ret = -1;
}
t->raid_partial_stripes_expensive = t->raid_partial_stripes_expensive =
max(t->raid_partial_stripes_expensive, max(t->raid_partial_stripes_expensive,
b->raid_partial_stripes_expensive); b->raid_partial_stripes_expensive);
...@@ -594,10 +615,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, ...@@ -594,10 +615,6 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->discard_granularity; t->discard_granularity;
} }
if (b->chunk_sectors)
t->chunk_sectors = min_not_zero(t->chunk_sectors,
b->chunk_sectors);
t->zoned = max(t->zoned, b->zoned); t->zoned = max(t->zoned, b->zoned);
return ret; return ret;
} }
...@@ -629,8 +646,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, ...@@ -629,8 +646,7 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
top, bottom); top, bottom);
} }
t->backing_dev_info->io_pages = blk_queue_update_readahead(disk->queue);
t->limits.max_sectors >> (PAGE_SHIFT - 9);
} }
EXPORT_SYMBOL(disk_stack_limits); EXPORT_SYMBOL(disk_stack_limits);
......
This diff is collapsed.
...@@ -15,10 +15,10 @@ ...@@ -15,10 +15,10 @@
#include "blk-cgroup-rwstat.h" #include "blk-cgroup-rwstat.h"
/* Max dispatch from a group in 1 round */ /* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8; #define THROTL_GRP_QUANTUM 8
/* Total max dispatch from all groups in one round */ /* Total max dispatch from all groups in one round */
static int throtl_quantum = 32; #define THROTL_QUANTUM 32
/* Throttling is performed over a slice and after that slice is renewed */ /* Throttling is performed over a slice and after that slice is renewed */
#define DFL_THROTL_SLICE_HD (HZ / 10) #define DFL_THROTL_SLICE_HD (HZ / 10)
...@@ -150,7 +150,7 @@ struct throtl_grp { ...@@ -150,7 +150,7 @@ struct throtl_grp {
/* user configured IOPS limits */ /* user configured IOPS limits */
unsigned int iops_conf[2][LIMIT_CNT]; unsigned int iops_conf[2][LIMIT_CNT];
/* Number of bytes disptached in current slice */ /* Number of bytes dispatched in current slice */
uint64_t bytes_disp[2]; uint64_t bytes_disp[2];
/* Number of bio's dispatched in current slice */ /* Number of bio's dispatched in current slice */
unsigned int io_disp[2]; unsigned int io_disp[2];
...@@ -852,7 +852,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) ...@@ -852,7 +852,7 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
/* /*
* A bio has been dispatched. Also adjust slice_end. It might happen * A bio has been dispatched. Also adjust slice_end. It might happen
* that initially cgroup limit was very low resulting in high * that initially cgroup limit was very low resulting in high
* slice_end, but later limit was bumped up and bio was dispached * slice_end, but later limit was bumped up and bio was dispatched
* sooner, then we need to reduce slice_end. A high bogus slice_end * sooner, then we need to reduce slice_end. A high bogus slice_end
* is bad because it does not allow new slice to start. * is bad because it does not allow new slice to start.
*/ */
...@@ -894,13 +894,19 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) ...@@ -894,13 +894,19 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
} }
static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
unsigned long *wait) u32 iops_limit, unsigned long *wait)
{ {
bool rw = bio_data_dir(bio); bool rw = bio_data_dir(bio);
unsigned int io_allowed; unsigned int io_allowed;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
u64 tmp; u64 tmp;
if (iops_limit == UINT_MAX) {
if (wait)
*wait = 0;
return true;
}
jiffy_elapsed = jiffies - tg->slice_start[rw]; jiffy_elapsed = jiffies - tg->slice_start[rw];
/* Round up to the next throttle slice, wait time must be nonzero */ /* Round up to the next throttle slice, wait time must be nonzero */
...@@ -913,7 +919,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, ...@@ -913,7 +919,7 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
* have been trimmed. * have been trimmed.
*/ */
tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd; tmp = (u64)iops_limit * jiffy_elapsed_rnd;
do_div(tmp, HZ); do_div(tmp, HZ);
if (tmp > UINT_MAX) if (tmp > UINT_MAX)
...@@ -936,13 +942,19 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, ...@@ -936,13 +942,19 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
} }
static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
unsigned long *wait) u64 bps_limit, unsigned long *wait)
{ {
bool rw = bio_data_dir(bio); bool rw = bio_data_dir(bio);
u64 bytes_allowed, extra_bytes, tmp; u64 bytes_allowed, extra_bytes, tmp;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
unsigned int bio_size = throtl_bio_data_size(bio); unsigned int bio_size = throtl_bio_data_size(bio);
if (bps_limit == U64_MAX) {
if (wait)
*wait = 0;
return true;
}
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
/* Slice has just started. Consider one slice interval */ /* Slice has just started. Consider one slice interval */
...@@ -951,7 +963,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, ...@@ -951,7 +963,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd; tmp = bps_limit * jiffy_elapsed_rnd;
do_div(tmp, HZ); do_div(tmp, HZ);
bytes_allowed = tmp; bytes_allowed = tmp;
...@@ -963,7 +975,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, ...@@ -963,7 +975,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
/* Calc approx time to dispatch */ /* Calc approx time to dispatch */
extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw)); jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit);
if (!jiffy_wait) if (!jiffy_wait)
jiffy_wait = 1; jiffy_wait = 1;
...@@ -987,6 +999,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, ...@@ -987,6 +999,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
{ {
bool rw = bio_data_dir(bio); bool rw = bio_data_dir(bio);
unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
u64 bps_limit = tg_bps_limit(tg, rw);
u32 iops_limit = tg_iops_limit(tg, rw);
/* /*
* Currently whole state machine of group depends on first bio * Currently whole state machine of group depends on first bio
...@@ -998,8 +1012,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, ...@@ -998,8 +1012,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
bio != throtl_peek_queued(&tg->service_queue.queued[rw])); bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
/* If tg->bps = -1, then BW is unlimited */ /* If tg->bps = -1, then BW is unlimited */
if (tg_bps_limit(tg, rw) == U64_MAX && if (bps_limit == U64_MAX && iops_limit == UINT_MAX) {
tg_iops_limit(tg, rw) == UINT_MAX) {
if (wait) if (wait)
*wait = 0; *wait = 0;
return true; return true;
...@@ -1021,8 +1034,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, ...@@ -1021,8 +1034,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
jiffies + tg->td->throtl_slice); jiffies + tg->td->throtl_slice);
} }
if (tg_with_in_bps_limit(tg, bio, &bps_wait) && if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) &&
tg_with_in_iops_limit(tg, bio, &iops_wait)) { tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) {
if (wait) if (wait)
*wait = 0; *wait = 0;
return true; return true;
...@@ -1082,7 +1095,7 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, ...@@ -1082,7 +1095,7 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
* If @tg doesn't currently have any bios queued in the same * If @tg doesn't currently have any bios queued in the same
* direction, queueing @bio can change when @tg should be * direction, queueing @bio can change when @tg should be
* dispatched. Mark that @tg was empty. This is automatically * dispatched. Mark that @tg was empty. This is automatically
* cleaered on the next tg_update_disptime(). * cleared on the next tg_update_disptime().
*/ */
if (!sq->nr_queued[rw]) if (!sq->nr_queued[rw])
tg->flags |= THROTL_TG_WAS_EMPTY; tg->flags |= THROTL_TG_WAS_EMPTY;
...@@ -1175,8 +1188,8 @@ static int throtl_dispatch_tg(struct throtl_grp *tg) ...@@ -1175,8 +1188,8 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
{ {
struct throtl_service_queue *sq = &tg->service_queue; struct throtl_service_queue *sq = &tg->service_queue;
unsigned int nr_reads = 0, nr_writes = 0; unsigned int nr_reads = 0, nr_writes = 0;
unsigned int max_nr_reads = throtl_grp_quantum*3/4; unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4;
unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads;
struct bio *bio; struct bio *bio;
/* Try to dispatch 75% READS and 25% WRITES */ /* Try to dispatch 75% READS and 25% WRITES */
...@@ -1226,7 +1239,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) ...@@ -1226,7 +1239,7 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
if (sq->nr_queued[0] || sq->nr_queued[1]) if (sq->nr_queued[0] || sq->nr_queued[1])
tg_update_disptime(tg); tg_update_disptime(tg);
if (nr_disp >= throtl_quantum) if (nr_disp >= THROTL_QUANTUM)
break; break;
} }
...@@ -1303,7 +1316,7 @@ static void throtl_pending_timer_fn(struct timer_list *t) ...@@ -1303,7 +1316,7 @@ static void throtl_pending_timer_fn(struct timer_list *t)
} }
} }
} else { } else {
/* reached the top-level, queue issueing */ /* reached the top-level, queue issuing */
queue_work(kthrotld_workqueue, &td->dispatch_work); queue_work(kthrotld_workqueue, &td->dispatch_work);
} }
out_unlock: out_unlock:
...@@ -1314,8 +1327,8 @@ static void throtl_pending_timer_fn(struct timer_list *t) ...@@ -1314,8 +1327,8 @@ static void throtl_pending_timer_fn(struct timer_list *t)
* blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
* @work: work item being executed * @work: work item being executed
* *
* This function is queued for execution when bio's reach the bio_lists[] * This function is queued for execution when bios reach the bio_lists[]
* of throtl_data->service_queue. Those bio's are ready and issued by this * of throtl_data->service_queue. Those bios are ready and issued by this
* function. * function.
*/ */
static void blk_throtl_dispatch_work_fn(struct work_struct *work) static void blk_throtl_dispatch_work_fn(struct work_struct *work)
...@@ -1428,8 +1441,8 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) ...@@ -1428,8 +1441,8 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
* that a group's limit are dropped suddenly and we don't want to * that a group's limit are dropped suddenly and we don't want to
* account recently dispatched IO with new low rate. * account recently dispatched IO with new low rate.
*/ */
throtl_start_new_slice(tg, 0); throtl_start_new_slice(tg, READ);
throtl_start_new_slice(tg, 1); throtl_start_new_slice(tg, WRITE);
if (tg->flags & THROTL_TG_PENDING) { if (tg->flags & THROTL_TG_PENDING) {
tg_update_disptime(tg); tg_update_disptime(tg);
...@@ -2230,7 +2243,7 @@ bool blk_throtl_bio(struct bio *bio) ...@@ -2230,7 +2243,7 @@ bool blk_throtl_bio(struct bio *bio)
/* /*
* @bio passed through this layer without being throttled. * @bio passed through this layer without being throttled.
* Climb up the ladder. If we''re already at the top, it * Climb up the ladder. If we're already at the top, it
* can be executed directly. * can be executed directly.
*/ */
qn = &tg->qnode_on_parent[rw]; qn = &tg->qnode_on_parent[rw];
......
...@@ -29,6 +29,12 @@ struct blk_flush_queue { ...@@ -29,6 +29,12 @@ struct blk_flush_queue {
spinlock_t mq_flush_lock; spinlock_t mq_flush_lock;
}; };
/*
 * Result of a bio merge attempt; returned by the bio_attempt_*_merge()
 * helpers declared in this header.
 *
 * NOTE(review): the per-value meanings below are inferred from the
 * enumerator names only; confirm against the merge implementations.
 */
enum bio_merge_status {
	BIO_MERGE_OK		= 0,	/* presumably: bio was merged */
	BIO_MERGE_NONE		= 1,	/* presumably: no merge was possible */
	BIO_MERGE_FAILED	= 2,	/* presumably: merge was tried and failed */
};
extern struct kmem_cache *blk_requestq_cachep; extern struct kmem_cache *blk_requestq_cachep;
extern struct kobj_type blk_queue_ktype; extern struct kobj_type blk_queue_ktype;
extern struct ida blk_queue_ida; extern struct ida blk_queue_ida;
...@@ -169,14 +175,19 @@ static inline void blk_integrity_del(struct gendisk *disk) ...@@ -169,14 +175,19 @@ static inline void blk_integrity_del(struct gendisk *disk)
unsigned long blk_rq_timeout(unsigned long timeout); unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req); void blk_add_timer(struct request *req);
bool bio_attempt_front_merge(struct request *req, struct bio *bio, enum bio_merge_status bio_attempt_front_merge(struct request *req,
unsigned int nr_segs); struct bio *bio,
bool bio_attempt_back_merge(struct request *req, struct bio *bio, unsigned int nr_segs);
unsigned int nr_segs); enum bio_merge_status bio_attempt_back_merge(struct request *req,
bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, struct bio *bio,
struct bio *bio); unsigned int nr_segs);
enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
struct request *req,
struct bio *bio);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **same_queue_rq); unsigned int nr_segs, struct request **same_queue_rq);
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
struct bio *bio, unsigned int nr_segs);
void blk_account_io_start(struct request *req); void blk_account_io_start(struct request *req);
void blk_account_io_done(struct request *req, u64 now); void blk_account_io_done(struct request *req, u64 now);
...@@ -350,7 +361,7 @@ char *disk_name(struct gendisk *hd, int partno, char *buf); ...@@ -350,7 +361,7 @@ char *disk_name(struct gendisk *hd, int partno, char *buf);
#define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_NONE 0
#define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_RAID 1
#define ADDPART_FLAG_WHOLEDISK 2 #define ADDPART_FLAG_WHOLEDISK 2
void delete_partition(struct gendisk *disk, struct hd_struct *part); void delete_partition(struct hd_struct *part);
int bdev_add_partition(struct block_device *bdev, int partno, int bdev_add_partition(struct block_device *bdev, int partno,
sector_t start, sector_t length); sector_t start, sector_t length);
int bdev_del_partition(struct block_device *bdev, int partno); int bdev_del_partition(struct block_device *bdev, int partno);
......
...@@ -207,7 +207,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) ...@@ -207,7 +207,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
BUG_ON(!req->nr_phys_segments); BUG_ON(!req->nr_phys_segments);
buf->sg_list = kzalloc(sz, GFP_KERNEL); buf->sg_list = kmalloc(sz, GFP_KERNEL);
if (!buf->sg_list) if (!buf->sg_list)
return -ENOMEM; return -ENOMEM;
sg_init_table(buf->sg_list, req->nr_phys_segments); sg_init_table(buf->sg_list, req->nr_phys_segments);
......
...@@ -50,14 +50,13 @@ static void disk_release_events(struct gendisk *disk); ...@@ -50,14 +50,13 @@ static void disk_release_events(struct gendisk *disk);
* zero and will not be set to zero * zero and will not be set to zero
*/ */
void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size,
bool revalidate) bool update_bdev)
{ {
sector_t capacity = get_capacity(disk); sector_t capacity = get_capacity(disk);
set_capacity(disk, size); set_capacity(disk, size);
if (update_bdev)
if (revalidate) revalidate_disk_size(disk, true);
revalidate_disk(disk);
if (capacity != size && capacity != 0 && size != 0) { if (capacity != size && capacity != 0 && size != 0) {
char *envp[] = { "RESIZE=1", NULL }; char *envp[] = { "RESIZE=1", NULL };
...@@ -110,8 +109,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) ...@@ -110,8 +109,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
} }
} }
static unsigned int part_in_flight(struct request_queue *q, static unsigned int part_in_flight(struct hd_struct *part)
struct hd_struct *part)
{ {
unsigned int inflight = 0; unsigned int inflight = 0;
int cpu; int cpu;
...@@ -126,8 +124,7 @@ static unsigned int part_in_flight(struct request_queue *q, ...@@ -126,8 +124,7 @@ static unsigned int part_in_flight(struct request_queue *q,
return inflight; return inflight;
} }
static void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2])
unsigned int inflight[2])
{ {
int cpu; int cpu;
...@@ -676,11 +673,23 @@ static int exact_lock(dev_t devt, void *data) ...@@ -676,11 +673,23 @@ static int exact_lock(dev_t devt, void *data)
return 0; return 0;
} }
/*
 * Request a partition scan of @disk.
 *
 * Flags the disk with GD_NEED_PART_SCAN and then opens and immediately
 * releases the whole device read-only.  A failed open is silently
 * ignored.
 *
 * NOTE(review): the open is presumably what performs the actual scan
 * when GD_NEED_PART_SCAN is set -- that code is not visible here.
 */
static void disk_scan_partitions(struct gendisk *disk)
{
	struct block_device *whole;

	/* Nothing to scan on a zero-capacity disk. */
	if (!get_capacity(disk))
		return;

	/* Partition scanning is not enabled for this disk. */
	if (!disk_part_scan_enabled(disk))
		return;

	set_bit(GD_NEED_PART_SCAN, &disk->state);

	whole = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL);
	if (IS_ERR(whole))
		return;

	blkdev_put(whole, FMODE_READ);
}
static void register_disk(struct device *parent, struct gendisk *disk, static void register_disk(struct device *parent, struct gendisk *disk,
const struct attribute_group **groups) const struct attribute_group **groups)
{ {
struct device *ddev = disk_to_dev(disk); struct device *ddev = disk_to_dev(disk);
struct block_device *bdev;
struct disk_part_iter piter; struct disk_part_iter piter;
struct hd_struct *part; struct hd_struct *part;
int err; int err;
...@@ -722,25 +731,8 @@ static void register_disk(struct device *parent, struct gendisk *disk, ...@@ -722,25 +731,8 @@ static void register_disk(struct device *parent, struct gendisk *disk,
return; return;
} }
/* No minors to use for partitions */ disk_scan_partitions(disk);
if (!disk_part_scan_enabled(disk))
goto exit;
/* No such device (e.g., media were just removed) */
if (!get_capacity(disk))
goto exit;
bdev = bdget_disk(disk, 0);
if (!bdev)
goto exit;
bdev->bd_invalidated = 1;
err = blkdev_get(bdev, FMODE_READ, NULL);
if (err < 0)
goto exit;
blkdev_put(bdev, FMODE_READ);
exit:
/* announce disk after possible partitions are created */ /* announce disk after possible partitions are created */
dev_set_uevent_suppress(ddev, 0); dev_set_uevent_suppress(ddev, 0);
kobject_uevent(&ddev->kobj, KOBJ_ADD); kobject_uevent(&ddev->kobj, KOBJ_ADD);
...@@ -913,7 +905,7 @@ void del_gendisk(struct gendisk *disk) ...@@ -913,7 +905,7 @@ void del_gendisk(struct gendisk *disk)
DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
while ((part = disk_part_iter_next(&piter))) { while ((part = disk_part_iter_next(&piter))) {
invalidate_partition(disk, part->partno); invalidate_partition(disk, part->partno);
delete_partition(disk, part); delete_partition(part);
} }
disk_part_iter_exit(&piter); disk_part_iter_exit(&piter);
...@@ -1301,7 +1293,7 @@ ssize_t part_stat_show(struct device *dev, ...@@ -1301,7 +1293,7 @@ ssize_t part_stat_show(struct device *dev,
if (queue_is_mq(q)) if (queue_is_mq(q))
inflight = blk_mq_in_flight(q, p); inflight = blk_mq_in_flight(q, p);
else else
inflight = part_in_flight(q, p); inflight = part_in_flight(p);
return sprintf(buf, return sprintf(buf,
"%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u "
...@@ -1343,7 +1335,7 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, ...@@ -1343,7 +1335,7 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
if (queue_is_mq(q)) if (queue_is_mq(q))
blk_mq_in_flight_rw(q, p, inflight); blk_mq_in_flight_rw(q, p, inflight);
else else
part_in_flight_rw(q, p, inflight); part_in_flight_rw(p, inflight);
return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]);
} }
...@@ -1623,7 +1615,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) ...@@ -1623,7 +1615,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
if (queue_is_mq(gp->queue)) if (queue_is_mq(gp->queue))
inflight = blk_mq_in_flight(gp->queue, hd); inflight = blk_mq_in_flight(gp->queue, hd);
else else
inflight = part_in_flight(gp->queue, hd); inflight = part_in_flight(hd);
seq_printf(seqf, "%4d %7d %s " seq_printf(seqf, "%4d %7d %s "
"%lu %lu %lu %u " "%lu %lu %lu %u "
...@@ -1729,45 +1721,48 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) ...@@ -1729,45 +1721,48 @@ struct gendisk *__alloc_disk_node(int minors, int node_id)
} }
disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
if (disk) { if (!disk)
disk->part0.dkstats = alloc_percpu(struct disk_stats); return NULL;
if (!disk->part0.dkstats) {
kfree(disk);
return NULL;
}
init_rwsem(&disk->lookup_sem);
disk->node_id = node_id;
if (disk_expand_part_tbl(disk, 0)) {
free_percpu(disk->part0.dkstats);
kfree(disk);
return NULL;
}
ptbl = rcu_dereference_protected(disk->part_tbl, 1);
rcu_assign_pointer(ptbl->part[0], &disk->part0);
/* disk->part0.dkstats = alloc_percpu(struct disk_stats);
* set_capacity() and get_capacity() currently don't use if (!disk->part0.dkstats)
* seqcounter to read/update the part0->nr_sects. Still init goto out_free_disk;
* the counter as we can read the sectors in IO submission
* patch using seqence counters.
*
* TODO: Ideally set_capacity() and get_capacity() should be
* converted to make use of bd_mutex and sequence counters.
*/
hd_sects_seq_init(&disk->part0);
if (hd_ref_init(&disk->part0)) {
hd_free_part(&disk->part0);
kfree(disk);
return NULL;
}
disk->minors = minors; init_rwsem(&disk->lookup_sem);
rand_initialize_disk(disk); disk->node_id = node_id;
disk_to_dev(disk)->class = &block_class; if (disk_expand_part_tbl(disk, 0)) {
disk_to_dev(disk)->type = &disk_type; free_percpu(disk->part0.dkstats);
device_initialize(disk_to_dev(disk)); goto out_free_disk;
} }
ptbl = rcu_dereference_protected(disk->part_tbl, 1);
rcu_assign_pointer(ptbl->part[0], &disk->part0);
/*
* set_capacity() and get_capacity() currently don't use
* seqcounter to read/update the part0->nr_sects. Still init
* the counter as we can read the sectors in IO submission
* patch using seqence counters.
*
* TODO: Ideally set_capacity() and get_capacity() should be
* converted to make use of bd_mutex and sequence counters.
*/
hd_sects_seq_init(&disk->part0);
if (hd_ref_init(&disk->part0))
goto out_free_part0;
disk->minors = minors;
rand_initialize_disk(disk);
disk_to_dev(disk)->class = &block_class;
disk_to_dev(disk)->type = &disk_type;
device_initialize(disk_to_dev(disk));
return disk; return disk;
out_free_part0:
hd_free_part(&disk->part0);
out_free_disk:
kfree(disk);
return NULL;
} }
EXPORT_SYMBOL(__alloc_disk_node); EXPORT_SYMBOL(__alloc_disk_node);
...@@ -2052,7 +2047,7 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask) ...@@ -2052,7 +2047,7 @@ void disk_flush_events(struct gendisk *disk, unsigned int mask)
* CONTEXT: * CONTEXT:
* Might sleep. * Might sleep.
*/ */
unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
{ {
struct disk_events *ev = disk->ev; struct disk_events *ev = disk->ev;
unsigned int pending; unsigned int pending;
...@@ -2090,6 +2085,33 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) ...@@ -2090,6 +2085,33 @@ unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
return pending; return pending;
} }
/**
 * bdev_check_media_change - check whether removable media has been changed
 * @bdev: block device to check
 *
 * Collects and clears the pending media-change and eject-request events
 * of the disk backing @bdev.  If a media change was pending, attempts to
 * free all dentries and inodes and to invalidate all block device page
 * cache entries through __invalidate_device(), logging a warning when
 * that call reports a non-zero result, and flags the disk with
 * GD_NEED_PART_SCAN.
 *
 * Returns %true if the media changed, or %false if not.
 */
bool bdev_check_media_change(struct block_device *bdev)
{
	unsigned int pending;

	pending = disk_clear_events(bdev->bd_disk,
				    DISK_EVENT_MEDIA_CHANGE |
				    DISK_EVENT_EJECT_REQUEST);

	if (pending & DISK_EVENT_MEDIA_CHANGE) {
		if (__invalidate_device(bdev, true))
			pr_warn("VFS: busy inodes on changed media %s\n",
				bdev->bd_disk->disk_name);

		/* Mark the disk so that its partitions get rescanned. */
		set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
		return true;
	}

	return false;
}
EXPORT_SYMBOL(bdev_check_media_change);
/* /*
* Separate this part out so that a different pointer for clearing_ptr can be * Separate this part out so that a different pointer for clearing_ptr can be
* passed in for disk_clear_events. * passed in for disk_clear_events.
......
This diff is collapsed.
...@@ -69,7 +69,7 @@ int ioprio_check_cap(int ioprio) ...@@ -69,7 +69,7 @@ int ioprio_check_cap(int ioprio)
switch (class) { switch (class) {
case IOPRIO_CLASS_RT: case IOPRIO_CLASS_RT:
if (!capable(CAP_SYS_ADMIN)) if (!capable(CAP_SYS_NICE) && !capable(CAP_SYS_ADMIN))
return -EPERM; return -EPERM;
fallthrough; fallthrough;
/* rt has prio field too */ /* rt has prio field too */
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -37,8 +37,6 @@ const unsigned char scsi_command_size_tbl[8] = ...@@ -37,8 +37,6 @@ const unsigned char scsi_command_size_tbl[8] =
}; };
EXPORT_SYMBOL(scsi_command_size_tbl); EXPORT_SYMBOL(scsi_command_size_tbl);
#include <scsi/sg.h>
static int sg_get_version(int __user *p) static int sg_get_version(int __user *p)
{ {
static const int sg_version_num = 30527; static const int sg_version_num = 30527;
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment