Commit caf292ae authored by Linus Torvalds

Merge branch 'for-3.19/core' of git://git.kernel.dk/linux-block

Pull block driver core update from Jens Axboe:
 "This is the pull request for the core block IO changes for 3.19.  Not
  a huge round this time, mostly lots of little good fixes:

   - Fix a bug in sysfs blktrace interface causing a NULL pointer
     dereference, when enabled/disabled through that API.  From Arianna
     Avanzini.

   - Various updates/fixes/improvements for blk-mq:

        - A set of updates from Bart, mostly fixing bugs in the tag
          handling.

        - Cleanup/code consolidation from Christoph.

        - Extend queue_rq API to be able to handle batching issues of IO
          requests. NVMe will utilize this shortly. From me.

        - A few tag and request handling updates from me.

        - Cleanup of the preempt handling for running queues from Paolo.

        - Prevent running of unmapped hardware queues from Ming Lei.

        - Move the kdump memory limiting check to be in the correct
          location, from Shaohua.

        - Initialize all software queues at init time from Takashi. This
          prevents a kobject warning when CPUs are brought online that
          weren't online when a queue was registered.

   - Single writeback fix for I_DIRTY clearing from Tejun.  Queued with
     the core IO changes, since it's just a single fix.

   - Version X of the __bio_add_page() segment addition retry from
     Maurizio.  Hope the Xth time is the charm.

   - Documentation fixup for IO scheduler merging from Jan.

   - Introduce (and use) generic IO stat accounting helpers for non-rq
     drivers, from Gu Zheng.

   - Kill off artificial limiting of max sectors in a request from
     Christoph"

* 'for-3.19/core' of git://git.kernel.dk/linux-block: (26 commits)
  bio: modify __bio_add_page() to accept pages that don't start a new segment
  blk-mq: Fix uninitialized kobject at CPU hotplugging
  blktrace: don't let the sysfs interface remove trace from running list
  blk-mq: Use all available hardware queues
  blk-mq: Micro-optimize bt_get()
  blk-mq: Fix a race between bt_clear_tag() and bt_get()
  blk-mq: Avoid that __bt_get_word() wraps multiple times
  blk-mq: Fix a use-after-free
  blk-mq: prevent unmapped hw queue from being scheduled
  blk-mq: re-check for available tags after running the hardware queue
  blk-mq: fix hang in bt_get()
  blk-mq: move the kdump check to blk_mq_alloc_tag_set
  blk-mq: cleanup tag free handling
  blk-mq: use 'nr_cpu_ids' as highest CPU ID count for hwq <-> cpu map
  blk: introduce generic io stat accounting help function
  blk-mq: handle the single queue case in blk_mq_hctx_next_cpu
  genhd: check for int overflow in disk_expand_part_tbl()
  blk-mq: add blk_mq_free_hctx_request()
  blk-mq: export blk_mq_free_request()
  blk-mq: use get_cpu/put_cpu instead of preempt_disable/preempt_enable
  ...
parents 8f4385d5 fcbf6a08
...@@ -942,7 +942,11 @@ elevator_allow_merge_fn called whenever the block layer determines ...@@ -942,7 +942,11 @@ elevator_allow_merge_fn called whenever the block layer determines
request safely. The io scheduler may still request safely. The io scheduler may still
want to stop a merge at this point if it want to stop a merge at this point if it
results in some sort of conflict internally, results in some sort of conflict internally,
this hook allows it to do that. this hook allows it to do that. Note however
that two *requests* can still be merged at later
time. Currently the io scheduler has no way to
prevent that. It can only learn about the fact
from elevator_merge_req_fn callback.
elevator_dispatch_fn* fills the dispatch queue with ready requests. elevator_dispatch_fn* fills the dispatch queue with ready requests.
I/O schedulers are free to postpone requests by I/O schedulers are free to postpone requests by
......
...@@ -748,6 +748,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page ...@@ -748,6 +748,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
} }
} }
bio->bi_iter.bi_size += len;
goto done; goto done;
} }
...@@ -764,28 +765,31 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page ...@@ -764,28 +765,31 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
return 0; return 0;
/* /*
* we might lose a segment or two here, but rather that than * setup the new entry, we might clear it again later if we
* make this too complex. * cannot add the page
*/
bvec = &bio->bi_io_vec[bio->bi_vcnt];
bvec->bv_page = page;
bvec->bv_len = len;
bvec->bv_offset = offset;
bio->bi_vcnt++;
bio->bi_phys_segments++;
bio->bi_iter.bi_size += len;
/*
* Perform a recount if the number of segments is greater
* than queue_max_segments(q).
*/ */
while (bio->bi_phys_segments >= queue_max_segments(q)) { while (bio->bi_phys_segments > queue_max_segments(q)) {
if (retried_segments) if (retried_segments)
return 0; goto failed;
retried_segments = 1; retried_segments = 1;
blk_recount_segments(q, bio); blk_recount_segments(q, bio);
} }
/*
* setup the new entry, we might clear it again later if we
* cannot add the page
*/
bvec = &bio->bi_io_vec[bio->bi_vcnt];
bvec->bv_page = page;
bvec->bv_len = len;
bvec->bv_offset = offset;
/* /*
* if queue has other restrictions (eg varying max sector size * if queue has other restrictions (eg varying max sector size
* depending on offset), it can specify a merge_bvec_fn in the * depending on offset), it can specify a merge_bvec_fn in the
...@@ -795,7 +799,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page ...@@ -795,7 +799,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
struct bvec_merge_data bvm = { struct bvec_merge_data bvm = {
.bi_bdev = bio->bi_bdev, .bi_bdev = bio->bi_bdev,
.bi_sector = bio->bi_iter.bi_sector, .bi_sector = bio->bi_iter.bi_sector,
.bi_size = bio->bi_iter.bi_size, .bi_size = bio->bi_iter.bi_size - len,
.bi_rw = bio->bi_rw, .bi_rw = bio->bi_rw,
}; };
...@@ -803,23 +807,25 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page ...@@ -803,23 +807,25 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
* merge_bvec_fn() returns number of bytes it can accept * merge_bvec_fn() returns number of bytes it can accept
* at this offset * at this offset
*/ */
if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len)
bvec->bv_page = NULL; goto failed;
bvec->bv_len = 0;
bvec->bv_offset = 0;
return 0;
}
} }
/* If we may be able to merge these biovecs, force a recount */ /* If we may be able to merge these biovecs, force a recount */
if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
bio->bi_flags &= ~(1 << BIO_SEG_VALID); bio->bi_flags &= ~(1 << BIO_SEG_VALID);
bio->bi_vcnt++;
bio->bi_phys_segments++;
done: done:
bio->bi_iter.bi_size += len;
return len; return len;
failed:
bvec->bv_page = NULL;
bvec->bv_len = 0;
bvec->bv_offset = 0;
bio->bi_vcnt--;
bio->bi_iter.bi_size -= len;
blk_recount_segments(q, bio);
return 0;
} }
/** /**
...@@ -1739,6 +1745,34 @@ void bio_check_pages_dirty(struct bio *bio) ...@@ -1739,6 +1745,34 @@ void bio_check_pages_dirty(struct bio *bio)
} }
} }
void generic_start_io_acct(int rw, unsigned long sectors,
struct hd_struct *part)
{
int cpu = part_stat_lock();
part_round_stats(cpu, part);
part_stat_inc(cpu, part, ios[rw]);
part_stat_add(cpu, part, sectors[rw], sectors);
part_inc_in_flight(part, rw);
part_stat_unlock();
}
EXPORT_SYMBOL(generic_start_io_acct);
void generic_end_io_acct(int rw, struct hd_struct *part,
unsigned long start_time)
{
unsigned long duration = jiffies - start_time;
int cpu = part_stat_lock();
part_stat_add(cpu, part, ticks[rw], duration);
part_round_stats(cpu, part);
part_dec_in_flight(part, rw);
part_stat_unlock();
}
EXPORT_SYMBOL(generic_end_io_acct);
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
void bio_flush_dcache_pages(struct bio *bi) void bio_flush_dcache_pages(struct bio *bi)
{ {
......
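
The pair of helpers added above gives bio-based drivers (those that do not go through the request layer) a shared way to update the in-flight and per-partition I/O statistics. A minimal usage sketch for a hypothetical make_request-style driver is shown below; struct my_dev, dev->gd and my_dev_handle_bio() are illustrative names, not part of the patch:

	/* Hypothetical bio-based driver: wrap bio processing in the new accounting helpers. */
	static void my_make_request(struct request_queue *q, struct bio *bio)
	{
		struct my_dev *dev = q->queuedata;	/* illustrative driver data */
		int rw = bio_data_dir(bio);
		unsigned long start = jiffies;

		generic_start_io_acct(rw, bio_sectors(bio), &dev->gd->part0);
		my_dev_handle_bio(dev, bio);		/* illustrative: do the actual I/O */
		generic_end_io_acct(rw, &dev->gd->part0, start);
	}
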
...@@ -525,6 +525,9 @@ void blk_cleanup_queue(struct request_queue *q) ...@@ -525,6 +525,9 @@ void blk_cleanup_queue(struct request_queue *q)
del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
blk_sync_queue(q); blk_sync_queue(q);
if (q->mq_ops)
blk_mq_free_queue(q);
spin_lock_irq(lock); spin_lock_irq(lock);
if (q->queue_lock != &q->__queue_lock) if (q->queue_lock != &q->__queue_lock)
q->queue_lock = &q->__queue_lock; q->queue_lock = &q->__queue_lock;
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
const int cpu) const int cpu)
{ {
return cpu / ((nr_cpus + nr_queues - 1) / nr_queues); return cpu * nr_queues / nr_cpus;
} }
static int get_first_sibling(unsigned int cpu) static int get_first_sibling(unsigned int cpu)
...@@ -90,7 +90,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) ...@@ -90,7 +90,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
unsigned int *map; unsigned int *map;
/* If cpus are offline, map them to first hctx */ /* If cpus are offline, map them to first hctx */
map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL,
set->numa_node); set->numa_node);
if (!map) if (!map)
return NULL; return NULL;
......
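
To see the effect of the new mapping, take nr_cpus = 6 and nr_queues = 4: the old formula cpu / ((6 + 4 - 1) / 4) = cpu / 2 maps the CPUs to queues {0, 0, 1, 1, 2, 2} and never uses queue 3, while the new cpu * 4 / 6 yields {0, 0, 1, 2, 2, 3}, spreading the CPUs over all four hardware queues (the "blk-mq: Use all available hardware queues" change in the shortlog above).
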
...@@ -390,16 +390,15 @@ static void blk_mq_sysfs_init(struct request_queue *q) ...@@ -390,16 +390,15 @@ static void blk_mq_sysfs_init(struct request_queue *q)
{ {
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx; struct blk_mq_ctx *ctx;
int i, j; int i;
kobject_init(&q->mq_kobj, &blk_mq_ktype); kobject_init(&q->mq_kobj, &blk_mq_ktype);
queue_for_each_hw_ctx(q, hctx, i) { queue_for_each_hw_ctx(q, hctx, i)
kobject_init(&hctx->kobj, &blk_mq_hw_ktype); kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
hctx_for_each_ctx(hctx, ctx, j) queue_for_each_ctx(q, ctx, i)
kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); kobject_init(&ctx->kobj, &blk_mq_ctx_ktype);
}
} }
/* see blk_register_queue() */ /* see blk_register_queue() */
......
...@@ -137,6 +137,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, ...@@ -137,6 +137,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
{ {
int tag, org_last_tag, end; int tag, org_last_tag, end;
bool wrap = last_tag != 0;
org_last_tag = last_tag; org_last_tag = last_tag;
end = bm->depth; end = bm->depth;
...@@ -148,15 +149,16 @@ static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) ...@@ -148,15 +149,16 @@ static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag)
* We started with an offset, start from 0 to * We started with an offset, start from 0 to
* exhaust the map. * exhaust the map.
*/ */
if (org_last_tag && last_tag) { if (wrap) {
end = last_tag; wrap = false;
end = org_last_tag;
last_tag = 0; last_tag = 0;
goto restart; goto restart;
} }
return -1; return -1;
} }
last_tag = tag + 1; last_tag = tag + 1;
} while (test_and_set_bit_lock(tag, &bm->word)); } while (test_and_set_bit(tag, &bm->word));
return tag; return tag;
} }
...@@ -246,14 +248,29 @@ static int bt_get(struct blk_mq_alloc_data *data, ...@@ -246,14 +248,29 @@ static int bt_get(struct blk_mq_alloc_data *data,
if (!(data->gfp & __GFP_WAIT)) if (!(data->gfp & __GFP_WAIT))
return -1; return -1;
bs = bt_wait_ptr(bt, hctx);
do { do {
bs = bt_wait_ptr(bt, hctx);
prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE);
tag = __bt_get(hctx, bt, last_tag); tag = __bt_get(hctx, bt, last_tag);
if (tag != -1) if (tag != -1)
break; break;
/*
* We're out of tags on this hardware queue, kick any
* pending IO submits before going to sleep waiting for
* some to complete.
*/
blk_mq_run_hw_queue(hctx, false);
/*
* Retry tag allocation after running the hardware queue,
* as running the queue may also have found completions.
*/
tag = __bt_get(hctx, bt, last_tag);
if (tag != -1)
break;
blk_mq_put_ctx(data->ctx); blk_mq_put_ctx(data->ctx);
io_schedule(); io_schedule();
...@@ -268,8 +285,6 @@ static int bt_get(struct blk_mq_alloc_data *data, ...@@ -268,8 +285,6 @@ static int bt_get(struct blk_mq_alloc_data *data,
hctx = data->hctx; hctx = data->hctx;
bt = &hctx->tags->bitmap_tags; bt = &hctx->tags->bitmap_tags;
} }
finish_wait(&bs->wait, &wait);
bs = bt_wait_ptr(bt, hctx);
} while (1); } while (1);
finish_wait(&bs->wait, &wait); finish_wait(&bs->wait, &wait);
...@@ -340,11 +355,10 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) ...@@ -340,11 +355,10 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
struct bt_wait_state *bs; struct bt_wait_state *bs;
int wait_cnt; int wait_cnt;
/* clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word);
* The unlock memory barrier need to order access to req in free
* path and clearing tag bit /* Ensure that the wait list checks occur after clear_bit(). */
*/ smp_mb();
clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word);
bs = bt_wake_ptr(bt); bs = bt_wake_ptr(bt);
if (!bs) if (!bs)
...@@ -360,21 +374,6 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) ...@@ -360,21 +374,6 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag)
} }
} }
static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag)
{
BUG_ON(tag >= tags->nr_tags);
bt_clear_tag(&tags->bitmap_tags, tag);
}
static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags,
unsigned int tag)
{
BUG_ON(tag >= tags->nr_reserved_tags);
bt_clear_tag(&tags->breserved_tags, tag);
}
void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
unsigned int *last_tag) unsigned int *last_tag)
{ {
...@@ -383,10 +382,13 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, ...@@ -383,10 +382,13 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag,
if (tag >= tags->nr_reserved_tags) { if (tag >= tags->nr_reserved_tags) {
const int real_tag = tag - tags->nr_reserved_tags; const int real_tag = tag - tags->nr_reserved_tags;
__blk_mq_put_tag(tags, real_tag); BUG_ON(real_tag >= tags->nr_tags);
bt_clear_tag(&tags->bitmap_tags, real_tag);
*last_tag = real_tag; *last_tag = real_tag;
} else } else {
__blk_mq_put_reserved_tag(tags, tag); BUG_ON(tag >= tags->nr_reserved_tags);
bt_clear_tag(&tags->breserved_tags, tag);
}
} }
static void bt_for_each(struct blk_mq_hw_ctx *hctx, static void bt_for_each(struct blk_mq_hw_ctx *hctx,
......
...@@ -279,17 +279,25 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, ...@@ -279,17 +279,25 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
blk_mq_queue_exit(q); blk_mq_queue_exit(q);
} }
void blk_mq_free_request(struct request *rq) void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
{ {
struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_ctx *ctx = rq->mq_ctx;
struct blk_mq_hw_ctx *hctx;
struct request_queue *q = rq->q;
ctx->rq_completed[rq_is_sync(rq)]++; ctx->rq_completed[rq_is_sync(rq)]++;
hctx = q->mq_ops->map_queue(q, ctx->cpu);
__blk_mq_free_request(hctx, ctx, rq); __blk_mq_free_request(hctx, ctx, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request);
void blk_mq_free_request(struct request *rq)
{
struct blk_mq_hw_ctx *hctx;
struct request_queue *q = rq->q;
hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu);
blk_mq_free_hctx_request(hctx, rq);
} }
EXPORT_SYMBOL_GPL(blk_mq_free_request);
inline void __blk_mq_end_request(struct request *rq, int error) inline void __blk_mq_end_request(struct request *rq, int error)
{ {
...@@ -591,7 +599,7 @@ static void blk_mq_rq_timer(unsigned long priv) ...@@ -591,7 +599,7 @@ static void blk_mq_rq_timer(unsigned long priv)
* If not software queues are currently mapped to this * If not software queues are currently mapped to this
* hardware queue, there's nothing to check * hardware queue, there's nothing to check
*/ */
if (!hctx->nr_ctx || !hctx->tags) if (!blk_mq_hw_queue_mapped(hctx))
continue; continue;
blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
...@@ -690,6 +698,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) ...@@ -690,6 +698,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
struct request_queue *q = hctx->queue; struct request_queue *q = hctx->queue;
struct request *rq; struct request *rq;
LIST_HEAD(rq_list); LIST_HEAD(rq_list);
LIST_HEAD(driver_list);
struct list_head *dptr;
int queued; int queued;
WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
...@@ -715,17 +725,28 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) ...@@ -715,17 +725,28 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
spin_unlock(&hctx->lock); spin_unlock(&hctx->lock);
} }
/*
* Start off with dptr being NULL, so we start the first request
* immediately, even if we have more pending.
*/
dptr = NULL;
/* /*
* Now process all the entries, sending them to the driver. * Now process all the entries, sending them to the driver.
*/ */
queued = 0; queued = 0;
while (!list_empty(&rq_list)) { while (!list_empty(&rq_list)) {
struct blk_mq_queue_data bd;
int ret; int ret;
rq = list_first_entry(&rq_list, struct request, queuelist); rq = list_first_entry(&rq_list, struct request, queuelist);
list_del_init(&rq->queuelist); list_del_init(&rq->queuelist);
ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list)); bd.rq = rq;
bd.list = dptr;
bd.last = list_empty(&rq_list);
ret = q->mq_ops->queue_rq(hctx, &bd);
switch (ret) { switch (ret) {
case BLK_MQ_RQ_QUEUE_OK: case BLK_MQ_RQ_QUEUE_OK:
queued++; queued++;
...@@ -744,6 +765,13 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) ...@@ -744,6 +765,13 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
if (ret == BLK_MQ_RQ_QUEUE_BUSY) if (ret == BLK_MQ_RQ_QUEUE_BUSY)
break; break;
/*
* We've done the first request. If we have more than 1
* left in the list, set dptr to defer issue.
*/
if (!dptr && rq_list.next != rq_list.prev)
dptr = &driver_list;
} }
if (!queued) if (!queued)
...@@ -770,10 +798,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) ...@@ -770,10 +798,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
*/ */
static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
{ {
int cpu = hctx->next_cpu; if (hctx->queue->nr_hw_queues == 1)
return WORK_CPU_UNBOUND;
if (--hctx->next_cpu_batch <= 0) { if (--hctx->next_cpu_batch <= 0) {
int next_cpu; int cpu = hctx->next_cpu, next_cpu;
next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
if (next_cpu >= nr_cpu_ids) if (next_cpu >= nr_cpu_ids)
...@@ -781,26 +810,32 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) ...@@ -781,26 +810,32 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
hctx->next_cpu = next_cpu; hctx->next_cpu = next_cpu;
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
return cpu;
} }
return cpu; return hctx->next_cpu;
} }
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{ {
if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) ||
!blk_mq_hw_queue_mapped(hctx)))
return; return;
if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) if (!async) {
__blk_mq_run_hw_queue(hctx); int cpu = get_cpu();
else if (hctx->queue->nr_hw_queues == 1) if (cpumask_test_cpu(cpu, hctx->cpumask)) {
kblockd_schedule_delayed_work(&hctx->run_work, 0); __blk_mq_run_hw_queue(hctx);
else { put_cpu();
unsigned int cpu; return;
}
cpu = blk_mq_hctx_next_cpu(hctx); put_cpu();
kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
} }
kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
&hctx->run_work, 0);
} }
void blk_mq_run_queues(struct request_queue *q, bool async) void blk_mq_run_queues(struct request_queue *q, bool async)
...@@ -814,9 +849,7 @@ void blk_mq_run_queues(struct request_queue *q, bool async) ...@@ -814,9 +849,7 @@ void blk_mq_run_queues(struct request_queue *q, bool async)
test_bit(BLK_MQ_S_STOPPED, &hctx->state)) test_bit(BLK_MQ_S_STOPPED, &hctx->state))
continue; continue;
preempt_disable();
blk_mq_run_hw_queue(hctx, async); blk_mq_run_hw_queue(hctx, async);
preempt_enable();
} }
} }
EXPORT_SYMBOL(blk_mq_run_queues); EXPORT_SYMBOL(blk_mq_run_queues);
...@@ -843,9 +876,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) ...@@ -843,9 +876,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{ {
clear_bit(BLK_MQ_S_STOPPED, &hctx->state); clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
preempt_disable();
blk_mq_run_hw_queue(hctx, false); blk_mq_run_hw_queue(hctx, false);
preempt_enable();
} }
EXPORT_SYMBOL(blk_mq_start_hw_queue); EXPORT_SYMBOL(blk_mq_start_hw_queue);
...@@ -870,9 +901,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) ...@@ -870,9 +901,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
continue; continue;
clear_bit(BLK_MQ_S_STOPPED, &hctx->state); clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
preempt_disable();
blk_mq_run_hw_queue(hctx, async); blk_mq_run_hw_queue(hctx, async);
preempt_enable();
} }
} }
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
...@@ -898,16 +927,11 @@ static void blk_mq_delay_work_fn(struct work_struct *work) ...@@ -898,16 +927,11 @@ static void blk_mq_delay_work_fn(struct work_struct *work)
void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{ {
unsigned long tmo = msecs_to_jiffies(msecs); if (unlikely(!blk_mq_hw_queue_mapped(hctx)))
return;
if (hctx->queue->nr_hw_queues == 1)
kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
else {
unsigned int cpu;
cpu = blk_mq_hctx_next_cpu(hctx); kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx),
kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo); &hctx->delay_work, msecs_to_jiffies(msecs));
}
} }
EXPORT_SYMBOL(blk_mq_delay_queue); EXPORT_SYMBOL(blk_mq_delay_queue);
...@@ -1162,7 +1186,17 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) ...@@ -1162,7 +1186,17 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
goto run_queue; goto run_queue;
} }
if (is_sync) { /*
* If the driver supports defer issued based on 'last', then
* queue it up like normal since we can potentially save some
* CPU this way.
*/
if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) {
struct blk_mq_queue_data bd = {
.rq = rq,
.list = NULL,
.last = 1
};
int ret; int ret;
blk_mq_bio_to_request(rq, bio); blk_mq_bio_to_request(rq, bio);
...@@ -1172,7 +1206,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) ...@@ -1172,7 +1206,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
* error (busy), just add it to our list as we previously * error (busy), just add it to our list as we previously
* would have done * would have done
*/ */
ret = q->mq_ops->queue_rq(data.hctx, rq, true); ret = q->mq_ops->queue_rq(data.hctx, &bd);
if (ret == BLK_MQ_RQ_QUEUE_OK) if (ret == BLK_MQ_RQ_QUEUE_OK)
goto done; goto done;
else { else {
...@@ -1784,16 +1818,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) ...@@ -1784,16 +1818,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
if (!ctx) if (!ctx)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
/*
* If a crashdump is active, then we are potentially in a very
* memory constrained environment. Limit us to 1 queue and
* 64 tags to prevent using too much memory.
*/
if (is_kdump_kernel()) {
set->nr_hw_queues = 1;
set->queue_depth = min(64U, set->queue_depth);
}
hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
set->numa_node); set->numa_node);
...@@ -2067,6 +2091,16 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) ...@@ -2067,6 +2091,16 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
set->queue_depth = BLK_MQ_MAX_DEPTH; set->queue_depth = BLK_MQ_MAX_DEPTH;
} }
/*
* If a crashdump is active, then we are potentially in a very
* memory constrained environment. Limit us to 1 queue and
* 64 tags to prevent using too much memory.
*/
if (is_kdump_kernel()) {
set->nr_hw_queues = 1;
set->queue_depth = min(64U, set->queue_depth);
}
set->tags = kmalloc_node(set->nr_hw_queues * set->tags = kmalloc_node(set->nr_hw_queues *
sizeof(struct blk_mq_tags *), sizeof(struct blk_mq_tags *),
GFP_KERNEL, set->numa_node); GFP_KERNEL, set->numa_node);
......
...@@ -115,4 +115,9 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, ...@@ -115,4 +115,9 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
data->hctx = hctx; data->hctx = hctx;
} }
static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
{
return hctx->nr_ctx && hctx->tags;
}
#endif #endif
...@@ -257,9 +257,7 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_ ...@@ -257,9 +257,7 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_
__func__, max_hw_sectors); __func__, max_hw_sectors);
} }
limits->max_hw_sectors = max_hw_sectors; limits->max_sectors = limits->max_hw_sectors = max_hw_sectors;
limits->max_sectors = min_t(unsigned int, max_hw_sectors,
BLK_DEF_MAX_SECTORS);
} }
EXPORT_SYMBOL(blk_limits_max_hw_sectors); EXPORT_SYMBOL(blk_limits_max_hw_sectors);
......
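
For reference: before this change max_sectors was clamped to at most BLK_DEF_MAX_SECTORS, i.e. 1024 sectors (512 KiB with 512-byte sectors), even when the hardware limit was higher; now both max_sectors and max_hw_sectors follow the driver-supplied limit, the BLK_DEF_MAX_SECTORS constant is removed from blkdev.h further down, and aoe, which relied on the old default, requests 1024 sectors explicitly (see the aoeblk hunk below).
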
...@@ -492,17 +492,15 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) ...@@ -492,17 +492,15 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
* Currently, its primary task it to free all the &struct request * Currently, its primary task it to free all the &struct request
* structures that were allocated to the queue and the queue itself. * structures that were allocated to the queue and the queue itself.
* *
* Caveat: * Note:
* Hopefully the low level driver will have finished any * The low level driver must have finished any outstanding requests first
* outstanding requests first... * via blk_cleanup_queue().
**/ **/
static void blk_release_queue(struct kobject *kobj) static void blk_release_queue(struct kobject *kobj)
{ {
struct request_queue *q = struct request_queue *q =
container_of(kobj, struct request_queue, kobj); container_of(kobj, struct request_queue, kobj);
blk_sync_queue(q);
blkcg_exit_queue(q); blkcg_exit_queue(q);
if (q->elevator) { if (q->elevator) {
...@@ -517,9 +515,7 @@ static void blk_release_queue(struct kobject *kobj) ...@@ -517,9 +515,7 @@ static void blk_release_queue(struct kobject *kobj)
if (q->queue_tags) if (q->queue_tags)
__blk_queue_free_tags(q); __blk_queue_free_tags(q);
if (q->mq_ops) if (!q->mq_ops)
blk_mq_free_queue(q);
else
blk_free_flush_queue(q->fq); blk_free_flush_queue(q->fq);
blk_trace_shutdown(q); blk_trace_shutdown(q);
......
...@@ -1070,9 +1070,16 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) ...@@ -1070,9 +1070,16 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
struct disk_part_tbl *old_ptbl = disk->part_tbl; struct disk_part_tbl *old_ptbl = disk->part_tbl;
struct disk_part_tbl *new_ptbl; struct disk_part_tbl *new_ptbl;
int len = old_ptbl ? old_ptbl->len : 0; int len = old_ptbl ? old_ptbl->len : 0;
int target = partno + 1; int i, target;
size_t size; size_t size;
int i;
/*
* check for int overflow, since we can get here from blkpg_ioctl()
* with a user passed 'partno'.
*/
target = partno + 1;
if (target < 0)
return -EINVAL;
/* disk_max_parts() is zero during initialization, ignore if so */ /* disk_max_parts() is zero during initialization, ignore if so */
if (disk_max_parts(disk) && target > disk_max_parts(disk)) if (disk_max_parts(disk) && target > disk_max_parts(disk))
......
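
The added check matters because partno arrives from user space: blkpg_ioctl() can pass partno == INT_MAX, in which case the old target = partno + 1 overflows (in practice wrapping to a negative value), and the size calculation and copy loops that follow would then operate on a bogus length; rejecting a negative target with -EINVAL closes that path.
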
...@@ -395,7 +395,7 @@ aoeblk_gdalloc(void *vp) ...@@ -395,7 +395,7 @@ aoeblk_gdalloc(void *vp)
WARN_ON(d->flags & DEVFL_TKILL); WARN_ON(d->flags & DEVFL_TKILL);
WARN_ON(d->gd); WARN_ON(d->gd);
WARN_ON(d->flags & DEVFL_UP); WARN_ON(d->flags & DEVFL_UP);
blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS); blk_queue_max_hw_sectors(q, 1024);
q->backing_dev_info.name = "aoe"; q->backing_dev_info.name = "aoe";
q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE; q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE;
d->bufpool = mp; d->bufpool = mp;
......
...@@ -3775,9 +3775,10 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, ...@@ -3775,9 +3775,10 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx,
return false; return false;
} }
static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
bool last) const struct blk_mq_queue_data *bd)
{ {
struct request *rq = bd->rq;
int ret; int ret;
if (unlikely(mtip_check_unal_depth(hctx, rq))) if (unlikely(mtip_check_unal_depth(hctx, rq)))
......
...@@ -313,15 +313,15 @@ static void null_request_fn(struct request_queue *q) ...@@ -313,15 +313,15 @@ static void null_request_fn(struct request_queue *q)
} }
} }
static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, static int null_queue_rq(struct blk_mq_hw_ctx *hctx,
bool last) const struct blk_mq_queue_data *bd)
{ {
struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
cmd->rq = rq; cmd->rq = bd->rq;
cmd->nq = hctx->driver_data; cmd->nq = hctx->driver_data;
blk_mq_start_request(rq); blk_mq_start_request(bd->rq);
null_handle_cmd(cmd); null_handle_cmd(cmd);
return BLK_MQ_RQ_QUEUE_OK; return BLK_MQ_RQ_QUEUE_OK;
......
...@@ -159,10 +159,11 @@ static void virtblk_done(struct virtqueue *vq) ...@@ -159,10 +159,11 @@ static void virtblk_done(struct virtqueue *vq)
spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
} }
static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
bool last) const struct blk_mq_queue_data *bd)
{ {
struct virtio_blk *vblk = hctx->queue->queuedata; struct virtio_blk *vblk = hctx->queue->queuedata;
struct request *req = bd->rq;
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
unsigned long flags; unsigned long flags;
unsigned int num; unsigned int num;
...@@ -223,7 +224,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, ...@@ -223,7 +224,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req,
return BLK_MQ_RQ_QUEUE_ERROR; return BLK_MQ_RQ_QUEUE_ERROR;
} }
if (last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
notify = true; notify = true;
spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
......
...@@ -1947,9 +1947,10 @@ static void scsi_mq_done(struct scsi_cmnd *cmd) ...@@ -1947,9 +1947,10 @@ static void scsi_mq_done(struct scsi_cmnd *cmd)
blk_mq_complete_request(cmd->request); blk_mq_complete_request(cmd->request);
} }
static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
bool last) const struct blk_mq_queue_data *bd)
{ {
struct request *req = bd->rq;
struct request_queue *q = req->q; struct request_queue *q = req->q;
struct scsi_device *sdev = q->queuedata; struct scsi_device *sdev = q->queuedata;
struct Scsi_Host *shost = sdev->host; struct Scsi_Host *shost = sdev->host;
......
...@@ -479,12 +479,28 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) ...@@ -479,12 +479,28 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
* write_inode() * write_inode()
*/ */
spin_lock(&inode->i_lock); spin_lock(&inode->i_lock);
/* Clear I_DIRTY_PAGES if we've written out all dirty pages */
if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
inode->i_state &= ~I_DIRTY_PAGES;
dirty = inode->i_state & I_DIRTY; dirty = inode->i_state & I_DIRTY;
inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC); inode->i_state &= ~I_DIRTY;
/*
* Paired with smp_mb() in __mark_inode_dirty(). This allows
* __mark_inode_dirty() to test i_state without grabbing i_lock -
* either they see the I_DIRTY bits cleared or we see the dirtied
* inode.
*
* I_DIRTY_PAGES is always cleared together above even if @mapping
* still has dirty pages. The flag is reinstated after smp_mb() if
* necessary. This guarantees that either __mark_inode_dirty()
* sees clear I_DIRTY_PAGES or we see PAGECACHE_TAG_DIRTY.
*/
smp_mb();
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
inode->i_state |= I_DIRTY_PAGES;
spin_unlock(&inode->i_lock); spin_unlock(&inode->i_lock);
/* Don't write the inode if only I_DIRTY_PAGES was set */ /* Don't write the inode if only I_DIRTY_PAGES was set */
if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
int err = write_inode(inode, wbc); int err = write_inode(inode, wbc);
...@@ -1148,12 +1164,11 @@ void __mark_inode_dirty(struct inode *inode, int flags) ...@@ -1148,12 +1164,11 @@ void __mark_inode_dirty(struct inode *inode, int flags)
} }
/* /*
* make sure that changes are seen by all cpus before we test i_state * Paired with smp_mb() in __writeback_single_inode() for the
* -- mikulas * following lockless i_state test. See there for details.
*/ */
smp_mb(); smp_mb();
/* avoid the locking if we can */
if ((inode->i_state & flags) == flags) if ((inode->i_state & flags) == flags)
return; return;
......
...@@ -443,6 +443,11 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, ...@@ -443,6 +443,11 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int,
extern void bio_set_pages_dirty(struct bio *bio); extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio);
void generic_start_io_acct(int rw, unsigned long sectors,
struct hd_struct *part);
void generic_end_io_acct(int rw, struct hd_struct *part,
unsigned long start_time);
#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" # error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
#endif #endif
......
...@@ -79,7 +79,13 @@ struct blk_mq_tag_set { ...@@ -79,7 +79,13 @@ struct blk_mq_tag_set {
struct list_head tag_list; struct list_head tag_list;
}; };
typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, struct request *, bool); struct blk_mq_queue_data {
struct request *rq;
struct list_head *list;
bool last;
};
typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *);
typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int); typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int); typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
...@@ -140,6 +146,7 @@ enum { ...@@ -140,6 +146,7 @@ enum {
BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_TAG_SHARED = 1 << 1,
BLK_MQ_F_SG_MERGE = 1 << 2, BLK_MQ_F_SG_MERGE = 1 << 2,
BLK_MQ_F_SYSFS_UP = 1 << 3, BLK_MQ_F_SYSFS_UP = 1 << 3,
BLK_MQ_F_DEFER_ISSUE = 1 << 4,
BLK_MQ_S_STOPPED = 0, BLK_MQ_S_STOPPED = 0,
BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_TAG_ACTIVE = 1,
...@@ -162,6 +169,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); ...@@ -162,6 +169,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
void blk_mq_insert_request(struct request *, bool, bool, bool); void blk_mq_insert_request(struct request *, bool, bool, bool);
void blk_mq_run_queues(struct request_queue *q, bool async); void blk_mq_run_queues(struct request_queue *q, bool async);
void blk_mq_free_request(struct request *rq); void blk_mq_free_request(struct request *rq);
void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *, struct request *rq);
bool blk_mq_can_queue(struct blk_mq_hw_ctx *); bool blk_mq_can_queue(struct blk_mq_hw_ctx *);
struct request *blk_mq_alloc_request(struct request_queue *q, int rw, struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
gfp_t gfp, bool reserved); gfp_t gfp, bool reserved);
......
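
Put together, the new contract for ->queue_rq() looks like the sketch below, patterned on the null_blk and virtio_blk conversions earlier in this diff; my_cmd, my_hw_submit() and my_hw_kick() are illustrative stand-ins for driver-specific pieces:

	static int my_queue_rq(struct blk_mq_hw_ctx *hctx,
			       const struct blk_mq_queue_data *bd)
	{
		struct request *rq = bd->rq;
		struct my_cmd *cmd = blk_mq_rq_to_pdu(rq);	/* per-request driver data */

		blk_mq_start_request(rq);
		my_hw_submit(hctx->driver_data, cmd);		/* illustrative: queue to hardware */

		/*
		 * bd->last is false while more requests from the same batch are
		 * about to be issued; a driver may hold off its doorbell/kick
		 * until bd->last is true, as virtio_blk does above.
		 */
		if (bd->last)
			my_hw_kick(hctx->driver_data);		/* illustrative */

		return BLK_MQ_RQ_QUEUE_OK;
	}
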
...@@ -1184,7 +1184,6 @@ extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); ...@@ -1184,7 +1184,6 @@ extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
enum blk_default_limits { enum blk_default_limits {
BLK_MAX_SEGMENTS = 128, BLK_MAX_SEGMENTS = 128,
BLK_SAFE_MAX_SECTORS = 255, BLK_SAFE_MAX_SECTORS = 255,
BLK_DEF_MAX_SECTORS = 1024,
BLK_MAX_SEGMENT_SIZE = 65536, BLK_MAX_SEGMENT_SIZE = 65536,
BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL,
}; };
......
...@@ -1477,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q) ...@@ -1477,9 +1477,6 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (atomic_dec_and_test(&blk_probes_ref)) if (atomic_dec_and_test(&blk_probes_ref))
blk_unregister_tracepoints(); blk_unregister_tracepoints();
spin_lock_irq(&running_trace_lock);
list_del(&bt->running_list);
spin_unlock_irq(&running_trace_lock);
blk_trace_free(bt); blk_trace_free(bt);
return 0; return 0;
} }
......