Commit e0d07225 authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block layer fixes from Jens Axboe:
 "A collection of fixes for this merge window, either fixes for existing
  issues, or parts that were waiting for acks to come in. This pull
  request contains:

   - Allocation of nvme queues on the right node from Shaohua.

     This was ready long before the merge window, but waiting on an ack
     from Bjorn on the PCI bit. Now that we have that, the three patches
     can go in.

   - Two fixes for blk-mq-sched with nvmeof, which uses hctx specific
     request allocations. This caused an oops. One part from Sagi, one
     part from Omar.

   - A loop partition scan deadlock fix from Omar, fixing a regression
     in this merge window.

   - A three-patch series from Keith, closing up a hole on clearing out
     requests on shutdown/resume.

   - A stable fix for nbd from Josef, fixing a leak of sockets.

   - Two fixes for a regression in this window from Jan, fixing a
     problem with one of his earlier patches dealing with queue vs bdi
     life times.

   - A fix for a regression with virtio-blk, causing an IO stall if
     scheduling is used. From me.

   - A fix for an io context lock ordering problem. From me"

* 'for-linus' of git://git.kernel.dk/linux-block:
  block: Move bdi_unregister() to del_gendisk()
  blk-mq: ensure that bd->last is always set correctly
  block: don't call ioc_exit_icq() with the queue lock held for blk-mq
  block: Initialize bd_bdi on inode initialization
  loop: fix LO_FLAGS_PARTSCAN hang
  nvme: Complete all stuck requests
  blk-mq: Provide freeze queue timeout
  blk-mq: Export blk_mq_freeze_queue_wait
  nbd: stop leaking sockets
  blk-mq: move update of tags->rqs to __blk_mq_alloc_request()
  blk-mq: kill blk_mq_set_alloc_data()
  blk-mq: make blk_mq_alloc_request_hctx() allocate a scheduler request
  blk-mq-sched: Allocate sched reserved tags as specified in the original queue tagset
  nvme: allocate nvme_queue in correct node
  PCI: add an API to get node from vector
  blk-mq: allocate blk_mq_tags and requests in correct node
parents 1827adb1 165a5e22

--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -578,7 +578,6 @@ void blk_cleanup_queue(struct request_queue *q)
 	q->queue_lock = &q->__queue_lock;
 	spin_unlock_irq(lock);
 
-	bdi_unregister(q->backing_dev_info);
 	put_disk_devt(q->disk_devt);
 
 	/* @q is and will stay empty, shutdown and put */

--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -37,8 +37,8 @@ static void icq_free_icq_rcu(struct rcu_head *head)
 }
 
 /*
- * Exit an icq. Called with both ioc and q locked for sq, only ioc locked for
- * mq.
+ * Exit an icq. Called with ioc locked for blk-mq, and with both ioc
+ * and queue locked for legacy.
  */
 static void ioc_exit_icq(struct io_cq *icq)
 {
@@ -55,7 +55,10 @@ static void ioc_exit_icq(struct io_cq *icq)
 	icq->flags |= ICQ_EXITED;
 }
 
-/* Release an icq. Called with both ioc and q locked. */
+/*
+ * Release an icq. Called with ioc locked for blk-mq, and with both ioc
+ * and queue locked for legacy.
+ */
 static void ioc_destroy_icq(struct io_cq *icq)
 {
 	struct io_context *ioc = icq->ioc;
@@ -63,7 +66,6 @@ static void ioc_destroy_icq(struct io_cq *icq)
 	struct elevator_type *et = q->elevator->type;
 
 	lockdep_assert_held(&ioc->lock);
-	lockdep_assert_held(q->queue_lock);
 
 	radix_tree_delete(&ioc->icq_tree, icq->q->id);
 	hlist_del_init(&icq->ioc_node);
@@ -223,24 +225,40 @@ void exit_io_context(struct task_struct *task)
 	put_io_context_active(ioc);
 }
 
+static void __ioc_clear_queue(struct list_head *icq_list)
+{
+	unsigned long flags;
+
+	while (!list_empty(icq_list)) {
+		struct io_cq *icq = list_entry(icq_list->next,
+					       struct io_cq, q_node);
+		struct io_context *ioc = icq->ioc;
+
+		spin_lock_irqsave(&ioc->lock, flags);
+		ioc_destroy_icq(icq);
+		spin_unlock_irqrestore(&ioc->lock, flags);
+	}
+}
+
 /**
  * ioc_clear_queue - break any ioc association with the specified queue
  * @q: request_queue being cleared
  *
- * Walk @q->icq_list and exit all io_cq's. Must be called with @q locked.
+ * Walk @q->icq_list and exit all io_cq's.
  */
 void ioc_clear_queue(struct request_queue *q)
 {
-	lockdep_assert_held(q->queue_lock);
+	LIST_HEAD(icq_list);
 
-	while (!list_empty(&q->icq_list)) {
-		struct io_cq *icq = list_entry(q->icq_list.next,
-					       struct io_cq, q_node);
-		struct io_context *ioc = icq->ioc;
+	spin_lock_irq(q->queue_lock);
+	list_splice_init(&q->icq_list, &icq_list);
 
-		spin_lock(&ioc->lock);
-		ioc_destroy_icq(icq);
-		spin_unlock(&ioc->lock);
+	if (q->mq_ops) {
+		spin_unlock_irq(q->queue_lock);
+		__ioc_clear_queue(&icq_list);
+	} else {
+		__ioc_clear_queue(&icq_list);
+		spin_unlock_irq(q->queue_lock);
 	}
 }

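For context, the change above follows a common locking pattern: detach the whole icq list from the queue while holding the queue lock, then tear each entry down with only the per-ioc lock held, so the ioc lock is never nested inside the queue lock on the blk-mq path. A minimal, generic sketch of that pattern (illustrative only; the item type and drain_items() helper below are hypothetical, not kernel API):

#include <linux/list.h>
#include <linux/spinlock.h>

struct item {
        spinlock_t lock;                /* per-item lock, analogous to ioc->lock */
        struct list_head node;          /* linkage on the shared list */
};

/* Drain @list (protected by @list_lock) without nesting the two locks. */
static void drain_items(spinlock_t *list_lock, struct list_head *list)
{
        LIST_HEAD(local);
        struct item *it, *tmp;

        /* Detach everything while the list lock is held... */
        spin_lock_irq(list_lock);
        list_splice_init(list, &local);
        spin_unlock_irq(list_lock);

        /* ...then take each item's own lock with the list lock dropped. */
        list_for_each_entry_safe(it, tmp, &local, node) {
                spin_lock_irq(&it->lock);
                list_del_init(&it->node);
                /* tear the item down here */
                spin_unlock_irq(&it->lock);
        }
}
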
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -110,15 +110,14 @@ struct request *blk_mq_sched_get_request(struct request_queue *q,
 					 struct blk_mq_alloc_data *data)
 {
 	struct elevator_queue *e = q->elevator;
-	struct blk_mq_hw_ctx *hctx;
-	struct blk_mq_ctx *ctx;
 	struct request *rq;
 
 	blk_queue_enter_live(q);
-	ctx = blk_mq_get_ctx(q);
-	hctx = blk_mq_map_queue(q, ctx->cpu);
-
-	blk_mq_set_alloc_data(data, q, data->flags, ctx, hctx);
+	data->q = q;
+	if (likely(!data->ctx))
+		data->ctx = blk_mq_get_ctx(q);
+	if (likely(!data->hctx))
+		data->hctx = blk_mq_map_queue(q, data->ctx->cpu);
 
 	if (e) {
 		data->flags |= BLK_MQ_REQ_INTERNAL;
@@ -135,8 +134,6 @@ struct request *blk_mq_sched_get_request(struct request_queue *q,
 		rq = __blk_mq_alloc_request(data, op);
 	} else {
 		rq = __blk_mq_alloc_request(data, op);
-		if (rq)
-			data->hctx->tags->rqs[rq->tag] = rq;
 	}
 
 	if (rq) {
@@ -454,7 +451,8 @@ int blk_mq_sched_setup(struct request_queue *q)
 	 */
 	ret = 0;
 	queue_for_each_hw_ctx(q, hctx, i) {
-		hctx->sched_tags = blk_mq_alloc_rq_map(set, i, q->nr_requests, 0);
+		hctx->sched_tags = blk_mq_alloc_rq_map(set, i,
+				q->nr_requests, set->reserved_tags);
 		if (!hctx->sched_tags) {
 			ret = -ENOMEM;
 			break;

--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -181,7 +181,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
 void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
 		    struct blk_mq_ctx *ctx, unsigned int tag)
 {
-	if (tag >= tags->nr_reserved_tags) {
+	if (!blk_mq_tag_is_reserved(tags, tag)) {
 		const int real_tag = tag - tags->nr_reserved_tags;
 
 		BUG_ON(real_tag >= tags->nr_tags);

--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -85,4 +85,10 @@ static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
 	hctx->tags->rqs[tag] = rq;
 }
 
+static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
+					  unsigned int tag)
+{
+	return tag < tags->nr_reserved_tags;
+}
+
 #endif

--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -77,10 +77,20 @@ void blk_mq_freeze_queue_start(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start);
 
-static void blk_mq_freeze_queue_wait(struct request_queue *q)
+void blk_mq_freeze_queue_wait(struct request_queue *q)
 {
 	wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter));
 }
+EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait);
+
+int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
+				     unsigned long timeout)
+{
+	return wait_event_timeout(q->mq_freeze_wq,
+					percpu_ref_is_zero(&q->q_usage_counter),
+					timeout);
+}
+EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout);
 
 /*
  * Guarantee no request is in use, so we can change any data structure of
@@ -236,6 +246,7 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
 		}
 		rq->tag = tag;
 		rq->internal_tag = -1;
+		data->hctx->tags->rqs[rq->tag] = rq;
 	}
 
 	blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
@@ -275,10 +286,9 @@ EXPORT_SYMBOL(blk_mq_alloc_request);
 struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
 		unsigned int flags, unsigned int hctx_idx)
 {
-	struct blk_mq_hw_ctx *hctx;
-	struct blk_mq_ctx *ctx;
+	struct blk_mq_alloc_data alloc_data = { .flags = flags };
 	struct request *rq;
-	struct blk_mq_alloc_data alloc_data;
+	unsigned int cpu;
 	int ret;
 
 	/*
@@ -301,25 +311,23 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
 	 * Check if the hardware context is actually mapped to anything.
 	 * If not tell the caller that it should skip this queue.
 	 */
-	hctx = q->queue_hw_ctx[hctx_idx];
-	if (!blk_mq_hw_queue_mapped(hctx)) {
-		ret = -EXDEV;
-		goto out_queue_exit;
-	}
-	ctx = __blk_mq_get_ctx(q, cpumask_first(hctx->cpumask));
-
-	blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
-	rq = __blk_mq_alloc_request(&alloc_data, rw);
-	if (!rq) {
-		ret = -EWOULDBLOCK;
-		goto out_queue_exit;
+	alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
+	if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
+		blk_queue_exit(q);
+		return ERR_PTR(-EXDEV);
 	}
+	cpu = cpumask_first(alloc_data.hctx->cpumask);
+	alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
 
-	return rq;
+	rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
 
-out_queue_exit:
+	blk_mq_put_ctx(alloc_data.ctx);
 	blk_queue_exit(q);
-	return ERR_PTR(ret);
+
+	if (!rq)
+		return ERR_PTR(-EWOULDBLOCK);
+
+	return rq;
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
@@ -854,6 +862,9 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
 		return true;
 	}
 
+	if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
+		data.flags |= BLK_MQ_REQ_RESERVED;
+
 	rq->tag = blk_mq_get_tag(&data);
 	if (rq->tag >= 0) {
 		if (blk_mq_tag_busy(data.hctx)) {
@@ -867,12 +878,9 @@ bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx,
 	return false;
 }
 
-static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
-				  struct request *rq)
+static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
+				    struct request *rq)
 {
-	if (rq->tag == -1 || rq->internal_tag == -1)
-		return;
-
 	blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
 	rq->tag = -1;
 
@@ -882,6 +890,26 @@ static void blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
 	}
 }
 
+static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx,
+				       struct request *rq)
+{
+	if (rq->tag == -1 || rq->internal_tag == -1)
+		return;
+
+	__blk_mq_put_driver_tag(hctx, rq);
+}
+
+static void blk_mq_put_driver_tag(struct request *rq)
+{
+	struct blk_mq_hw_ctx *hctx;
+
+	if (rq->tag == -1 || rq->internal_tag == -1)
+		return;
+
+	hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu);
+	__blk_mq_put_driver_tag(hctx, rq);
+}
+
 /*
  * If we fail getting a driver tag because all the driver tags are already
  * assigned and on the dispatch list, BUT the first entry does not have a
@@ -991,7 +1019,19 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 
 		bd.rq = rq;
 		bd.list = dptr;
-		bd.last = list_empty(list);
+
+		/*
+		 * Flag last if we have no more requests, or if we have more
+		 * but can't assign a driver tag to it.
+		 */
+		if (list_empty(list))
+			bd.last = true;
+		else {
+			struct request *nxt;
+
+			nxt = list_first_entry(list, struct request, queuelist);
+			bd.last = !blk_mq_get_driver_tag(nxt, NULL, false);
+		}
 
 		ret = q->mq_ops->queue_rq(hctx, &bd);
 		switch (ret) {
@@ -999,7 +1039,7 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 			queued++;
 			break;
 		case BLK_MQ_RQ_QUEUE_BUSY:
-			blk_mq_put_driver_tag(hctx, rq);
+			blk_mq_put_driver_tag_hctx(hctx, rq);
 			list_add(&rq->queuelist, list);
 			__blk_mq_requeue_request(rq);
 			break;
@@ -1029,6 +1069,13 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
 	 * that is where we will continue on next queue run.
 	 */
 	if (!list_empty(list)) {
+		/*
+		 * If we got a driver tag for the next request already,
+		 * free it again.
+		 */
+		rq = list_first_entry(list, struct request, queuelist);
+		blk_mq_put_driver_tag(rq);
+
 		spin_lock(&hctx->lock);
 		list_splice_init(list, &hctx->dispatch);
 		spin_unlock(&hctx->lock);
@@ -1715,16 +1762,20 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 					unsigned int reserved_tags)
 {
 	struct blk_mq_tags *tags;
+	int node;
 
-	tags = blk_mq_init_tags(nr_tags, reserved_tags,
-				set->numa_node,
+	node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
+	if (node == NUMA_NO_NODE)
+		node = set->numa_node;
+
+	tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
 				BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
 	if (!tags)
 		return NULL;
 
 	tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
-				 set->numa_node);
+				 node);
 	if (!tags->rqs) {
 		blk_mq_free_tags(tags);
 		return NULL;
@@ -1732,7 +1783,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
 
 	tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *),
 				 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
-				 set->numa_node);
+				 node);
 	if (!tags->static_rqs) {
 		kfree(tags->rqs);
 		blk_mq_free_tags(tags);
@@ -1752,6 +1803,11 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 {
 	unsigned int i, j, entries_per_page, max_order = 4;
 	size_t rq_size, left;
+	int node;
+
+	node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx);
+	if (node == NUMA_NO_NODE)
+		node = set->numa_node;
 
 	INIT_LIST_HEAD(&tags->page_list);
@@ -1773,7 +1829,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		this_order--;
 
 		do {
-			page = alloc_pages_node(set->numa_node,
+			page = alloc_pages_node(node,
 				GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO,
 				this_order);
 			if (page)
@@ -1806,7 +1862,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
 		if (set->ops->init_request) {
 			if (set->ops->init_request(set->driver_data,
 						rq, hctx_idx, i,
-						set->numa_node)) {
+						node)) {
 				tags->static_rqs[i] = NULL;
 				goto fail;
 			}

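The bd.last change above matters because drivers are allowed to batch submissions and only kick the hardware once the block layer marks the last request of a batch; if the flag is never set for the final request, the device is never notified and I/O stalls, which is the virtio-blk regression mentioned in the pull message. A rough sketch of how a driver's ->queue_rq typically consumes the flag; the example_* helpers are hypothetical stand-ins for driver-specific ring handling, not a real driver:

#include <linux/blk-mq.h>

/* hypothetical ring helpers, standing in for driver-specific code */
static bool example_post_to_ring(void *ring, struct request *rq)
{
        return true;    /* pretend the descriptor was posted */
}

static void example_ring_doorbell(void *ring)
{
        /* notify the device that new descriptors are available */
}

static int example_queue_rq(struct blk_mq_hw_ctx *hctx,
                            const struct blk_mq_queue_data *bd)
{
        struct request *rq = bd->rq;

        blk_mq_start_request(rq);

        if (!example_post_to_ring(hctx->driver_data, rq))
                return BLK_MQ_RQ_QUEUE_BUSY;

        /*
         * Only ring the doorbell when the block layer says this is the
         * last request of the batch; earlier requests piggyback on it.
         */
        if (bd->last)
                example_ring_doorbell(hctx->driver_data);

        return BLK_MQ_RQ_QUEUE_OK;
}
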
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -146,16 +146,6 @@ struct blk_mq_alloc_data {
 	struct blk_mq_hw_ctx *hctx;
 };
 
-static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data,
-		struct request_queue *q, unsigned int flags,
-		struct blk_mq_ctx *ctx, struct blk_mq_hw_ctx *hctx)
-{
-	data->q = q;
-	data->flags = flags;
-	data->ctx = ctx;
-	data->hctx = hctx;
-}
-
 static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
 {
 	if (data->flags & BLK_MQ_REQ_INTERNAL)

--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -815,9 +815,7 @@ static void blk_release_queue(struct kobject *kobj)
 	blkcg_exit_queue(q);
 
 	if (q->elevator) {
-		spin_lock_irq(q->queue_lock);
 		ioc_clear_queue(q);
-		spin_unlock_irq(q->queue_lock);
 		elevator_exit(q->elevator);
 	}

--- a/block/elevator.c
+++ b/block/elevator.c
@@ -983,9 +983,7 @@ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
 		if (old_registered)
 			elv_unregister_queue(q);
 
-		spin_lock_irq(q->queue_lock);
 		ioc_clear_queue(q);
-		spin_unlock_irq(q->queue_lock);
 	}
 
 	/* allocate, init and register new elevator */

--- a/block/genhd.c
+++ b/block/genhd.c
@@ -681,6 +681,11 @@ void del_gendisk(struct gendisk *disk)
 	disk->flags &= ~GENHD_FL_UP;
 
 	sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
+	/*
+	 * Unregister bdi before releasing device numbers (as they can get
+	 * reused and we'd get clashes in sysfs).
+	 */
+	bdi_unregister(disk->queue->backing_dev_info);
 	blk_unregister_queue(disk);
 	blk_unregister_region(disk_devt(disk), disk->minors);

...@@ -1142,13 +1142,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) ...@@ -1142,13 +1142,6 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
(info->lo_flags & LO_FLAGS_AUTOCLEAR)) (info->lo_flags & LO_FLAGS_AUTOCLEAR))
lo->lo_flags ^= LO_FLAGS_AUTOCLEAR; lo->lo_flags ^= LO_FLAGS_AUTOCLEAR;
if ((info->lo_flags & LO_FLAGS_PARTSCAN) &&
!(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
lo->lo_flags |= LO_FLAGS_PARTSCAN;
lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
loop_reread_partitions(lo, lo->lo_device);
}
lo->lo_encrypt_key_size = info->lo_encrypt_key_size; lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
lo->lo_init[0] = info->lo_init[0]; lo->lo_init[0] = info->lo_init[0];
lo->lo_init[1] = info->lo_init[1]; lo->lo_init[1] = info->lo_init[1];
...@@ -1163,6 +1156,14 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) ...@@ -1163,6 +1156,14 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
exit: exit:
blk_mq_unfreeze_queue(lo->lo_queue); blk_mq_unfreeze_queue(lo->lo_queue);
if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) &&
!(lo->lo_flags & LO_FLAGS_PARTSCAN)) {
lo->lo_flags |= LO_FLAGS_PARTSCAN;
lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN;
loop_reread_partitions(lo, lo->lo_device);
}
return err; return err;
} }
......
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -675,8 +675,10 @@ static int nbd_clear_sock(struct nbd_device *nbd, struct block_device *bdev)
 	    nbd->num_connections) {
 		int i;
 
-		for (i = 0; i < nbd->num_connections; i++)
+		for (i = 0; i < nbd->num_connections; i++) {
+			sockfd_put(nbd->socks[i]->sock);
 			kfree(nbd->socks[i]);
+		}
 		kfree(nbd->socks);
 		nbd->socks = NULL;
 		nbd->num_connections = 0;

--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2344,6 +2344,53 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_kill_queues);
 
+void nvme_unfreeze(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	mutex_lock(&ctrl->namespaces_mutex);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_mq_unfreeze_queue(ns->queue);
+	mutex_unlock(&ctrl->namespaces_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_unfreeze);
+
+void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
+{
+	struct nvme_ns *ns;
+
+	mutex_lock(&ctrl->namespaces_mutex);
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
+		if (timeout <= 0)
+			break;
+	}
+	mutex_unlock(&ctrl->namespaces_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
+
+void nvme_wait_freeze(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	mutex_lock(&ctrl->namespaces_mutex);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_mq_freeze_queue_wait(ns->queue);
+	mutex_unlock(&ctrl->namespaces_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_wait_freeze);
+
+void nvme_start_freeze(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	mutex_lock(&ctrl->namespaces_mutex);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_mq_freeze_queue_start(ns->queue);
+	mutex_unlock(&ctrl->namespaces_mutex);
+}
+EXPORT_SYMBOL_GPL(nvme_start_freeze);
+
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;

--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -294,6 +294,10 @@ void nvme_queue_async_events(struct nvme_ctrl *ctrl);
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
+void nvme_unfreeze(struct nvme_ctrl *ctrl);
+void nvme_wait_freeze(struct nvme_ctrl *ctrl);
+void nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout);
+void nvme_start_freeze(struct nvme_ctrl *ctrl);
 
 #define NVME_QID_ANY -1
 struct request *nvme_alloc_request(struct request_queue *q,

--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1038,9 +1038,10 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
 }
 
 static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
-							int depth)
+							int depth, int node)
 {
-	struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL);
+	struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
+							node);
 	if (!nvmeq)
 		return NULL;
 
@@ -1217,7 +1218,8 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev)
 	nvmeq = dev->queues[0];
 	if (!nvmeq) {
-		nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
+		nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
+					dev_to_node(dev->dev));
 		if (!nvmeq)
 			return -ENOMEM;
 	}
@@ -1309,7 +1311,9 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
 	int ret = 0;
 
 	for (i = dev->queue_count; i <= dev->max_qid; i++) {
-		if (!nvme_alloc_queue(dev, i, dev->q_depth)) {
+		/* vector == qid - 1, match nvme_create_queue */
+		if (!nvme_alloc_queue(dev, i, dev->q_depth,
+		     pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
 			ret = -ENOMEM;
 			break;
 		}
@@ -1671,21 +1675,34 @@ static void nvme_pci_disable(struct nvme_dev *dev)
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 {
 	int i, queues;
-	u32 csts = -1;
+	bool dead = true;
+	struct pci_dev *pdev = to_pci_dev(dev->dev);
 
 	del_timer_sync(&dev->watchdog_timer);
 
 	mutex_lock(&dev->shutdown_lock);
-	if (pci_is_enabled(to_pci_dev(dev->dev))) {
-		nvme_stop_queues(&dev->ctrl);
-		csts = readl(dev->bar + NVME_REG_CSTS);
+	if (pci_is_enabled(pdev)) {
+		u32 csts = readl(dev->bar + NVME_REG_CSTS);
+
+		if (dev->ctrl.state == NVME_CTRL_LIVE)
+			nvme_start_freeze(&dev->ctrl);
+		dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
+			pdev->error_state != pci_channel_io_normal);
 	}
 
+	/*
+	 * Give the controller a chance to complete all entered requests if
+	 * doing a safe shutdown.
+	 */
+	if (!dead && shutdown)
+		nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
+	nvme_stop_queues(&dev->ctrl);
+
 	queues = dev->online_queues - 1;
 	for (i = dev->queue_count - 1; i > 0; i--)
 		nvme_suspend_queue(dev->queues[i]);
 
-	if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
+	if (dead) {
 		/* A device might become IO incapable very soon during
 		 * probe, before the admin queue is configured. Thus,
 		 * queue_count can be 0 here.
@@ -1700,6 +1717,14 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
 	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
 	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
+
+	/*
+	 * The driver will not be starting up queues again if shutting down so
+	 * must flush all entered requests to their failed completion to avoid
+	 * deadlocking blk-mq hot-cpu notifier.
+	 */
+	if (shutdown)
+		nvme_start_queues(&dev->ctrl);
+
 	mutex_unlock(&dev->shutdown_lock);
 }
@@ -1822,7 +1847,9 @@ static void nvme_reset_work(struct work_struct *work)
 		nvme_remove_namespaces(&dev->ctrl);
 	} else {
 		nvme_start_queues(&dev->ctrl);
+		nvme_wait_freeze(&dev->ctrl);
 		nvme_dev_add(dev);
+		nvme_unfreeze(&dev->ctrl);
 	}
 
 	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {

--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -1298,6 +1298,22 @@ const struct cpumask *pci_irq_get_affinity(struct pci_dev *dev, int nr)
 }
 EXPORT_SYMBOL(pci_irq_get_affinity);
 
+/**
+ * pci_irq_get_node - return the numa node of a particular msi vector
+ * @pdev:	PCI device to operate on
+ * @vec:	device-relative interrupt vector index (0-based).
+ */
+int pci_irq_get_node(struct pci_dev *pdev, int vec)
+{
+	const struct cpumask *mask;
+
+	mask = pci_irq_get_affinity(pdev, vec);
+	if (mask)
+		return local_memory_node(cpu_to_node(cpumask_first(mask)));
+	return dev_to_node(&pdev->dev);
+}
+EXPORT_SYMBOL(pci_irq_get_node);
+
 struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc)
 {
 	return to_pci_dev(desc->dev);

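As a usage note, the new helper lets a driver place per-queue data on the node that services the queue's interrupt vector, which is exactly what the nvme changes above do. A minimal sketch, with a hypothetical example_queue type and example_alloc_queue() helper rather than any real driver structure:

#include <linux/pci.h>
#include <linux/slab.h>

struct example_queue {
        int node;
        /* ring buffers, completion data, ... */
};

static struct example_queue *example_alloc_queue(struct pci_dev *pdev, int vec)
{
        /* Pick the node that serves this vector's affinity mask. */
        int node = pci_irq_get_node(pdev, vec);
        struct example_queue *q;

        q = kzalloc_node(sizeof(*q), GFP_KERNEL, node);
        if (!q)
                return NULL;

        q->node = node;
        return q;
}
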
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -870,6 +870,7 @@ static void init_once(void *foo)
 #ifdef CONFIG_SYSFS
 	INIT_LIST_HEAD(&bdev->bd_holder_disks);
 #endif
+	bdev->bd_bdi = &noop_backing_dev_info;
 	inode_init_once(&ei->vfs_inode);
 	/* Initialize mutex for freeze. */
 	mutex_init(&bdev->bd_fsfreeze_mutex);
@@ -884,8 +885,10 @@ static void bdev_evict_inode(struct inode *inode)
 	spin_lock(&bdev_lock);
 	list_del_init(&bdev->bd_list);
 	spin_unlock(&bdev_lock);
-	if (bdev->bd_bdi != &noop_backing_dev_info)
+	if (bdev->bd_bdi != &noop_backing_dev_info) {
 		bdi_put(bdev->bd_bdi);
+		bdev->bd_bdi = &noop_backing_dev_info;
+	}
 }
 
 static const struct super_operations bdev_sops = {
@@ -988,7 +991,6 @@ struct block_device *bdget(dev_t dev)
 		bdev->bd_contains = NULL;
 		bdev->bd_super = NULL;
 		bdev->bd_inode = inode;
-		bdev->bd_bdi = &noop_backing_dev_info;
 		bdev->bd_block_size = i_blocksize(inode);
 		bdev->bd_part_count = 0;
 		bdev->bd_invalidated = 0;

--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -245,6 +245,9 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 void blk_mq_freeze_queue(struct request_queue *q);
 void blk_mq_unfreeze_queue(struct request_queue *q);
 void blk_mq_freeze_queue_start(struct request_queue *q);
+void blk_mq_freeze_queue_wait(struct request_queue *q);
+int blk_mq_freeze_queue_wait_timeout(struct request_queue *q,
+				     unsigned long timeout);
 int blk_mq_reinit_tagset(struct blk_mq_tag_set *set);
 int blk_mq_map_queues(struct blk_mq_tag_set *set);

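To illustrate the exports declared above: a driver that wants a bounded drain before shutting a device down can pair them as below. This is a sketch only; the example_shutdown() wrapper and the 30-second budget are assumptions rather than anything taken from the patches, though the pattern mirrors what nvme_dev_disable() does with the controller-wide wrappers.

#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/jiffies.h>
#include <linux/printk.h>

/* hypothetical driver shutdown path (not from the patches) */
static void example_shutdown(struct request_queue *q)
{
        /* Stop admitting new requests; q_usage_counter starts draining. */
        blk_mq_freeze_queue_start(q);

        /* Give in-flight requests up to 30 seconds to complete. */
        if (!blk_mq_freeze_queue_wait_timeout(q, 30 * HZ))
                pr_warn("queue did not drain, shutting down anyway\n");

        /* ... quiesce and tear down the hardware here ... */

        /* Drop the freeze reference so the queue can be used again. */
        blk_mq_unfreeze_queue(q);
}
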
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -1323,6 +1323,7 @@ int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
 void pci_free_irq_vectors(struct pci_dev *dev);
 int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
 const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev, int vec);
+int pci_irq_get_node(struct pci_dev *pdev, int vec);
 
 #else
 static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; }
@@ -1370,6 +1371,11 @@ static inline const struct cpumask *pci_irq_get_affinity(struct pci_dev *pdev,
 {
 	return cpu_possible_mask;
 }
+
+static inline int pci_irq_get_node(struct pci_dev *pdev, int vec)
+{
+	return first_online_node;
+}
 #endif
 
 static inline int