Commit 1521dc24 authored by Jens Axboe

Merge tag 'nvme-6.10-2024-05-29' of git://git.infradead.org/nvme into block-6.10

Pull NVMe fixes from Keith:

"nvme fixes for Linux 6.10

 - Removing unused fields (Kanchan)
 - Large folio offsets support (Kundan)
 - Multipath NUMA node initialization fix (Nilay)
 - Multipath IO stats accounting fixes (Keith)
 - Circular lockdep fix (Keith)
 - Target race condition fix (Sagi)
 - Target memory leak fix (Sagi)"

* tag 'nvme-6.10-2024-05-29' of git://git.infradead.org/nvme:
  nvmet: fix a possible leak when destroy a ctrl during qp establishment
  nvme: use srcu for iterating namespace list
  nvme: adjust multiples of NVME_CTRL_PAGE_SIZE in offset
  nvme: remove sgs and sws
  nvmet: fix ns enable/disable possible hang
  nvme-multipath: fix io accounting on failover
  nvme: fix multipath batched completion accounting
  nvme-multipath: find NUMA path only for online numa-node
parents 74d4ce92 c758b77d
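
The common thread in the host-side fixes below is that the per-controller namespace list is now walked under SRCU instead of ctrl->namespaces_rwsem: readers may sleep inside the loop (for example while waiting for queues to freeze) without holding a lock that namespace scanning and removal also take, which is how the circular lockdep report called out above gets resolved. A minimal sketch of the read-side pattern these hunks converge on (illustrative helper, not verbatim kernel code):

	static void nvme_for_each_ns_sketch(struct nvme_ctrl *ctrl)
	{
		struct nvme_ns *ns;
		int srcu_idx;

		/* Readers never block writers; they only delay synchronize_srcu(). */
		srcu_idx = srcu_read_lock(&ctrl->srcu);
		list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
			blk_mq_freeze_queue_wait(ns->queue);	/* sleeping here is fine */
		srcu_read_unlock(&ctrl->srcu, srcu_idx);
	}
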
@@ -414,7 +414,15 @@ static inline void nvme_end_req_zoned(struct request *req)
 	}
 }
 
-static inline void nvme_end_req(struct request *req)
+static inline void __nvme_end_req(struct request *req)
+{
+	nvme_end_req_zoned(req);
+	nvme_trace_bio_complete(req);
+	if (req->cmd_flags & REQ_NVME_MPATH)
+		nvme_mpath_end_request(req);
+}
+
+void nvme_end_req(struct request *req)
 {
 	blk_status_t status = nvme_error_status(nvme_req(req)->status);
@@ -424,10 +432,7 @@ static inline void nvme_end_req(struct request *req)
 		else
 			nvme_log_error(req);
 	}
-	nvme_end_req_zoned(req);
-	nvme_trace_bio_complete(req);
-	if (req->cmd_flags & REQ_NVME_MPATH)
-		nvme_mpath_end_request(req);
+	__nvme_end_req(req);
 	blk_mq_end_request(req, status);
 }
@@ -476,7 +481,7 @@ void nvme_complete_batch_req(struct request *req)
 {
 	trace_nvme_complete_rq(req);
 	nvme_cleanup_cmd(req);
-	nvme_end_req_zoned(req);
+	__nvme_end_req(req);
 }
 EXPORT_SYMBOL_GPL(nvme_complete_batch_req);
@@ -673,7 +678,7 @@ static void nvme_free_ns(struct kref *kref)
 	kfree(ns);
 }
 
-static inline bool nvme_get_ns(struct nvme_ns *ns)
+bool nvme_get_ns(struct nvme_ns *ns)
 {
 	return kref_get_unless_zero(&ns->kref);
 }
@@ -3679,9 +3684,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 {
 	struct nvme_ns *ns, *ret = NULL;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		if (ns->head->ns_id == nsid) {
 			if (!nvme_get_ns(ns))
 				continue;
@@ -3691,7 +3697,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 		if (ns->head->ns_id > nsid)
 			break;
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return ret;
 }
 EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
@@ -3705,7 +3711,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
 
 	list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
 		if (tmp->head->ns_id < ns->head->ns_id) {
-			list_add(&ns->list, &tmp->list);
+			list_add_rcu(&ns->list, &tmp->list);
 			return;
 		}
 	}
@@ -3771,17 +3777,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 	if (nvme_update_ns_info(ns, info))
 		goto out_unlink_ns;
 
-	down_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
 	/*
 	 * Ensure that no namespaces are added to the ctrl list after the queues
 	 * are frozen, thereby avoiding a deadlock between scan and reset.
 	 */
 	if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
-		up_write(&ctrl->namespaces_rwsem);
+		mutex_unlock(&ctrl->namespaces_lock);
 		goto out_unlink_ns;
 	}
 	nvme_ns_add_to_ctrl_list(ns);
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_unlock(&ctrl->namespaces_lock);
+	synchronize_srcu(&ctrl->srcu);
 	nvme_get_ctrl(ctrl);
 
 	if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
@@ -3804,9 +3811,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
  out_cleanup_ns_from_list:
 	nvme_put_ctrl(ctrl);
-	down_write(&ctrl->namespaces_rwsem);
-	list_del_init(&ns->list);
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
+	list_del_rcu(&ns->list);
+	mutex_unlock(&ctrl->namespaces_lock);
+	synchronize_srcu(&ctrl->srcu);
  out_unlink_ns:
 	mutex_lock(&ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
@@ -3856,9 +3864,10 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 	nvme_cdev_del(&ns->cdev, &ns->cdev_device);
 	del_gendisk(ns->disk);
 
-	down_write(&ns->ctrl->namespaces_rwsem);
-	list_del_init(&ns->list);
-	up_write(&ns->ctrl->namespaces_rwsem);
+	mutex_lock(&ns->ctrl->namespaces_lock);
+	list_del_rcu(&ns->list);
+	mutex_unlock(&ns->ctrl->namespaces_lock);
+	synchronize_srcu(&ns->ctrl->srcu);
 
 	if (last_path)
 		nvme_mpath_shutdown_disk(ns->head);
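
On the write side the series follows the usual RCU discipline: unpublish the entry under ctrl->namespaces_lock with list_del_rcu(), wait out in-flight readers with synchronize_srcu(), and only then finish tearing the namespace down. Condensed from the two hunks above (illustrative helper, not a verbatim function from the tree):

	static void nvme_unpublish_ns_sketch(struct nvme_ns *ns)
	{
		struct nvme_ctrl *ctrl = ns->ctrl;

		mutex_lock(&ctrl->namespaces_lock);	/* writers serialize on the mutex */
		list_del_rcu(&ns->list);		/* readers may still be walking the list */
		mutex_unlock(&ctrl->namespaces_lock);
		synchronize_srcu(&ctrl->srcu);		/* after this, no reader can still see ns */
		/* the remaining teardown and the final reference drop are now safe */
	}
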
@@ -3948,16 +3957,17 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 	struct nvme_ns *ns, *next;
 	LIST_HEAD(rm_list);
 
-	down_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
 		if (ns->head->ns_id > nsid)
-			list_move_tail(&ns->list, &rm_list);
+			list_splice_init_rcu(&ns->list, &rm_list,
+					     synchronize_rcu);
 	}
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_unlock(&ctrl->namespaces_lock);
+	synchronize_srcu(&ctrl->srcu);
 
 	list_for_each_entry_safe(ns, next, &rm_list, list)
 		nvme_ns_remove(ns);
 }
 
 static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
@@ -4127,9 +4137,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 	/* this is a no-op when called from the controller reset handler */
 	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
 
-	down_write(&ctrl->namespaces_rwsem);
-	list_splice_init(&ctrl->namespaces, &ns_list);
-	up_write(&ctrl->namespaces_rwsem);
+	mutex_lock(&ctrl->namespaces_lock);
+	list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu);
+	mutex_unlock(&ctrl->namespaces_lock);
+	synchronize_srcu(&ctrl->srcu);
 
 	list_for_each_entry_safe(ns, next, &ns_list, list)
 		nvme_ns_remove(ns);
@@ -4577,6 +4588,7 @@ static void nvme_free_ctrl(struct device *dev)
 	key_put(ctrl->tls_key);
 	nvme_free_cels(ctrl);
 	nvme_mpath_uninit(ctrl);
+	cleanup_srcu_struct(&ctrl->srcu);
 	nvme_auth_stop(ctrl);
 	nvme_auth_free(ctrl);
 	__free_page(ctrl->discard_page);
@@ -4609,10 +4621,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	ctrl->passthru_err_log_enabled = false;
 	clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
 	spin_lock_init(&ctrl->lock);
+	mutex_init(&ctrl->namespaces_lock);
+
+	ret = init_srcu_struct(&ctrl->srcu);
+	if (ret)
+		return ret;
+
 	mutex_init(&ctrl->scan_lock);
 	INIT_LIST_HEAD(&ctrl->namespaces);
 	xa_init(&ctrl->cels);
-	init_rwsem(&ctrl->namespaces_rwsem);
 	ctrl->dev = dev;
 	ctrl->ops = ops;
 	ctrl->quirks = quirks;
@@ -4692,6 +4709,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 out:
 	if (ctrl->discard_page)
 		__free_page(ctrl->discard_page);
+	cleanup_srcu_struct(&ctrl->srcu);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
@@ -4700,22 +4718,24 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl);
 void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_mark_disk_dead(ns->disk);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
 
 void nvme_unfreeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_mq_unfreeze_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
 }
 EXPORT_SYMBOL_GPL(nvme_unfreeze);
@@ -4723,14 +4743,15 @@ EXPORT_SYMBOL_GPL(nvme_unfreeze);
 int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
 		if (timeout <= 0)
 			break;
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return timeout;
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
@@ -4738,23 +4759,25 @@ EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
 
 void nvme_wait_freeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_mq_freeze_queue_wait(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_wait_freeze);
 
 void nvme_start_freeze(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
 	set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_freeze_queue_start(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
@@ -4797,11 +4820,12 @@ EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
 
 void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list)
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
 		blk_sync_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
@@ -789,15 +789,15 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
 		bool open_for_write)
 {
 	struct nvme_ns *ns;
-	int ret;
+	int ret, srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
 	if (list_empty(&ctrl->namespaces)) {
 		ret = -ENOTTY;
 		goto out_unlock;
 	}
 
-	ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
+	ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list);
 	if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
 		dev_warn(ctrl->device,
 			"NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n");
@@ -807,15 +807,18 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
 	dev_warn(ctrl->device,
 		"using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n");
-	kref_get(&ns->kref);
-	up_read(&ctrl->namespaces_rwsem);
+	if (!nvme_get_ns(ns)) {
+		ret = -ENXIO;
+		goto out_unlock;
+	}
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 
 	ret = nvme_user_cmd(ctrl, ns, argp, 0, open_for_write);
 	nvme_put_ns(ns);
 	return ret;
 
 out_unlock:
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return ret;
 }
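
Because the lookup above is now lockless, taking the namespace reference has to tolerate racing with teardown: nvme_get_ns() wraps kref_get_unless_zero(), so the ioctl simply fails with -ENXIO if the namespace is already on its way out rather than resurrecting a dying kref. The helper itself is the one-liner made non-static in the core.c hunk earlier (shown again for context):

	bool nvme_get_ns(struct nvme_ns *ns)
	{
		/* returns false once the last reference has already been dropped */
		return kref_get_unless_zero(&ns->kref);
	}
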
@@ -118,7 +118,8 @@ void nvme_failover_req(struct request *req)
 	blk_steal_bios(&ns->head->requeue_list, req);
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 
-	blk_mq_end_request(req, 0);
+	nvme_req(req)->status = 0;
+	nvme_end_req(req);
 	kblockd_schedule_work(&ns->head->requeue_work);
 }
@@ -150,16 +151,17 @@ void nvme_mpath_end_request(struct request *rq)
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		if (!ns->head->disk)
 			continue;
 		kblockd_schedule_work(&ns->head->requeue_work);
 		if (nvme_ctrl_state(ns->ctrl) == NVME_CTRL_LIVE)
 			disk_uevent(ns->head->disk, KOBJ_CHANGE);
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 
 static const char *nvme_ana_state_names[] = {
@@ -193,13 +195,14 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns)
 void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
+	int srcu_idx;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		nvme_mpath_clear_current_path(ns);
 		kblockd_schedule_work(&ns->head->requeue_work);
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 }
 
 void nvme_mpath_revalidate_paths(struct nvme_ns *ns)
@@ -595,7 +598,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
 		int node, srcu_idx;
 
 		srcu_idx = srcu_read_lock(&head->srcu);
-		for_each_node(node)
+		for_each_online_node(node)
 			__nvme_find_path(head, node);
 		srcu_read_unlock(&head->srcu, srcu_idx);
 	}
@@ -680,6 +683,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
 	unsigned *nr_change_groups = data;
 	struct nvme_ns *ns;
+	int srcu_idx;
 
 	dev_dbg(ctrl->device, "ANA group %d: %s.\n",
 			le32_to_cpu(desc->grpid),
@@ -691,8 +695,8 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 	if (!nr_nsids)
 		return 0;
 
-	down_read(&ctrl->namespaces_rwsem);
-	list_for_each_entry(ns, &ctrl->namespaces, list) {
+	srcu_idx = srcu_read_lock(&ctrl->srcu);
+	list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
 		unsigned nsid;
 again:
 		nsid = le32_to_cpu(desc->nsids[n]);
@@ -705,7 +709,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
 		if (ns->head->ns_id > nsid)
 			goto again;
 	}
-	up_read(&ctrl->namespaces_rwsem);
+	srcu_read_unlock(&ctrl->srcu, srcu_idx);
 	return 0;
 }
@@ -282,7 +282,8 @@ struct nvme_ctrl {
 	struct blk_mq_tag_set *tagset;
 	struct blk_mq_tag_set *admin_tagset;
 	struct list_head namespaces;
-	struct rw_semaphore namespaces_rwsem;
+	struct mutex namespaces_lock;
+	struct srcu_struct srcu;
 	struct device ctrl_device;
 	struct device *device;	/* char device */
 #ifdef CONFIG_NVME_HWMON
@@ -471,8 +472,6 @@ struct nvme_ns_head {
 	u8			pi_type;
 	u8			pi_offset;
 	u8			guard_type;
-	u16			sgs;
-	u32			sws;
 #ifdef CONFIG_BLK_DEV_ZONED
 	u64			zsze;
 #endif
@@ -767,6 +766,7 @@ static inline bool nvme_state_terminal(struct nvme_ctrl *ctrl)
 	}
 }
 
+void nvme_end_req(struct request *req);
 void nvme_complete_rq(struct request *req);
 void nvme_complete_batch_req(struct request *req);
@@ -1161,6 +1161,7 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, struct nvme_ns *ns, u32 effects,
 			struct nvme_command *cmd, int status);
 struct nvme_ctrl *nvme_ctrl_from_file(struct file *file);
 struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid);
+bool nvme_get_ns(struct nvme_ns *ns);
 void nvme_put_ns(struct nvme_ns *ns);
 
 static inline bool nvme_multi_css(struct nvme_ctrl *ctrl)
@@ -778,7 +778,8 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
 		struct bio_vec bv = req_bvec(req);
 
 		if (!is_pci_p2pdma_page(bv.bv_page)) {
-			if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
+			if ((bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
+			    bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
 				return nvme_setup_prp_simple(dev, req,
 							     &cmnd->rw, &bv);
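
This matters once a bio_vec can point into a large folio: bv_offset is an offset into the folio, not into a 4 KiB controller page, so comparing the raw offset against NVME_CTRL_PAGE_SIZE * 2 could skip the cheap two-PRP mapping even though the transfer itself fits in two controller pages. Masking recovers the offset within the controller page. A worked example with made-up numbers (NVME_CTRL_PAGE_SIZE == 4096):

	unsigned int bv_offset = 4196;	/* 4 KiB + 100 B into a large folio */
	unsigned int bv_len    = 4096;	/* one controller page worth of data */

	/* old check:  4196 + 4096 = 8292 > 8192 -> fast path skipped, full PRP setup used */
	/* new check: (4196 & 4095) + 4096 = 4196 <= 8192 -> nvme_setup_prp_simple() is used,
	 *            since the data starts 100 bytes into a controller page and spans at
	 *            most two of them, i.e. two PRP entries are enough.
	 */
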
@@ -676,10 +676,18 @@ static ssize_t nvmet_ns_enable_store(struct config_item *item,
 	if (kstrtobool(page, &enable))
 		return -EINVAL;
 
+	/*
+	 * take a global nvmet_config_sem because the disable routine has a
+	 * window where it releases the subsys-lock, giving a chance to
+	 * a parallel enable to concurrently execute causing the disable to
+	 * have a misaccounting of the ns percpu_ref.
+	 */
+	down_write(&nvmet_config_sem);
 	if (enable)
 		ret = nvmet_ns_enable(ns);
 	else
 		nvmet_ns_disable(ns);
+	up_write(&nvmet_config_sem);
 
 	return ret ? ret : count;
 }
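
The comment spells out the race being closed: nvmet_ns_disable() has a window where it drops the subsystem lock while waiting for outstanding namespace references, so a concurrent enable could run in that window and leave the ns percpu_ref accounting unbalanced. Taking the global nvmet_config_sem around both operations means an enable and a disable of the same namespace can no longer interleave; roughly (annotated restatement of the hunk above, not verbatim):

	down_write(&nvmet_config_sem);		/* global: enable and disable are mutually exclusive */
	if (enable)
		ret = nvmet_ns_enable(ns);
	else
		nvmet_ns_disable(ns);		/* may release subsys->lock internally, but no enable can sneak in */
	up_write(&nvmet_config_sem);
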
@@ -818,6 +818,15 @@ void nvmet_sq_destroy(struct nvmet_sq *sq)
 	percpu_ref_exit(&sq->ref);
 	nvmet_auth_sq_free(sq);
 
+	/*
+	 * we must reference the ctrl again after waiting for inflight IO
+	 * to complete. Because admin connect may have sneaked in after we
+	 * store sq->ctrl locally, but before we killed the percpu_ref. the
+	 * admin connect allocates and assigns sq->ctrl, which now needs a
+	 * final ref put, as this ctrl is going away.
+	 */
+	ctrl = sq->ctrl;
 	if (ctrl) {
 		/*
 		 * The teardown flow may take some time, and the host may not