Commit 0d0b660f authored by Christoph Hellwig

nvme: add ANA support

Add support for Asymmetric Namespace Access as specified in NVMe 1.3
TP 4004.  With ANA each namespace attached to a controller belongs to an
ANA group that describes the characteristics of accessing the namespaces
through this controller.  In the optimized and non-optimized states
namespaces can be accessed regularly, although in a multi-pathing
environment we should always prefer to access a namespace through a
controller where an optimized relationship exists.  Namespaces in
Inaccessible, Persistent-Loss, or Change state for a given controller
should not be accessed.
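
For a multipath node this yields a simple two-tier path choice, which is
what __nvme_find_path() in the multipath.c hunk below implements: an
optimized path wins outright, a non-optimized path is remembered as a
fallback, and paths in any other state are skipped.  In outline (condensed;
the real function also skips paths with a pending ANA log re-read):

	struct nvme_ns *ns, *fallback = NULL;

	list_for_each_entry_rcu(ns, &head->list, siblings) {
		if (ns->ctrl->state != NVME_CTRL_LIVE)
			continue;
		switch (ns->ana_state) {
		case NVME_ANA_OPTIMIZED:
			return ns;		/* best possible path, use it */
		case NVME_ANA_NONOPTIMIZED:
			fallback = ns;		/* usable, but keep looking */
			break;
		default:
			break;			/* not usable for I/O */
		}
	}
	return fallback;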

The states are updated through reading the ANA log page, which is read
once during controller initialization, whenever the ANA change notice
AEN is received, or when one of the ANA specific status codes that
signal a state change is received on a command.
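
The log page itself is a small header followed by one variable-sized
descriptor per ANA group, each carrying the group's state and the NSIDs
that belong to it; nvme_parse_ana_log() below walks exactly this layout.
For reference, the structures as defined in the companion nvme.h
ANA-definitions patch (layout per TP 4004):

	struct nvme_ana_rsp_hdr {
		__le64	chgcnt;		/* change count */
		__le16	ngrps;		/* number of group descriptors */
		__le16	rsvd10[3];
	};

	struct nvme_ana_group_desc {
		__le32	grpid;
		__le32	nnsids;		/* number of NSIDs in this group */
		__le64	chgcnt;
		__u8	state;
		__u8	rsvd17[15];
		__le32	nsids[];
	};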

The ANA state is kept in the nvme_ns structure, which makes the checks in
the fast path very simple.  Updating the ANA state when reading the log
page is also straightforward; the only downside is that finding the initial
ANA state when scanning for namespaces is a bit cumbersome.
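
Concretely, the per-I/O check reduces to a two-condition test on cached
state (nvme_path_is_optimized() in the multipath.c hunk below):

	static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
	{
		return ns->ctrl->state == NVME_CTRL_LIVE &&
			ns->ana_state == NVME_ANA_OPTIMIZED;
	}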

The gendisk for a ns_head is only registered once a live path for it
exists.  Without that the kernel would hang during partition scanning.
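
nvme_mpath_set_live() below takes care of this: it is called whenever a
path transitions into a live ANA state (optimized or non-optimized), and
registers the ns_head gendisk on the first such transition, keyed off
GENHD_FL_UP.  Condensed (lockdep assert and sysfs group setup omitted):

	static void nvme_mpath_set_live(struct nvme_ns *ns)
	{
		struct nvme_ns_head *head = ns->head;

		if (!head->disk)
			return;
		/* register the ns_head gendisk on the first live path */
		if (!(head->disk->flags & GENHD_FL_UP))
			device_add_disk(&head->subsys->dev, head->disk);
		kblockd_schedule_work(&head->requeue_work);
	}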

Includes fixes and improvements from Hannes Reinecke.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Keith Busch <keith.busch@intel.com>
Reviewed-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Johannes Thumshirn <jthumshirn@suse.de>
parent 8decf5d5
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1035,18 +1035,18 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
 EXPORT_SYMBOL_GPL(nvme_set_queue_count);
 
 #define NVME_AEN_SUPPORTED \
-	(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT)
+	(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE)
 
 static void nvme_enable_aen(struct nvme_ctrl *ctrl)
 {
-	u32 result;
+	u32 supported = ctrl->oaes & NVME_AEN_SUPPORTED, result;
 	int status;
 
-	status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT,
-			ctrl->oaes & NVME_AEN_SUPPORTED, NULL, 0, &result);
+	status = nvme_set_features(ctrl, NVME_FEAT_ASYNC_EVENT, supported, NULL,
+			0, &result);
 	if (status)
 		dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n",
-			 ctrl->oaes & NVME_AEN_SUPPORTED);
+			 supported);
 }
 
 static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
@@ -2370,6 +2370,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 	nvme_set_queue_limits(ctrl, ctrl->admin_q);
 	ctrl->sgls = le32_to_cpu(id->sgls);
 	ctrl->kas = le16_to_cpu(id->kas);
+	ctrl->max_namespaces = le32_to_cpu(id->mnan);
 
 	if (id->rtd3e) {
 		/* us -> s */
@@ -2429,8 +2430,12 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
 		ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);
 	}
 
+	ret = nvme_mpath_init(ctrl, id);
 	kfree(id);
 
+	if (ret < 0)
+		return ret;
+
 	if (ctrl->apst_enabled && !prev_apst_enabled)
 		dev_pm_qos_expose_latency_tolerance(ctrl->device);
 	else if (!ctrl->apst_enabled && prev_apst_enabled)
@@ -2649,6 +2654,10 @@ static struct attribute *nvme_ns_id_attrs[] = {
 	&dev_attr_nguid.attr,
 	&dev_attr_eui.attr,
 	&dev_attr_nsid.attr,
+#ifdef CONFIG_NVME_MULTIPATH
+	&dev_attr_ana_grpid.attr,
+	&dev_attr_ana_state.attr,
+#endif
 	NULL,
 };
@@ -2671,6 +2680,14 @@ static umode_t nvme_ns_id_attrs_are_visible(struct kobject *kobj,
 		if (!memchr_inv(ids->eui64, 0, sizeof(ids->eui64)))
 			return 0;
 	}
+#ifdef CONFIG_NVME_MULTIPATH
+	if (a == &dev_attr_ana_grpid.attr || a == &dev_attr_ana_state.attr) {
+		if (dev_to_disk(dev)->fops != &nvme_fops) /* per-path attr */
+			return 0;
+		if (!nvme_ctrl_use_ana(nvme_get_ns_from_dev(dev)->ctrl))
+			return 0;
+	}
+#endif
 	return a->mode;
 }
@@ -3044,8 +3061,6 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 
 	nvme_get_ctrl(ctrl);
 
-	kfree(id);
-
 	device_add_disk(ctrl->device, ns->disk);
 	if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj,
 					&nvme_ns_id_attr_group))
@@ -3055,8 +3070,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
 		pr_warn("%s: failed to register lightnvm sysfs group for identification\n",
 			ns->disk->disk_name);
 
-	nvme_mpath_add_disk(ns->head);
+	nvme_mpath_add_disk(ns, id);
 	nvme_fault_inject_init(ns);
+	kfree(id);
+
 	return;
  out_unlink_ns:
 	mutex_lock(&ctrl->subsys->lock);
@@ -3364,6 +3381,13 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result)
 	case NVME_AER_NOTICE_FW_ACT_STARTING:
 		queue_work(nvme_wq, &ctrl->fw_act_work);
 		break;
+#ifdef CONFIG_NVME_MULTIPATH
+	case NVME_AER_NOTICE_ANA:
+		if (!ctrl->ana_log_buf)
+			break;
+		queue_work(nvme_wq, &ctrl->ana_work);
+		break;
+#endif
 	default:
 		dev_warn(ctrl->device, "async event result %08x\n", result);
 	}
@@ -3396,6 +3420,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
 
 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
 {
+	nvme_mpath_stop(ctrl);
 	nvme_stop_keep_alive(ctrl);
 	flush_work(&ctrl->async_event_work);
 	flush_work(&ctrl->scan_work);
@@ -3433,6 +3458,7 @@ static void nvme_free_ctrl(struct device *dev)
 	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
 	kfree(ctrl->effects);
+	nvme_mpath_uninit(ctrl);
 
 	if (subsys) {
 		mutex_lock(&subsys->lock);
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Christoph Hellwig.
+ * Copyright (c) 2017-2018 Christoph Hellwig.
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
@@ -20,6 +20,11 @@ module_param(multipath, bool, 0444);
 MODULE_PARM_DESC(multipath,
 	"turn on native support for multiple controllers per subsystem");
 
+inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
+{
+	return multipath && (ctrl->subsys->cmic & (1 << 3));
+}
+
 /*
  * If multipathing is enabled we need to always use the subsystem instance
  * number for numbering our devices to avoid conflicts between subsystems that
@@ -45,6 +50,7 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
+	u16 status = nvme_req(req)->status;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ns->head->requeue_lock, flags);
@@ -52,7 +58,34 @@ void nvme_failover_req(struct request *req)
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 	blk_mq_end_request(req, 0);
 
-	nvme_reset_ctrl(ns->ctrl);
+	switch (status & 0x7ff) {
+	case NVME_SC_ANA_TRANSITION:
+	case NVME_SC_ANA_INACCESSIBLE:
+	case NVME_SC_ANA_PERSISTENT_LOSS:
+		/*
+		 * If we got back an ANA error we know the controller is alive,
+		 * but not ready to serve this namespace.  The spec suggests
+		 * we should update our general state here, but due to the fact
+		 * that the admin and I/O queues are not serialized that is
+		 * fundamentally racy.  So instead just clear the current path,
+		 * mark the path as pending and kick off a re-read of the ANA
+		 * log page ASAP.
+		 */
+		nvme_mpath_clear_current_path(ns);
+		if (ns->ctrl->ana_log_buf) {
+			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+			queue_work(nvme_wq, &ns->ctrl->ana_work);
+		}
+		break;
+	default:
+		/*
+		 * Reset the controller for any non-ANA error as we don't know
+		 * what caused the error.
+		 */
+		nvme_reset_ctrl(ns->ctrl);
+		break;
+	}
+
 	kblockd_schedule_work(&ns->head->requeue_work);
 }
@@ -68,25 +101,51 @@ void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 	up_read(&ctrl->namespaces_rwsem);
 }
 
+static const char *nvme_ana_state_names[] = {
+	[0]				= "invalid state",
+	[NVME_ANA_OPTIMIZED]		= "optimized",
+	[NVME_ANA_NONOPTIMIZED]		= "non-optimized",
+	[NVME_ANA_INACCESSIBLE]		= "inaccessible",
+	[NVME_ANA_PERSISTENT_LOSS]	= "persistent-loss",
+	[NVME_ANA_CHANGE]		= "change",
+};
+
 static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head)
 {
-	struct nvme_ns *ns;
+	struct nvme_ns *ns, *fallback = NULL;
 
 	list_for_each_entry_rcu(ns, &head->list, siblings) {
-		if (ns->ctrl->state == NVME_CTRL_LIVE) {
+		if (ns->ctrl->state != NVME_CTRL_LIVE ||
+		    test_bit(NVME_NS_ANA_PENDING, &ns->flags))
+			continue;
+		switch (ns->ana_state) {
+		case NVME_ANA_OPTIMIZED:
 			rcu_assign_pointer(head->current_path, ns);
 			return ns;
+		case NVME_ANA_NONOPTIMIZED:
+			fallback = ns;
+			break;
+		default:
+			break;
 		}
 	}
-	return NULL;
+
+	if (fallback)
+		rcu_assign_pointer(head->current_path, fallback);
+	return fallback;
+}
+
+static inline bool nvme_path_is_optimized(struct nvme_ns *ns)
+{
+	return ns->ctrl->state == NVME_CTRL_LIVE &&
+		ns->ana_state == NVME_ANA_OPTIMIZED;
 }
 
 inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
 {
 	struct nvme_ns *ns = srcu_dereference(head->current_path, &head->srcu);
 
-	if (unlikely(!ns || ns->ctrl->state != NVME_CTRL_LIVE))
+	if (unlikely(!ns || !nvme_path_is_optimized(ns)))
 		ns = __nvme_find_path(head);
 	return ns;
 }
@@ -135,7 +194,7 @@ static bool nvme_ns_head_poll(struct request_queue *q, blk_qc_t qc)
 	srcu_idx = srcu_read_lock(&head->srcu);
 	ns = srcu_dereference(head->current_path, &head->srcu);
-	if (likely(ns && ns->ctrl->state == NVME_CTRL_LIVE))
+	if (likely(ns && nvme_path_is_optimized(ns)))
 		found = ns->queue->poll_fn(q, qc);
 	srcu_read_unlock(&head->srcu, srcu_idx);
 	return found;
@@ -169,6 +228,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 	struct request_queue *q;
 	bool vwc = false;
 
+	mutex_init(&head->lock);
 	bio_list_init(&head->requeue_list);
 	spin_lock_init(&head->requeue_lock);
 	INIT_WORK(&head->requeue_work, nvme_requeue_work);
@@ -213,29 +273,232 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
 	return -ENOMEM;
 }
 
-void nvme_mpath_add_disk(struct nvme_ns_head *head)
+static void nvme_mpath_set_live(struct nvme_ns *ns)
 {
+	struct nvme_ns_head *head = ns->head;
+
+	lockdep_assert_held(&ns->head->lock);
+
 	if (!head->disk)
 		return;
 
-	mutex_lock(&head->subsys->lock);
 	if (!(head->disk->flags & GENHD_FL_UP)) {
 		device_add_disk(&head->subsys->dev, head->disk);
 		if (sysfs_create_group(&disk_to_dev(head->disk)->kobj,
 				&nvme_ns_id_attr_group))
-			pr_warn("%s: failed to create sysfs group for identification\n",
-				head->disk->disk_name);
+			dev_warn(&head->subsys->dev,
+				 "failed to create id group.\n");
+	}
+
+	kblockd_schedule_work(&ns->head->requeue_work);
+}
+
+static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
+		int (*cb)(struct nvme_ctrl *ctrl, struct nvme_ana_group_desc *,
+			  void *))
+{
+	void *base = ctrl->ana_log_buf;
+	size_t offset = sizeof(struct nvme_ana_rsp_hdr);
+	int error, i;
+
+	lockdep_assert_held(&ctrl->ana_lock);
+
+	for (i = 0; i < le16_to_cpu(ctrl->ana_log_buf->ngrps); i++) {
+		struct nvme_ana_group_desc *desc = base + offset;
+		u32 nr_nsids = le32_to_cpu(desc->nnsids);
+		size_t nsid_buf_size = nr_nsids * sizeof(__le32);
+
+		if (WARN_ON_ONCE(desc->grpid == 0))
+			return -EINVAL;
+		if (WARN_ON_ONCE(le32_to_cpu(desc->grpid) > ctrl->anagrpmax))
+			return -EINVAL;
+		if (WARN_ON_ONCE(desc->state == 0))
+			return -EINVAL;
+		if (WARN_ON_ONCE(desc->state > NVME_ANA_CHANGE))
+			return -EINVAL;
+
+		offset += sizeof(*desc);
+		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - nsid_buf_size))
+			return -EINVAL;
+
+		error = cb(ctrl, desc, data);
+		if (error)
+			return error;
+
+		offset += nsid_buf_size;
+		if (WARN_ON_ONCE(offset > ctrl->ana_log_size - sizeof(*desc)))
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static inline bool nvme_state_is_live(enum nvme_ana_state state)
+{
+	return state == NVME_ANA_OPTIMIZED || state == NVME_ANA_NONOPTIMIZED;
+}
+
+static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
+		struct nvme_ns *ns)
+{
+	enum nvme_ana_state old;
+
+	mutex_lock(&ns->head->lock);
+	old = ns->ana_state;
+	ns->ana_grpid = le32_to_cpu(desc->grpid);
+	ns->ana_state = desc->state;
+	clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
+
+	if (nvme_state_is_live(ns->ana_state) && !nvme_state_is_live(old))
+		nvme_mpath_set_live(ns);
+	mutex_unlock(&ns->head->lock);
+}
+
+static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
+		struct nvme_ana_group_desc *desc, void *data)
+{
+	u32 nr_nsids = le32_to_cpu(desc->nnsids), n = 0;
+	unsigned *nr_change_groups = data;
+	struct nvme_ns *ns;
+
+	dev_info(ctrl->device, "ANA group %d: %s.\n",
+			le32_to_cpu(desc->grpid),
+			nvme_ana_state_names[desc->state]);
+
+	if (desc->state == NVME_ANA_CHANGE)
+		(*nr_change_groups)++;
+
+	if (!nr_nsids)
+		return 0;
+
+	down_write(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list) {
+		if (ns->head->ns_id != le32_to_cpu(desc->nsids[n]))
+			continue;
+		nvme_update_ns_ana_state(desc, ns);
+		if (++n == nr_nsids)
+			break;
+	}
+	up_write(&ctrl->namespaces_rwsem);
+	WARN_ON_ONCE(n < nr_nsids);
+	return 0;
+}
+
+static int nvme_read_ana_log(struct nvme_ctrl *ctrl, bool groups_only)
+{
+	u32 nr_change_groups = 0;
+	int error;
+
+	mutex_lock(&ctrl->ana_lock);
+	error = nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_ANA,
+			groups_only ? NVME_ANA_LOG_RGO : 0,
+			ctrl->ana_log_buf, ctrl->ana_log_size, 0);
+	if (error) {
+		dev_warn(ctrl->device, "Failed to get ANA log: %d\n", error);
+		goto out_unlock;
+	}
+
+	error = nvme_parse_ana_log(ctrl, &nr_change_groups,
+			nvme_update_ana_state);
+	if (error)
+		goto out_unlock;
+
+	/*
+	 * In theory we should have an ANATT timer per group as they might
+	 * enter the change state at different times.  But that is a lot of
+	 * overhead just to protect against a target that keeps entering new
+	 * change states while never finishing previous ones.  We'll still
+	 * eventually time out once all groups are in change state, so this
+	 * isn't a big deal.
+	 *
+	 * We also double the ANATT value to provide some slack for transports
+	 * or AEN processing overhead.
+	 */
+	if (nr_change_groups)
+		mod_timer(&ctrl->anatt_timer, ctrl->anatt * HZ * 2 + jiffies);
+	else
+		del_timer_sync(&ctrl->anatt_timer);
+out_unlock:
+	mutex_unlock(&ctrl->ana_lock);
+	return error;
+}
+
+static void nvme_ana_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
+
+	nvme_read_ana_log(ctrl, false);
+}
+
+static void nvme_anatt_timeout(struct timer_list *t)
+{
+	struct nvme_ctrl *ctrl = from_timer(ctrl, t, anatt_timer);
+
+	dev_info(ctrl->device, "ANATT timeout, resetting controller.\n");
+	nvme_reset_ctrl(ctrl);
+}
+
+void nvme_mpath_stop(struct nvme_ctrl *ctrl)
+{
+	if (!nvme_ctrl_use_ana(ctrl))
+		return;
+	del_timer_sync(&ctrl->anatt_timer);
+	cancel_work_sync(&ctrl->ana_work);
+}
+
+static ssize_t ana_grpid_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	return sprintf(buf, "%d\n", nvme_get_ns_from_dev(dev)->ana_grpid);
+}
+DEVICE_ATTR_RO(ana_grpid);
+
+static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
+		char *buf)
+{
+	struct nvme_ns *ns = nvme_get_ns_from_dev(dev);
+
+	return sprintf(buf, "%s\n", nvme_ana_state_names[ns->ana_state]);
+}
+DEVICE_ATTR_RO(ana_state);
+
+static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl,
+		struct nvme_ana_group_desc *desc, void *data)
+{
+	struct nvme_ns *ns = data;
+
+	if (ns->ana_grpid == le32_to_cpu(desc->grpid)) {
+		nvme_update_ns_ana_state(desc, ns);
+		return -ENXIO; /* just break out of the loop */
+	}
+
+	return 0;
+}
+
+void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
+{
+	if (nvme_ctrl_use_ana(ns->ctrl)) {
+		mutex_lock(&ns->ctrl->ana_lock);
+		ns->ana_grpid = le32_to_cpu(id->anagrpid);
+		nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state);
+		mutex_unlock(&ns->ctrl->ana_lock);
+	} else {
+		mutex_lock(&ns->head->lock);
+		ns->ana_state = NVME_ANA_OPTIMIZED;
+		nvme_mpath_set_live(ns);
+		mutex_unlock(&ns->head->lock);
 	}
-	mutex_unlock(&head->subsys->lock);
 }
 
 void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 {
 	if (!head->disk)
 		return;
-	sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
-			   &nvme_ns_id_attr_group);
-	del_gendisk(head->disk);
+	if (head->disk->flags & GENHD_FL_UP) {
+		sysfs_remove_group(&disk_to_dev(head->disk)->kobj,
+				   &nvme_ns_id_attr_group);
+		del_gendisk(head->disk);
+	}
 	blk_set_queue_dying(head->disk->queue);
 	/* make sure all pending bios are cleaned up */
 	kblockd_schedule_work(&head->requeue_work);
@@ -243,3 +506,52 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 	blk_cleanup_queue(head->disk->queue);
 	put_disk(head->disk);
 }
+
+int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
+{
+	int error;
+
+	if (!nvme_ctrl_use_ana(ctrl))
+		return 0;
+
+	ctrl->anacap = id->anacap;
+	ctrl->anatt = id->anatt;
+	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
+	ctrl->anagrpmax = le32_to_cpu(id->anagrpmax);
+
+	mutex_init(&ctrl->ana_lock);
+	timer_setup(&ctrl->anatt_timer, nvme_anatt_timeout, 0);
+	ctrl->ana_log_size = sizeof(struct nvme_ana_rsp_hdr) +
+		ctrl->nanagrpid * sizeof(struct nvme_ana_group_desc);
+	if (!(ctrl->anacap & (1 << 6)))
+		ctrl->ana_log_size += ctrl->max_namespaces * sizeof(__le32);
+
+	if (ctrl->ana_log_size > ctrl->max_hw_sectors << SECTOR_SHIFT) {
+		dev_err(ctrl->device,
+			"ANA log page size (%zd) larger than MDTS (%d).\n",
+			ctrl->ana_log_size,
+			ctrl->max_hw_sectors << SECTOR_SHIFT);
+		dev_err(ctrl->device, "disabling ANA support.\n");
+		return 0;
+	}
+
+	INIT_WORK(&ctrl->ana_work, nvme_ana_work);
+	ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
+	if (!ctrl->ana_log_buf)
+		goto out;
+
+	error = nvme_read_ana_log(ctrl, true);
+	if (error)
+		goto out_free_ana_log_buf;
+	return 0;
+
+out_free_ana_log_buf:
+	kfree(ctrl->ana_log_buf);
+out:
+	return -ENOMEM;
+}
+
+void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
+{
+	kfree(ctrl->ana_log_buf);
+}
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -183,6 +183,7 @@ struct nvme_ctrl {
 	u16 oacs;
 	u16 nssa;
 	u16 nr_streams;
+	u32 max_namespaces;
 	atomic_t abort_limit;
 	u8 vwc;
 	u32 vs;
@@ -205,6 +206,19 @@ struct nvme_ctrl {
 	struct work_struct fw_act_work;
 	unsigned long events;
 
+#ifdef CONFIG_NVME_MULTIPATH
+	/* asymmetric namespace access: */
+	u8 anacap;
+	u8 anatt;
+	u32 anagrpmax;
+	u32 nanagrpid;
+	struct mutex ana_lock;
+	struct nvme_ana_rsp_hdr *ana_log_buf;
+	size_t ana_log_size;
+	struct timer_list anatt_timer;
+	struct work_struct ana_work;
+#endif
+
 	/* Power saving configuration */
 	u64 ps_max_latency_us;
 	bool apst_enabled;
@@ -269,6 +283,7 @@ struct nvme_ns_head {
 	struct bio_list requeue_list;
 	spinlock_t requeue_lock;
 	struct work_struct requeue_work;
+	struct mutex lock;
 #endif
 	struct list_head list;
 	struct srcu_struct srcu;
@@ -295,6 +310,10 @@ struct nvme_ns {
 	struct nvme_ctrl *ctrl;
 	struct request_queue *queue;
 	struct gendisk *disk;
+#ifdef CONFIG_NVME_MULTIPATH
+	enum nvme_ana_state ana_state;
+	u32 ana_grpid;
+#endif
 	struct list_head siblings;
 	struct nvm_dev *ndev;
 	struct kref kref;
@@ -309,6 +328,7 @@ struct nvme_ns {
 	unsigned long flags;
 #define NVME_NS_REMOVING 0
 #define NVME_NS_DEAD 1
+#define NVME_NS_ANA_PENDING 2
 	u16 noiob;
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
@@ -450,13 +470,17 @@ extern const struct attribute_group nvme_ns_id_attr_group;
 extern const struct block_device_operations nvme_ns_head_ops;
 
 #ifdef CONFIG_NVME_MULTIPATH
+bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl);
 void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 			struct nvme_ctrl *ctrl, int *flags);
 void nvme_failover_req(struct request *req);
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
-void nvme_mpath_add_disk(struct nvme_ns_head *head);
+void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
 void nvme_mpath_remove_disk(struct nvme_ns_head *head);
+int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id);
+void nvme_mpath_uninit(struct nvme_ctrl *ctrl);
+void nvme_mpath_stop(struct nvme_ctrl *ctrl);
 
 static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
 {
@@ -475,7 +499,14 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 		kblockd_schedule_work(&head->requeue_work);
 }
 
+extern struct device_attribute dev_attr_ana_grpid;
+extern struct device_attribute dev_attr_ana_state;
+
 #else
+static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
+{
+	return false;
+}
 /*
  * Without the multipath code enabled, multiple controller per subsystems are
  * visible as devices and thus we cannot use the subsystem instance.
@@ -497,7 +528,8 @@ static inline int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,
 {
 	return 0;
 }
-static inline void nvme_mpath_add_disk(struct nvme_ns_head *head)
+static inline void nvme_mpath_add_disk(struct nvme_ns *ns,
+		struct nvme_id_ns *id)
 {
 }
 static inline void nvme_mpath_remove_disk(struct nvme_ns_head *head)
@@ -509,6 +541,17 @@ static inline void nvme_mpath_clear_current_path(struct nvme_ns *ns)
 static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
 {
 }
+static inline int nvme_mpath_init(struct nvme_ctrl *ctrl,
+		struct nvme_id_ctrl *id)
+{
+	return 0;
+}
+static inline void nvme_mpath_uninit(struct nvme_ctrl *ctrl)
+{
+}
+static inline void nvme_mpath_stop(struct nvme_ctrl *ctrl)
+{
+}
 #endif /* CONFIG_NVME_MULTIPATH */
 
 #ifdef CONFIG_NVM