Commit a9e8e18a authored by Jens Axboe's avatar Jens Axboe

Merge branch 'nvme-5.9' of git://git.infradead.org/nvme into for-5.9/drivers

Pull NVMe updates from Christoph.

* 'nvme-5.9' of git://git.infradead.org/nvme: (30 commits)
  nvme-loop: remove extra variable in create ctrl
  nvme-loop: set ctrl state connecting after init
  nvme-multipath: do not fall back to __nvme_find_path() for non-optimized paths
  nvme-multipath: fix logic for non-optimized paths
  nvme-rdma: fix controller reset hang during traffic
  nvme-tcp: fix controller reset hang during traffic
  nvmet: introduce the passthru Kconfig option
  nvmet: introduce the passthru configfs interface
  nvmet: Add passthru enable/disable helpers
  nvmet: add passthru code to process commands
  nvme: export nvme_find_get_ns() and nvme_put_ns()
  nvme: introduce nvme_ctrl_get_by_path()
  nvme: introduce nvme_execute_passthru_rq to call nvme_passthru_[start|end]()
  nvme: create helper function to obtain command effects
  nvme: clear any SGL flags in passthru commands
  nvmet-fc: remove redundant del_work_active flag
  nvmet-fc: check successful reference in nvmet_fc_find_target_assoc
  nvme-fc: set max_segments to lldd max value
  nvme-fc: drop a duplicated word in a comment
  nvme-hwmon: log the controller device name
  ...
parents c5be1f2c b6cec06d
......@@ -45,6 +45,9 @@ static const guid_t prp_guids[] = {
/* Thunderbolt GUID for WAKE_SUPPORTED: 6c501103-c189-4296-ba72-9bf5a26ebe5d */
GUID_INIT(0x6c501103, 0xc189, 0x4296,
0xba, 0x72, 0x9b, 0xf5, 0xa2, 0x6e, 0xbe, 0x5d),
/* Storage device needs D3 GUID: 5025030f-842f-4ab4-a561-99a5189762d0 */
GUID_INIT(0x5025030f, 0x842f, 0x4ab4,
0xa5, 0x61, 0x99, 0xa5, 0x18, 0x97, 0x62, 0xd0),
};
/* ACPI _DSD data subnodes GUID: dbb8e3e6-5886-4ba6-8795-1319f52a966b */
......
This diff is collapsed.
......@@ -547,7 +547,7 @@ static struct nvmf_transport_ops *nvmf_lookup_transport(
blk_status_t nvmf_fail_nonready_command(struct nvme_ctrl *ctrl,
struct request *rq)
{
if (ctrl->state != NVME_CTRL_DELETING &&
if (ctrl->state != NVME_CTRL_DELETING_NOIO &&
ctrl->state != NVME_CTRL_DEAD &&
!blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
return BLK_STS_RESOURCE;
......
......@@ -182,7 +182,8 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
static inline bool nvmf_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
bool queue_live)
{
if (likely(ctrl->state == NVME_CTRL_LIVE))
if (likely(ctrl->state == NVME_CTRL_LIVE ||
ctrl->state == NVME_CTRL_DELETING))
return true;
return __nvmf_check_ready(ctrl, rq, queue_live);
}
......
......@@ -826,6 +826,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
break;
case NVME_CTRL_DELETING:
case NVME_CTRL_DELETING_NOIO:
default:
/* no action to take - let it delete */
break;
......@@ -3001,8 +3002,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
if (ret)
goto out_disconnect_admin_queue;
ctrl->ctrl.max_hw_sectors =
(ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9);
ctrl->ctrl.max_segments = ctrl->lport->ops->max_sgl_segments;
ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments <<
(ilog2(SZ_4K) - 9);
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
......
......@@ -241,7 +241,8 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl)
err = nvme_hwmon_get_smart_log(data);
if (err) {
dev_warn(dev, "Failed to read smart log (error %d)\n", err);
dev_warn(ctrl->device,
"Failed to read smart log (error %d)\n", err);
devm_kfree(dev, data);
return;
}
......
......@@ -167,9 +167,18 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
static bool nvme_path_is_disabled(struct nvme_ns *ns)
{
return ns->ctrl->state != NVME_CTRL_LIVE ||
test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
test_bit(NVME_NS_REMOVING, &ns->flags);
/*
* We don't treat NVME_CTRL_DELETING as a disabled path as I/O should
* still be able to complete assuming that the controller is connected.
* Otherwise it will fail immediately and return to the requeue list.
*/
if (ns->ctrl->state != NVME_CTRL_LIVE &&
ns->ctrl->state != NVME_CTRL_DELETING)
return true;
if (test_bit(NVME_NS_ANA_PENDING, &ns->flags) ||
test_bit(NVME_NS_REMOVING, &ns->flags))
return true;
return false;
}
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
......@@ -246,6 +255,12 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
fallback = ns;
}
/* No optimized path found, re-check the current path */
if (!nvme_path_is_disabled(old) &&
old->ana_state == NVME_ANA_OPTIMIZED) {
found = old;
goto out;
}
if (!fallback)
return NULL;
found = fallback;
......@@ -266,10 +281,13 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
struct nvme_ns *ns;
ns = srcu_dereference(head->current_path[node], &head->srcu);
if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
ns = nvme_round_robin_path(head, node, ns);
if (unlikely(!ns || !nvme_path_is_optimized(ns)))
ns = __nvme_find_path(head, node);
if (unlikely(!ns))
return __nvme_find_path(head, node);
if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
return nvme_round_robin_path(head, node, ns);
if (unlikely(!nvme_path_is_optimized(ns)))
return __nvme_find_path(head, node);
return ns;
}
......@@ -563,6 +581,9 @@ static void nvme_ana_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl, ana_work);
if (ctrl->state != NVME_CTRL_LIVE)
return;
nvme_read_ana_log(ctrl);
}
......
......@@ -37,6 +37,14 @@ extern unsigned int admin_timeout;
#define NVME_INLINE_METADATA_SG_CNT 1
#endif
/*
* Default to a 4K page size, with the intention to update this
* path in the future to accommodate architectures with differing
* kernel and IO page sizes.
*/
#define NVME_CTRL_PAGE_SHIFT 12
#define NVME_CTRL_PAGE_SIZE (1 << NVME_CTRL_PAGE_SHIFT)
extern struct workqueue_struct *nvme_wq;
extern struct workqueue_struct *nvme_reset_wq;
extern struct workqueue_struct *nvme_delete_wq;
......@@ -173,12 +181,32 @@ static inline u16 nvme_req_qid(struct request *req)
*/
#define NVME_QUIRK_DELAY_AMOUNT 2300
/*
* enum nvme_ctrl_state: Controller state
*
* @NVME_CTRL_NEW: New controller just allocated, initial state
* @NVME_CTRL_LIVE: Controller is connected and I/O capable
* @NVME_CTRL_RESETTING: Controller is resetting (or scheduled reset)
* @NVME_CTRL_CONNECTING: Controller is disconnected, now connecting the
* transport
* @NVME_CTRL_DELETING: Controller is deleting (or scheduled deletion)
* @NVME_CTRL_DELETING_NOIO: Controller is deleting and I/O is not
* disabled/failed immediately. This state comes
* after all async event processing took place and
* before ns removal and the controller deletion
* progress
* @NVME_CTRL_DEAD: Controller is non-present/unresponsive during
* shutdown or removal. In this case we forcibly
* kill all inflight I/O as they have no chance to
* complete
*/
enum nvme_ctrl_state {
NVME_CTRL_NEW,
NVME_CTRL_LIVE,
NVME_CTRL_RESETTING,
NVME_CTRL_CONNECTING,
NVME_CTRL_DELETING,
NVME_CTRL_DELETING_NOIO,
NVME_CTRL_DEAD,
};
......@@ -234,7 +262,6 @@ struct nvme_ctrl {
u32 queue_count;
u64 cap;
u32 page_size;
u32 max_hw_sectors;
u32 max_segments;
u32 max_integrity_segments;
......@@ -763,4 +790,11 @@ void nvme_hwmon_init(struct nvme_ctrl *ctrl);
static inline void nvme_hwmon_init(struct nvme_ctrl *ctrl) { }
#endif
u32 nvme_command_effects(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
u8 opcode);
void nvme_execute_passthru_rq(struct request *rq);
struct nvme_ctrl *nvme_ctrl_get_by_path(const char *path);
struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid);
void nvme_put_ns(struct nvme_ns *ns);
#endif /* _NVME_H */
......@@ -4,6 +4,7 @@
* Copyright (c) 2011-2014, Intel Corporation.
*/
#include <linux/acpi.h>
#include <linux/aer.h>
#include <linux/async.h>
#include <linux/blkdev.h>
......@@ -94,6 +95,10 @@ static unsigned int poll_queues;
module_param_cb(poll_queues, &io_queue_count_ops, &poll_queues, 0644);
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
static bool noacpi;
module_param(noacpi, bool, 0444);
MODULE_PARM_DESC(noacpi, "disable acpi bios quirks");
struct nvme_dev;
struct nvme_queue;
......@@ -346,10 +351,10 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
* as it only leads to a small amount of wasted memory for the lifetime of
* the I/O.
*/
static int nvme_npages(unsigned size, struct nvme_dev *dev)
static int nvme_pci_npages_prp(void)
{
unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size,
dev->ctrl.page_size);
unsigned nprps = DIV_ROUND_UP(NVME_MAX_KB_SZ + NVME_CTRL_PAGE_SIZE,
NVME_CTRL_PAGE_SIZE);
return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8);
}
......@@ -357,22 +362,18 @@ static int nvme_npages(unsigned size, struct nvme_dev *dev)
* Calculates the number of pages needed for the SGL segments. For example a 4k
* page can accommodate 256 SGL descriptors.
*/
static int nvme_pci_npages_sgl(unsigned int num_seg)
static int nvme_pci_npages_sgl(void)
{
return DIV_ROUND_UP(num_seg * sizeof(struct nvme_sgl_desc), PAGE_SIZE);
return DIV_ROUND_UP(NVME_MAX_SEGS * sizeof(struct nvme_sgl_desc),
PAGE_SIZE);
}
static size_t nvme_pci_iod_alloc_size(struct nvme_dev *dev,
unsigned int size, unsigned int nseg, bool use_sgl)
static size_t nvme_pci_iod_alloc_size(void)
{
size_t alloc_size;
if (use_sgl)
alloc_size = sizeof(__le64 *) * nvme_pci_npages_sgl(nseg);
else
alloc_size = sizeof(__le64 *) * nvme_npages(size, dev);
size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
return alloc_size + sizeof(struct scatterlist) * nseg;
return sizeof(__le64 *) * npages +
sizeof(struct scatterlist) * NVME_MAX_SEGS;
}
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
......@@ -515,7 +516,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req)
static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1;
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
dma_addr_t dma_addr = iod->first_dma, next_dma_addr;
int i;
......@@ -582,34 +583,33 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
struct scatterlist *sg = iod->sg;
int dma_len = sg_dma_len(sg);
u64 dma_addr = sg_dma_address(sg);
u32 page_size = dev->ctrl.page_size;
int offset = dma_addr & (page_size - 1);
int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
__le64 *prp_list;
void **list = nvme_pci_iod_list(req);
dma_addr_t prp_dma;
int nprps, i;
length -= (page_size - offset);
length -= (NVME_CTRL_PAGE_SIZE - offset);
if (length <= 0) {
iod->first_dma = 0;
goto done;
}
dma_len -= (page_size - offset);
dma_len -= (NVME_CTRL_PAGE_SIZE - offset);
if (dma_len) {
dma_addr += (page_size - offset);
dma_addr += (NVME_CTRL_PAGE_SIZE - offset);
} else {
sg = sg_next(sg);
dma_addr = sg_dma_address(sg);
dma_len = sg_dma_len(sg);
}
if (length <= page_size) {
if (length <= NVME_CTRL_PAGE_SIZE) {
iod->first_dma = dma_addr;
goto done;
}
nprps = DIV_ROUND_UP(length, page_size);
nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
if (nprps <= (256 / 8)) {
pool = dev->prp_small_pool;
iod->npages = 0;
......@@ -628,7 +628,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
iod->first_dma = prp_dma;
i = 0;
for (;;) {
if (i == page_size >> 3) {
if (i == NVME_CTRL_PAGE_SIZE >> 3) {
__le64 *old_prp_list = prp_list;
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
if (!prp_list)
......@@ -639,9 +639,9 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
i = 1;
}
prp_list[i++] = cpu_to_le64(dma_addr);
dma_len -= page_size;
dma_addr += page_size;
length -= page_size;
dma_len -= NVME_CTRL_PAGE_SIZE;
dma_addr += NVME_CTRL_PAGE_SIZE;
length -= NVME_CTRL_PAGE_SIZE;
if (length <= 0)
break;
if (dma_len > 0)
......@@ -751,8 +751,8 @@ static blk_status_t nvme_setup_prp_simple(struct nvme_dev *dev,
struct bio_vec *bv)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
unsigned int offset = bv->bv_offset & (dev->ctrl.page_size - 1);
unsigned int first_prp_len = dev->ctrl.page_size - offset;
unsigned int offset = bv->bv_offset & (NVME_CTRL_PAGE_SIZE - 1);
unsigned int first_prp_len = NVME_CTRL_PAGE_SIZE - offset;
iod->first_dma = dma_map_bvec(dev->dev, bv, rq_dma_dir(req), 0);
if (dma_mapping_error(dev->dev, iod->first_dma))
......@@ -794,7 +794,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct bio_vec bv = req_bvec(req);
if (!is_pci_p2pdma_page(bv.bv_page)) {
if (bv.bv_offset + bv.bv_len <= dev->ctrl.page_size * 2)
if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
return nvme_setup_prp_simple(dev, req,
&cmnd->rw, &bv);
......@@ -1396,12 +1396,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
{
int q_depth = dev->q_depth;
unsigned q_size_aligned = roundup(q_depth * entry_size,
dev->ctrl.page_size);
NVME_CTRL_PAGE_SIZE);
if (q_size_aligned * nr_io_queues > dev->cmb_size) {
u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues);
mem_per_q = round_down(mem_per_q, dev->ctrl.page_size);
mem_per_q = round_down(mem_per_q, NVME_CTRL_PAGE_SIZE);
q_depth = div_u64(mem_per_q, entry_size);
/*
......@@ -1816,6 +1816,7 @@ static inline void nvme_release_cmb(struct nvme_dev *dev)
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
{
u32 host_mem_size = dev->host_mem_size >> NVME_CTRL_PAGE_SHIFT;
u64 dma_addr = dev->host_mem_descs_dma;
struct nvme_command c;
int ret;
......@@ -1824,8 +1825,7 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
c.features.opcode = nvme_admin_set_features;
c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
c.features.dword11 = cpu_to_le32(bits);
c.features.dword12 = cpu_to_le32(dev->host_mem_size >>
ilog2(dev->ctrl.page_size));
c.features.dword12 = cpu_to_le32(host_mem_size);
c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr));
c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr));
c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs);
......@@ -1845,7 +1845,7 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
for (i = 0; i < dev->nr_host_mem_descs; i++) {
struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size;
size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE;
dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
le64_to_cpu(desc->addr),
......@@ -1897,7 +1897,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
break;
descs[i].addr = cpu_to_le64(dma_addr);
descs[i].size = cpu_to_le32(len / dev->ctrl.page_size);
descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE);
i++;
}
......@@ -1913,7 +1913,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
out_free_bufs:
while (--i >= 0) {
size_t size = le32_to_cpu(descs[i].size) * dev->ctrl.page_size;
size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE;
dma_free_attrs(dev->dev, size, bufs[i],
le64_to_cpu(descs[i].addr),
......@@ -2759,6 +2759,54 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
return 0;
}
#ifdef CONFIG_ACPI
static bool nvme_acpi_storage_d3(struct pci_dev *dev)
{
struct acpi_device *adev;
struct pci_dev *root;
acpi_handle handle;
acpi_status status;
u8 val;
/*
* Look for _DSD property specifying that the storage device on the port
* must use D3 to support deep platform power savings during
* suspend-to-idle.
*/
root = pcie_find_root_port(dev);
if (!root)
return false;
adev = ACPI_COMPANION(&root->dev);
if (!adev)
return false;
/*
* The property is defined in the PXSX device for South complex ports
* and in the PEGP device for North complex ports.
*/
status = acpi_get_handle(adev->handle, "PXSX", &handle);
if (ACPI_FAILURE(status)) {
status = acpi_get_handle(adev->handle, "PEGP", &handle);
if (ACPI_FAILURE(status))
return false;
}
if (acpi_bus_get_device(handle, &adev))
return false;
if (fwnode_property_read_u8(acpi_fwnode_handle(adev), "StorageD3Enable",
&val))
return false;
return val == 1;
}
#else
static inline bool nvme_acpi_storage_d3(struct pci_dev *dev)
{
return false;
}
#endif /* CONFIG_ACPI */
static void nvme_async_probe(void *data, async_cookie_t cookie)
{
struct nvme_dev *dev = data;
......@@ -2808,12 +2856,21 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
quirks |= check_vendor_combination_bug(pdev);
if (!noacpi && nvme_acpi_storage_d3(pdev)) {
/*
* Some systems use a bios work around to ask for D3 on
* platforms that support kernel managed suspend.
*/
dev_info(&pdev->dev,
"platform quirk: setting simple suspend\n");
quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
}
/*
* Double check that our mempool alloc size will cover the biggest
* command we support.
*/
alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
NVME_MAX_SEGS, true);
alloc_size = nvme_pci_iod_alloc_size();
WARN_ON_ONCE(alloc_size > PAGE_SIZE);
dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
......
......@@ -96,6 +96,7 @@ struct nvme_rdma_queue {
int cm_error;
struct completion cm_done;
bool pi_support;
int cq_size;
};
struct nvme_rdma_ctrl {
......@@ -275,6 +276,7 @@ static int nvme_rdma_create_qp(struct nvme_rdma_queue *queue, const int factor)
init_attr.recv_cq = queue->ib_cq;
if (queue->pi_support)
init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
init_attr.qp_context = queue;
ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
......@@ -409,6 +411,14 @@ nvme_rdma_find_get_device(struct rdma_cm_id *cm_id)
return NULL;
}
static void nvme_rdma_free_cq(struct nvme_rdma_queue *queue)
{
if (nvme_rdma_poll_queue(queue))
ib_free_cq(queue->ib_cq);
else
ib_cq_pool_put(queue->ib_cq, queue->cq_size);
}
static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
{
struct nvme_rdma_device *dev;
......@@ -430,7 +440,7 @@ static void nvme_rdma_destroy_queue_ib(struct nvme_rdma_queue *queue)
* the destruction of the QP shouldn't use rdma_cm API.
*/
ib_destroy_qp(queue->qp);
ib_free_cq(queue->ib_cq);
nvme_rdma_free_cq(queue);
nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
sizeof(struct nvme_completion), DMA_FROM_DEVICE);
......@@ -450,13 +460,42 @@ static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool pi_support)
return min_t(u32, NVME_RDMA_MAX_SEGMENTS, max_page_list_len - 1);
}
static int nvme_rdma_create_cq(struct ib_device *ibdev,
struct nvme_rdma_queue *queue)
{
int ret, comp_vector, idx = nvme_rdma_queue_idx(queue);
enum ib_poll_context poll_ctx;
/*
* Spread I/O queues completion vectors according their queue index.
* Admin queues can always go on completion vector 0.
*/
comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
/* Polling queues need direct cq polling context */
if (nvme_rdma_poll_queue(queue)) {
poll_ctx = IB_POLL_DIRECT;
queue->ib_cq = ib_alloc_cq(ibdev, queue, queue->cq_size,
comp_vector, poll_ctx);
} else {
poll_ctx = IB_POLL_SOFTIRQ;
queue->ib_cq = ib_cq_pool_get(ibdev, queue->cq_size,
comp_vector, poll_ctx);
}
if (IS_ERR(queue->ib_cq)) {
ret = PTR_ERR(queue->ib_cq);
return ret;
}
return 0;
}
static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
{
struct ib_device *ibdev;
const int send_wr_factor = 3; /* MR, SEND, INV */
const int cq_factor = send_wr_factor + 1; /* + RECV */
int comp_vector, idx = nvme_rdma_queue_idx(queue);
enum ib_poll_context poll_ctx;
int ret, pages_per_mr;
queue->device = nvme_rdma_find_get_device(queue->cm_id);
......@@ -467,26 +506,12 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
}
ibdev = queue->device->dev;
/*
* Spread I/O queues completion vectors according their queue index.
* Admin queues can always go on completion vector 0.
*/
comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
/* Polling queues need direct cq polling context */
if (nvme_rdma_poll_queue(queue))
poll_ctx = IB_POLL_DIRECT;
else
poll_ctx = IB_POLL_SOFTIRQ;
/* +1 for ib_stop_cq */
queue->ib_cq = ib_alloc_cq(ibdev, queue,
cq_factor * queue->queue_size + 1,
comp_vector, poll_ctx);
if (IS_ERR(queue->ib_cq)) {
ret = PTR_ERR(queue->ib_cq);
queue->cq_size = cq_factor * queue->queue_size + 1;
ret = nvme_rdma_create_cq(ibdev, queue);
if (ret)
goto out_put_dev;
}
ret = nvme_rdma_create_qp(queue, send_wr_factor);
if (ret)
......@@ -512,7 +537,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
if (ret) {
dev_err(queue->ctrl->ctrl.device,
"failed to initialize MR pool sized %d for QID %d\n",
queue->queue_size, idx);
queue->queue_size, nvme_rdma_queue_idx(queue));
goto out_destroy_ring;
}
......@@ -523,7 +548,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
if (ret) {
dev_err(queue->ctrl->ctrl.device,
"failed to initialize PI MR pool sized %d for QID %d\n",
queue->queue_size, idx);
queue->queue_size, nvme_rdma_queue_idx(queue));
goto out_destroy_mr_pool;
}
}
......@@ -540,7 +565,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
out_destroy_qp:
rdma_destroy_qp(queue->cm_id);
out_destroy_ib_cq:
ib_free_cq(queue->ib_cq);
nvme_rdma_free_cq(queue);
out_put_dev:
nvme_rdma_dev_put(queue->device);
return ret;
......@@ -942,15 +967,20 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
ret = PTR_ERR(ctrl->ctrl.connect_q);
goto out_free_tag_set;
}
} else {
blk_mq_update_nr_hw_queues(&ctrl->tag_set,
ctrl->ctrl.queue_count - 1);
}
ret = nvme_rdma_start_io_queues(ctrl);
if (ret)
goto out_cleanup_connect_q;
if (!new) {
nvme_start_queues(&ctrl->ctrl);
nvme_wait_freeze(&ctrl->ctrl);
blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
ctrl->ctrl.queue_count - 1);
nvme_unfreeze(&ctrl->ctrl);
}
return 0;
out_cleanup_connect_q:
......@@ -983,6 +1013,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
bool remove)
{
if (ctrl->ctrl.queue_count > 1) {
nvme_start_freeze(&ctrl->ctrl);
nvme_stop_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
if (ctrl->ctrl.tagset) {
......@@ -1077,11 +1108,12 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
if (!changed) {
/*
* state change failure is ok if we're in DELETING state,
* state change failure is ok if we started ctrl delete,
* unless we're during creation of a new controller to
* avoid races with teardown flow.
*/
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
WARN_ON_ONCE(new);
ret = -EINVAL;
goto destroy_io;
......@@ -1134,8 +1166,9 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING);
/* state change failure is ok if we started ctrl delete */
WARN_ON_ONCE(ctrl->ctrl.state != NVME_CTRL_DELETING &&
ctrl->ctrl.state != NVME_CTRL_DELETING_NOIO);
return;
}
......@@ -1163,7 +1196,7 @@ static void nvme_rdma_end_request(struct nvme_rdma_request *req)
static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
const char *op)
{
struct nvme_rdma_queue *queue = cq->cq_context;
struct nvme_rdma_queue *queue = wc->qp->qp_context;
struct nvme_rdma_ctrl *ctrl = queue->ctrl;
if (ctrl->ctrl.state == NVME_CTRL_LIVE)
......@@ -1706,7 +1739,7 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct nvme_rdma_qe *qe =
container_of(wc->wr_cqe, struct nvme_rdma_qe, cqe);
struct nvme_rdma_queue *queue = cq->cq_context;
struct nvme_rdma_queue *queue = wc->qp->qp_context;
struct ib_device *ibdev = queue->device->dev;
struct nvme_completion *cqe = qe->data;
const size_t len = sizeof(struct nvme_completion);
......
......@@ -1771,15 +1771,20 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
}
} else {
blk_mq_update_nr_hw_queues(ctrl->tagset,
ctrl->queue_count - 1);
}
ret = nvme_tcp_start_io_queues(ctrl);
if (ret)
goto out_cleanup_connect_q;
if (!new) {
nvme_start_queues(ctrl);
nvme_wait_freeze(ctrl);
blk_mq_update_nr_hw_queues(ctrl->tagset,
ctrl->queue_count - 1);
nvme_unfreeze(ctrl);
}
return 0;
out_cleanup_connect_q:
......@@ -1884,6 +1889,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
{
if (ctrl->queue_count <= 1)
return;
nvme_start_freeze(ctrl);
nvme_stop_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
if (ctrl->tagset) {
......@@ -1950,11 +1956,12 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
/*
* state change failure is ok if we're in DELETING state,
* state change failure is ok if we started ctrl delete,
* unless we're during creation of a new controller to
* avoid races with teardown flow.
*/
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
ctrl->state != NVME_CTRL_DELETING_NOIO);
WARN_ON_ONCE(new);
ret = -EINVAL;
goto destroy_io;
......@@ -2010,8 +2017,9 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
blk_mq_unquiesce_queue(ctrl->admin_q);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
/* state change failure is ok if we started ctrl delete */
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
ctrl->state != NVME_CTRL_DELETING_NOIO);
return;
}
......@@ -2046,8 +2054,9 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
nvme_tcp_teardown_ctrl(ctrl, false);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we're in DELETING state */
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING);
/* state change failure is ok if we started ctrl delete */
WARN_ON_ONCE(ctrl->state != NVME_CTRL_DELETING &&
ctrl->state != NVME_CTRL_DELETING_NOIO);
return;
}
......
......@@ -16,6 +16,18 @@ config NVME_TARGET
To configure the NVMe target you probably want to use the nvmetcli
tool from http://git.infradead.org/users/hch/nvmetcli.git.
config NVME_TARGET_PASSTHRU
bool "NVMe Target Passthrough support"
depends on NVME_TARGET
depends on NVME_CORE=y || NVME_CORE=NVME_TARGET
help
This enables target side NVMe passthru controller support for the
NVMe Over Fabrics protocol. It allows for hosts to manage and
directly access an actual NVMe controller residing on the target
side, incuding executing Vendor Unique Commands.
If unsure, say N.
config NVME_TARGET_LOOP
tristate "NVMe loopback device support"
depends on NVME_TARGET
......
......@@ -11,6 +11,7 @@ obj-$(CONFIG_NVME_TARGET_TCP) += nvmet-tcp.o
nvmet-y += core.o configfs.o admin-cmd.o fabrics-cmd.o \
discovery.o io-cmd-file.o io-cmd-bdev.o
nvmet-$(CONFIG_NVME_TARGET_PASSTHRU) += passthru.o
nvme-loop-y += loop.o
nvmet-rdma-y += rdma.o
nvmet-fc-y += fc.o
......
......@@ -113,11 +113,10 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
u64 data_units_read = 0, data_units_written = 0;
struct nvmet_ns *ns;
struct nvmet_ctrl *ctrl;
unsigned long idx;
ctrl = req->sq->ctrl;
rcu_read_lock();
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
/* we don't have the right data for file backed ns */
if (!ns->bdev)
continue;
......@@ -127,9 +126,7 @@ static u16 nvmet_get_smart_log_all(struct nvmet_req *req,
host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]);
data_units_written += DIV_ROUND_UP(
part_stat_read(ns->bdev->bd_part, sectors[WRITE]), 1000);
}
rcu_read_unlock();
put_unaligned_le64(host_reads, &slog->host_reads[0]);
put_unaligned_le64(data_units_read, &slog->data_units_read[0]);
......@@ -230,14 +227,13 @@ static u32 nvmet_format_ana_group(struct nvmet_req *req, u32 grpid,
{
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvmet_ns *ns;
unsigned long idx;
u32 count = 0;
if (!(req->cmd->get_log_page.lsp & NVME_ANA_LOG_RGO)) {
rcu_read_lock();
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link)
xa_for_each(&ctrl->subsys->namespaces, idx, ns)
if (ns->anagrpid == grpid)
desc->nsids[count++] = cpu_to_le32(ns->nsid);
rcu_read_unlock();
}
desc->grpid = cpu_to_le32(grpid);
......@@ -556,6 +552,7 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req)
static const int buf_size = NVME_IDENTIFY_DATA_SIZE;
struct nvmet_ctrl *ctrl = req->sq->ctrl;
struct nvmet_ns *ns;
unsigned long idx;
u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid);
__le32 *list;
u16 status = 0;
......@@ -567,15 +564,13 @@ static void nvmet_execute_identify_nslist(struct nvmet_req *req)
goto out;
}
rcu_read_lock();
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
xa_for_each(&ctrl->subsys->namespaces, idx, ns) {
if (ns->nsid <= min_nsid)
continue;
list[i++] = cpu_to_le32(ns->nsid);
if (i == buf_size / sizeof(__le32))
break;
}
rcu_read_unlock();
status = nvmet_copy_to_sgl(req, 0, list, buf_size);
......@@ -754,7 +749,7 @@ u16 nvmet_set_feat_async_event(struct nvmet_req *req, u32 mask)
return 0;
}
static void nvmet_execute_set_features(struct nvmet_req *req)
void nvmet_execute_set_features(struct nvmet_req *req)
{
struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
......@@ -829,7 +824,7 @@ void nvmet_get_feat_async_event(struct nvmet_req *req)
nvmet_set_result(req, READ_ONCE(req->sq->ctrl->aen_enabled));
}
static void nvmet_execute_get_features(struct nvmet_req *req)
void nvmet_execute_get_features(struct nvmet_req *req)
{
struct nvmet_subsys *subsys = req->sq->ctrl->subsys;
u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10);
......@@ -945,6 +940,9 @@ u16 nvmet_parse_admin_cmd(struct nvmet_req *req)
if (unlikely(ret))
return ret;
if (nvmet_req_passthru_ctrl(req))
return nvmet_parse_passthru_admin_cmd(req);
switch (cmd->common.opcode) {
case nvme_admin_get_log_page:
req->execute = nvmet_execute_get_log_page;
......
......@@ -666,6 +666,103 @@ static const struct config_item_type nvmet_namespaces_type = {
.ct_owner = THIS_MODULE,
};
#ifdef CONFIG_NVME_TARGET_PASSTHRU
static ssize_t nvmet_passthru_device_path_show(struct config_item *item,
char *page)
{
struct nvmet_subsys *subsys = to_subsys(item->ci_parent);
return snprintf(page, PAGE_SIZE, "%s\n", subsys->passthru_ctrl_path);
}
static ssize_t nvmet_passthru_device_path_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_subsys *subsys = to_subsys(item->ci_parent);
size_t len;
int ret;
mutex_lock(&subsys->lock);
ret = -EBUSY;
if (subsys->passthru_ctrl)
goto out_unlock;
ret = -EINVAL;
len = strcspn(page, "\n");
if (!len)
goto out_unlock;
kfree(subsys->passthru_ctrl_path);
ret = -ENOMEM;
subsys->passthru_ctrl_path = kstrndup(page, len, GFP_KERNEL);
if (!subsys->passthru_ctrl_path)
goto out_unlock;
mutex_unlock(&subsys->lock);
return count;
out_unlock:
mutex_unlock(&subsys->lock);
return ret;
}
CONFIGFS_ATTR(nvmet_passthru_, device_path);
static ssize_t nvmet_passthru_enable_show(struct config_item *item,
char *page)
{
struct nvmet_subsys *subsys = to_subsys(item->ci_parent);
return sprintf(page, "%d\n", subsys->passthru_ctrl ? 1 : 0);
}
static ssize_t nvmet_passthru_enable_store(struct config_item *item,
const char *page, size_t count)
{
struct nvmet_subsys *subsys = to_subsys(item->ci_parent);
bool enable;
int ret = 0;
if (strtobool(page, &enable))
return -EINVAL;
if (enable)
ret = nvmet_passthru_ctrl_enable(subsys);
else
nvmet_passthru_ctrl_disable(subsys);
return ret ? ret : count;
}
CONFIGFS_ATTR(nvmet_passthru_, enable);
static struct configfs_attribute *nvmet_passthru_attrs[] = {
&nvmet_passthru_attr_device_path,
&nvmet_passthru_attr_enable,
NULL,
};
static const struct config_item_type nvmet_passthru_type = {
.ct_attrs = nvmet_passthru_attrs,
.ct_owner = THIS_MODULE,
};
static void nvmet_add_passthru_group(struct nvmet_subsys *subsys)
{
config_group_init_type_name(&subsys->passthru_group,
"passthru", &nvmet_passthru_type);
configfs_add_default_group(&subsys->passthru_group,
&subsys->group);
}
#else /* CONFIG_NVME_TARGET_PASSTHRU */
static void nvmet_add_passthru_group(struct nvmet_subsys *subsys)
{
}
#endif /* CONFIG_NVME_TARGET_PASSTHRU */
static int nvmet_port_subsys_allow_link(struct config_item *parent,
struct config_item *target)
{
......@@ -879,6 +976,10 @@ static ssize_t nvmet_subsys_attr_version_store(struct config_item *item,
int major, minor, tertiary = 0;
int ret;
/* passthru subsystems use the underlying controller's version */
if (nvmet_passthru_ctrl(subsys))
return -EINVAL;
ret = sscanf(page, "%d.%d.%d\n", &major, &minor, &tertiary);
if (ret != 2 && ret != 3)
return -EINVAL;
......@@ -1121,6 +1222,8 @@ static struct config_group *nvmet_subsys_make(struct config_group *group,
configfs_add_default_group(&subsys->allowed_hosts_group,
&subsys->group);
nvmet_add_passthru_group(subsys);
return &subsys->group;
}
......
......@@ -115,13 +115,14 @@ u16 nvmet_zero_sgl(struct nvmet_req *req, off_t off, size_t len)
static unsigned int nvmet_max_nsid(struct nvmet_subsys *subsys)
{
struct nvmet_ns *ns;
unsigned long nsid = 0;
struct nvmet_ns *cur;
unsigned long idx;
if (list_empty(&subsys->namespaces))
return 0;
xa_for_each(&subsys->namespaces, idx, cur)
nsid = cur->nsid;
ns = list_last_entry(&subsys->namespaces, struct nvmet_ns, dev_link);
return ns->nsid;
return nsid;
}
static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
......@@ -410,28 +411,13 @@ static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl)
cancel_delayed_work_sync(&ctrl->ka_work);
}
static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl,
__le32 nsid)
{
struct nvmet_ns *ns;
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) {
if (ns->nsid == le32_to_cpu(nsid))
return ns;
}
return NULL;
}
struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid)
{
struct nvmet_ns *ns;
rcu_read_lock();
ns = __nvmet_find_namespace(ctrl, nsid);
ns = xa_load(&ctrl->subsys->namespaces, le32_to_cpu(nsid));
if (ns)
percpu_ref_get(&ns->ref);
rcu_read_unlock();
return ns;
}
......@@ -558,6 +544,12 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
mutex_lock(&subsys->lock);
ret = 0;
if (nvmet_passthru_ctrl(subsys)) {
pr_info("cannot enable both passthru and regular namespaces for a single subsystem");
goto out_unlock;
}
if (ns->enabled)
goto out_unlock;
......@@ -586,24 +578,10 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
if (ns->nsid > subsys->max_nsid)
subsys->max_nsid = ns->nsid;
/*
* The namespaces list needs to be sorted to simplify the implementation
* of the Identify Namepace List subcommand.
*/
if (list_empty(&subsys->namespaces)) {
list_add_tail_rcu(&ns->dev_link, &subsys->namespaces);
} else {
struct nvmet_ns *old;
list_for_each_entry_rcu(old, &subsys->namespaces, dev_link,
lockdep_is_held(&subsys->lock)) {
BUG_ON(ns->nsid == old->nsid);
if (ns->nsid < old->nsid)
break;
}
ret = xa_insert(&subsys->namespaces, ns->nsid, ns, GFP_KERNEL);
if (ret)
goto out_restore_subsys_maxnsid;
list_add_tail_rcu(&ns->dev_link, &old->dev_link);
}
subsys->nr_namespaces++;
nvmet_ns_changed(subsys, ns->nsid);
......@@ -612,6 +590,10 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
out_unlock:
mutex_unlock(&subsys->lock);
return ret;
out_restore_subsys_maxnsid:
subsys->max_nsid = nvmet_max_nsid(subsys);
percpu_ref_exit(&ns->ref);
out_dev_put:
list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
pci_dev_put(radix_tree_delete(&ctrl->p2p_ns_map, ns->nsid));
......@@ -630,7 +612,7 @@ void nvmet_ns_disable(struct nvmet_ns *ns)
goto out_unlock;
ns->enabled = false;
list_del_rcu(&ns->dev_link);
xa_erase(&ns->subsys->namespaces, ns->nsid);
if (ns->nsid == subsys->max_nsid)
subsys->max_nsid = nvmet_max_nsid(subsys);
......@@ -681,7 +663,6 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
if (!ns)
return NULL;
INIT_LIST_HEAD(&ns->dev_link);
init_completion(&ns->disable_done);
ns->nsid = nsid;
......@@ -874,6 +855,9 @@ static u16 nvmet_parse_io_cmd(struct nvmet_req *req)
if (unlikely(ret))
return ret;
if (nvmet_req_passthru_ctrl(req))
return nvmet_parse_passthru_io_cmd(req);
req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid);
if (unlikely(!req->ns)) {
req->error_loc = offsetof(struct nvme_common_command, nsid);
......@@ -1263,14 +1247,14 @@ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
struct nvmet_req *req)
{
struct nvmet_ns *ns;
unsigned long idx;
if (!req->p2p_client)
return;
ctrl->p2p_client = get_device(req->p2p_client);
list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link,
lockdep_is_held(&ctrl->subsys->lock))
xa_for_each(&ctrl->subsys->namespaces, idx, ns)
nvmet_p2pmem_ns_add_p2p(ctrl, ns);
}
......@@ -1495,7 +1479,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
if (!subsys)
return ERR_PTR(-ENOMEM);
subsys->ver = NVME_VS(1, 3, 0); /* NVMe 1.3.0 */
subsys->ver = NVMET_DEFAULT_VS;
/* generate a random serial number as our controllers are ephemeral: */
get_random_bytes(&subsys->serial, sizeof(subsys->serial));
......@@ -1523,7 +1507,7 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
kref_init(&subsys->ref);
mutex_init(&subsys->lock);
INIT_LIST_HEAD(&subsys->namespaces);
xa_init(&subsys->namespaces);
INIT_LIST_HEAD(&subsys->ctrls);
INIT_LIST_HEAD(&subsys->hosts);
......@@ -1535,7 +1519,10 @@ static void nvmet_subsys_free(struct kref *ref)
struct nvmet_subsys *subsys =
container_of(ref, struct nvmet_subsys, ref);
WARN_ON_ONCE(!list_empty(&subsys->namespaces));
WARN_ON_ONCE(!xa_empty(&subsys->namespaces));
xa_destroy(&subsys->namespaces);
nvmet_passthru_subsys_free(subsys);
kfree(subsys->subsysnqn);
kfree_rcu(subsys->model, rcuhead);
......
......@@ -167,7 +167,6 @@ struct nvmet_fc_tgt_assoc {
struct nvmet_fc_tgt_queue *queues[NVMET_NR_QUEUES + 1];
struct kref ref;
struct work_struct del_work;
atomic_t del_work_active;
};
......@@ -1090,7 +1089,6 @@ nvmet_fc_delete_assoc(struct work_struct *work)
container_of(work, struct nvmet_fc_tgt_assoc, del_work);
nvmet_fc_delete_target_assoc(assoc);
atomic_set(&assoc->del_work_active, 0);
nvmet_fc_tgt_a_put(assoc);
}
......@@ -1123,7 +1121,6 @@ nvmet_fc_alloc_target_assoc(struct nvmet_fc_tgtport *tgtport, void *hosthandle)
INIT_LIST_HEAD(&assoc->a_list);
kref_init(&assoc->ref);
INIT_WORK(&assoc->del_work, nvmet_fc_delete_assoc);
atomic_set(&assoc->del_work_active, 0);
atomic_set(&assoc->terminating, 0);
while (needrandom) {
......@@ -1243,7 +1240,8 @@ nvmet_fc_find_target_assoc(struct nvmet_fc_tgtport *tgtport,
list_for_each_entry(assoc, &tgtport->assoc_list, a_list) {
if (association_id == assoc->association_id) {
ret = assoc;
nvmet_fc_tgt_a_get(assoc);
if (!nvmet_fc_tgt_a_get(assoc))
ret = NULL;
break;
}
}
......@@ -1477,22 +1475,16 @@ __nvmet_fc_free_assocs(struct nvmet_fc_tgtport *tgtport)
{
struct nvmet_fc_tgt_assoc *assoc, *next;
unsigned long flags;
int ret;
spin_lock_irqsave(&tgtport->lock, flags);
list_for_each_entry_safe(assoc, next,
&tgtport->assoc_list, a_list) {
if (!nvmet_fc_tgt_a_get(assoc))
continue;
ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1);
if (ret == 0) {
if (!schedule_work(&assoc->del_work))
nvmet_fc_tgt_a_put(assoc);
} else {
/* already deleting - release local reference */
nvmet_fc_tgt_a_put(assoc);
}
}
spin_unlock_irqrestore(&tgtport->lock, flags);
}
......@@ -1533,7 +1525,6 @@ nvmet_fc_invalidate_host(struct nvmet_fc_target_port *target_port,
struct nvmet_fc_tgt_assoc *assoc, *next;
unsigned long flags;
bool noassoc = true;
int ret;
spin_lock_irqsave(&tgtport->lock, flags);
list_for_each_entry_safe(assoc, next,
......@@ -1545,15 +1536,10 @@ nvmet_fc_invalidate_host(struct nvmet_fc_target_port *target_port,
continue;
assoc->hostport->invalid = 1;
noassoc = false;
ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1);
if (ret == 0) {
if (!schedule_work(&assoc->del_work))
nvmet_fc_tgt_a_put(assoc);
} else {
/* already deleting - release local reference */
nvmet_fc_tgt_a_put(assoc);
}
}
spin_unlock_irqrestore(&tgtport->lock, flags);
/* if there's nothing to wait for - call the callback */
......@@ -1573,7 +1559,6 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
struct nvmet_fc_tgt_queue *queue;
unsigned long flags;
bool found_ctrl = false;
int ret;
/* this is a bit ugly, but don't want to make locks layered */
spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
......@@ -1597,14 +1582,9 @@ nvmet_fc_delete_ctrl(struct nvmet_ctrl *ctrl)
nvmet_fc_tgtport_put(tgtport);
if (found_ctrl) {
ret = atomic_cmpxchg(&assoc->del_work_active, 0, 1);
if (ret == 0) {
if (!schedule_work(&assoc->del_work))
nvmet_fc_tgt_a_put(assoc);
} else {
/* already deleting - release local reference */
nvmet_fc_tgt_a_put(assoc);
}
return;
}
......
......@@ -444,7 +444,6 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
{
struct nvme_loop_ctrl *ctrl =
container_of(work, struct nvme_loop_ctrl, ctrl.reset_work);
bool changed;
int ret;
nvme_stop_ctrl(&ctrl->ctrl);
......@@ -471,8 +470,8 @@ static void nvme_loop_reset_ctrl_work(struct work_struct *work)
blk_mq_update_nr_hw_queues(&ctrl->tag_set,
ctrl->ctrl.queue_count - 1);
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE))
WARN_ON_ONCE(1);
nvme_start_ctrl(&ctrl->ctrl);
......@@ -567,7 +566,6 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
struct nvmf_ctrl_options *opts)
{
struct nvme_loop_ctrl *ctrl;
bool changed;
int ret;
ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
......@@ -583,6 +581,9 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
if (ret)
goto out_put_ctrl;
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
WARN_ON_ONCE(1);
ret = -ENOMEM;
ctrl->ctrl.sqsize = opts->queue_size - 1;
......@@ -617,8 +618,8 @@ static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev,
dev_info(ctrl->ctrl.device,
"new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn);
changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
WARN_ON_ONCE(!changed);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE))
WARN_ON_ONCE(1);
mutex_lock(&nvme_loop_ctrl_mutex);
list_add_tail(&ctrl->list, &nvme_loop_ctrl_list);
......
......@@ -21,6 +21,8 @@
#include <linux/radix-tree.h>
#include <linux/t10-pi.h>
#define NVMET_DEFAULT_VS NVME_VS(1, 3, 0)
#define NVMET_ASYNC_EVENTS 4
#define NVMET_ERROR_LOG_SLOTS 128
#define NVMET_NO_ERROR_LOC ((u16)-1)
......@@ -52,7 +54,6 @@
(cpu_to_le32(offsetof(struct nvmf_connect_command, x)))
struct nvmet_ns {
struct list_head dev_link;
struct percpu_ref ref;
struct block_device *bdev;
struct file *file;
......@@ -219,7 +220,7 @@ struct nvmet_subsys {
struct mutex lock;
struct kref ref;
struct list_head namespaces;
struct xarray namespaces;
unsigned int nr_namespaces;
unsigned int max_nsid;
u16 cntlid_min;
......@@ -243,6 +244,12 @@ struct nvmet_subsys {
struct config_group allowed_hosts_group;
struct nvmet_subsys_model __rcu *model;
#ifdef CONFIG_NVME_TARGET_PASSTHRU
struct nvme_ctrl *passthru_ctrl;
char *passthru_ctrl_path;
struct config_group passthru_group;
#endif /* CONFIG_NVME_TARGET_PASSTHRU */
};
static inline struct nvmet_subsys *to_subsys(struct config_item *item)
......@@ -322,6 +329,11 @@ struct nvmet_req {
struct bio_vec *bvec;
struct work_struct work;
} f;
struct {
struct request *rq;
struct work_struct work;
bool use_workqueue;
} p;
};
int sg_cnt;
int metadata_sg_cnt;
......@@ -401,6 +413,8 @@ void nvmet_req_complete(struct nvmet_req *req, u16 status);
int nvmet_req_alloc_sgls(struct nvmet_req *req);
void nvmet_req_free_sgls(struct nvmet_req *req);
void nvmet_execute_set_features(struct nvmet_req *req);
void nvmet_execute_get_features(struct nvmet_req *req);
void nvmet_execute_keep_alive(struct nvmet_req *req);
void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
......@@ -533,6 +547,43 @@ static inline u32 nvmet_dsm_len(struct nvmet_req *req)
sizeof(struct nvme_dsm_range);
}
#ifdef CONFIG_NVME_TARGET_PASSTHRU
void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys);
int nvmet_passthru_ctrl_enable(struct nvmet_subsys *subsys);
void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys);
u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req);
u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req);
static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys)
{
return subsys->passthru_ctrl;
}
#else /* CONFIG_NVME_TARGET_PASSTHRU */
static inline void nvmet_passthru_subsys_free(struct nvmet_subsys *subsys)
{
}
static inline void nvmet_passthru_ctrl_disable(struct nvmet_subsys *subsys)
{
}
static inline u16 nvmet_parse_passthru_admin_cmd(struct nvmet_req *req)
{
return 0;
}
static inline u16 nvmet_parse_passthru_io_cmd(struct nvmet_req *req)
{
return 0;
}
static inline struct nvme_ctrl *nvmet_passthru_ctrl(struct nvmet_subsys *subsys)
{
return NULL;
}
#endif /* CONFIG_NVME_TARGET_PASSTHRU */
static inline struct nvme_ctrl *
nvmet_req_passthru_ctrl(struct nvmet_req *req)
{
return nvmet_passthru_ctrl(req->sq->ctrl->subsys);
}
u16 errno_to_nvme_status(struct nvmet_req *req, int errno);
/* Convert a 32-bit number to a 16-bit 0's based number */
......
This diff is collapsed.
......@@ -752,7 +752,7 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct nvmet_rdma_rsp *rsp =
container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
struct nvmet_rdma_queue *queue = cq->cq_context;
struct nvmet_rdma_queue *queue = wc->qp->qp_context;
u16 status = 0;
WARN_ON(rsp->n_rdma <= 0);
......@@ -1008,7 +1008,7 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
struct nvmet_rdma_cmd *cmd =
container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
struct nvmet_rdma_queue *queue = cq->cq_context;
struct nvmet_rdma_queue *queue = wc->qp->qp_context;
struct nvmet_rdma_rsp *rsp;
if (unlikely(wc->status != IB_WC_SUCCESS)) {
......@@ -1258,9 +1258,8 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
*/
nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;
queue->cq = ib_alloc_cq(ndev->device, queue,
nr_cqe + 1, queue->comp_vector,
IB_POLL_WORKQUEUE);
queue->cq = ib_cq_pool_get(ndev->device, nr_cqe + 1,
queue->comp_vector, IB_POLL_WORKQUEUE);
if (IS_ERR(queue->cq)) {
ret = PTR_ERR(queue->cq);
pr_err("failed to create CQ cqe= %d ret= %d\n",
......@@ -1322,7 +1321,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
err_destroy_qp:
rdma_destroy_qp(queue->cm_id);
err_destroy_cq:
ib_free_cq(queue->cq);
ib_cq_pool_put(queue->cq, nr_cqe + 1);
goto out;
}
......@@ -1332,7 +1331,8 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
if (queue->cm_id)
rdma_destroy_id(queue->cm_id);
ib_destroy_qp(queue->qp);
ib_free_cq(queue->cq);
ib_cq_pool_put(queue->cq, queue->recv_queue_size + 2 *
queue->send_queue_size + 1);
}
static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
......
......@@ -672,7 +672,7 @@ enum {
* Values set by the LLDD indicating completion status of the FCP operation.
* Must be set prior to calling the done() callback.
* @transferred_length: amount of DATA_OUT payload data received by a
* a WRITEDATA operation. If not a WRITEDATA operation, value must
* WRITEDATA operation. If not a WRITEDATA operation, value must
* be set to 0. Should equal transfer_length on success.
* @fcp_error: status of the FCP operation. Must be 0 on success; on failure
* must be a NVME_SC_FC_xxxx value.
......
......@@ -312,6 +312,7 @@ enum {
NVME_CTRL_ONCS_WRITE_UNCORRECTABLE = 1 << 1,
NVME_CTRL_ONCS_DSM = 1 << 2,
NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3,
NVME_CTRL_ONCS_RESERVATIONS = 1 << 5,
NVME_CTRL_ONCS_TIMESTAMP = 1 << 6,
NVME_CTRL_VWC_PRESENT = 1 << 0,
NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
......@@ -982,6 +983,7 @@ enum nvme_admin_opcode {
nvme_admin_security_recv = 0x82,
nvme_admin_sanitize_nvm = 0x84,
nvme_admin_get_lba_status = 0x86,
nvme_admin_vendor_start = 0xC0,
};
#define nvme_admin_opcode_name(opcode) { opcode, #opcode }
......@@ -1045,6 +1047,8 @@ enum {
NVME_FEAT_RESV_MASK = 0x82,
NVME_FEAT_RESV_PERSIST = 0x83,
NVME_FEAT_WRITE_PROTECT = 0x84,
NVME_FEAT_VENDOR_START = 0xC0,
NVME_FEAT_VENDOR_END = 0xFF,
NVME_LOG_ERROR = 0x01,
NVME_LOG_SMART = 0x02,
NVME_LOG_FW_SLOT = 0x03,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment