Commit 14d3a3b2 authored by Christoph Hellwig

IB: add a proper completion queue abstraction

This adds an abstraction that allows ULPs to simply pass a completion
object and completion callback with each submitted WR and let the RDMA
core handle the nitty-gritty details of handling completion interrupts
and polling the CQ.

In detail there is a new ib_cqe structure which just contains the
completion callback, and which can be used to get at the containing
object using container_of.  It is pointed to by the WR and WC as an
alternative to the wr_id field, similar to how many ULPs already use
the field to store a pointer using casts.
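
For illustration only (not part of this patch): a ULP would typically
embed the ib_cqe in its own request structure and recover the request
in the done callback with container_of. The names my_request and
my_send_done below are made up for this sketch.

	struct my_request {
		struct ib_cqe	cqe;	/* embedded completion entry */
		/* ... ULP-specific state ... */
	};

	static void my_send_done(struct ib_cq *cq, struct ib_wc *wc)
	{
		struct my_request *req =
			container_of(wc->wr_cqe, struct my_request, cqe);

		if (wc->status != IB_WC_SUCCESS) {
			/* flush or error completion; nothing more to do */
			return;
		}
		/* ... complete the request ... */
	}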

A driver using the new completion callbacks allocates its CQs using
the new ib_alloc_cq API, which in addition to the number of CQEs and
the completion vector also takes the context the CQ is polled from.
Three contexts are available: IB_POLL_DIRECT for drivers that never
take CQ interrupts and just poll for completions themselves,
IB_POLL_SOFTIRQ to poll from softirq context using the irq_poll
(formerly blk-iopoll) infrastructure, which takes care of rearming and
budgeting, and IB_POLL_WORKQUEUE for consumers that want to be called
from process context.
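
A rough sketch of the allocation and posting side, reusing the
hypothetical my_send_done callback from above; dev, qp, req and sge are
assumed to exist, and the CQE count and completion vector are arbitrary:

	struct ib_cq *cq;
	struct ib_send_wr wr = { }, *bad_wr;
	int ret;

	/* CQ processed from the shared completion workqueue */
	cq = ib_alloc_cq(dev, NULL, 128, 0, IB_POLL_WORKQUEUE);
	if (IS_ERR(cq))
		return PTR_ERR(cq);

	req->cqe.done = my_send_done;	/* callback instead of a wr_id cookie */
	wr.wr_cqe = &req->cqe;
	wr.opcode = IB_WR_SEND;
	wr.sg_list = &sge;
	wr.num_sge = 1;

	ret = ib_post_send(qp, &wr, &bad_wr);

	/* ... once the QP is drained and the CQ is no longer needed: */
	ib_free_cq(cq);

For an IB_POLL_DIRECT CQ the ULP instead calls ib_process_cq_direct()
from its own polling loop; no completion interrupts are requested.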

Thanks a lot to Sagi Grimberg, who helped review the API, wrote the
current version of the workqueue code because my two previous attempts
sucked too much, and converted the iSER initiator to the new API.
Signed-off-by: Christoph Hellwig <hch@lst.de>
parent 839a301d
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
@@ -5,6 +5,7 @@ menuconfig INFINIBAND
 	depends on NET
 	depends on INET
 	depends on m || IPV6 != m
+	select IRQ_POLL
 	---help---
 	  Core support for InfiniBand (IB). Make sure to also select
 	  any protocols you wish to use as well as drivers for your
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
@@ -8,7 +8,7 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o
 obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
 				$(user_access-y)
-ib_core-y :=			packer.o ud_header.o verbs.o sysfs.o \
+ib_core-y :=			packer.o ud_header.o verbs.o cq.o sysfs.o \
 				device.o fmr_pool.o cache.o netlink.o \
 				roce_gid_mgmt.o
 ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
new file mode 100644
/*
* Copyright (c) 2015 HGST, a Western Digital Company.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*/
#include <linux/module.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <rdma/ib_verbs.h>
/* # of WCs to poll for with a single call to ib_poll_cq */
#define IB_POLL_BATCH 16
/* # of WCs to iterate over before yielding */
#define IB_POLL_BUDGET_IRQ 256
#define IB_POLL_BUDGET_WORKQUEUE 65536
#define IB_POLL_FLAGS \
(IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS)
static int __ib_process_cq(struct ib_cq *cq, int budget)
{
int i, n, completed = 0;
while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) {
for (i = 0; i < n; i++) {
struct ib_wc *wc = &cq->wc[i];
if (wc->wr_cqe)
wc->wr_cqe->done(cq, wc);
else
WARN_ON_ONCE(wc->status == IB_WC_SUCCESS);
}
completed += n;
if (n != IB_POLL_BATCH ||
(budget != -1 && completed >= budget))
break;
}
return completed;
}
/**
* ib_process_cq_direct - process a CQ in caller context
* @cq: CQ to process
* @budget: number of CQEs to poll for
*
* This function is used to process all outstanding CQ entries on a
* %IB_POLL_DIRECT CQ. It does not offload CQ processing to a different
* context and does not ask for completion interrupts from the HCA.
*
* Note: for compatibility reasons -1 can be passed in %budget for unlimited
* polling. Do not use this feature in new code, it will be removed soon.
*/
int ib_process_cq_direct(struct ib_cq *cq, int budget)
{
WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT);
return __ib_process_cq(cq, budget);
}
EXPORT_SYMBOL(ib_process_cq_direct);
static void ib_cq_completion_direct(struct ib_cq *cq, void *private)
{
WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq);
}
static int ib_poll_handler(struct irq_poll *iop, int budget)
{
struct ib_cq *cq = container_of(iop, struct ib_cq, iop);
int completed;
completed = __ib_process_cq(cq, budget);
if (completed < budget) {
irq_poll_complete(&cq->iop);
if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
irq_poll_sched(&cq->iop);
}
return completed;
}
static void ib_cq_completion_softirq(struct ib_cq *cq, void *private)
{
irq_poll_sched(&cq->iop);
}
static void ib_cq_poll_work(struct work_struct *work)
{
struct ib_cq *cq = container_of(work, struct ib_cq, work);
int completed;
completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE);
if (completed >= IB_POLL_BUDGET_WORKQUEUE ||
ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0)
queue_work(ib_comp_wq, &cq->work);
}
static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
{
queue_work(ib_comp_wq, &cq->work);
}
/**
* ib_alloc_cq - allocate a completion queue
* @dev: device to allocate the CQ for
* @private: driver private data, accessible from cq->cq_context
* @nr_cqe: number of CQEs to allocate
* @comp_vector: HCA completion vectors for this CQ
* @poll_ctx: context to poll the CQ from.
*
* This is the proper interface to allocate a CQ for in-kernel users. A
* CQ allocated with this interface will automatically be polled from the
* specified context. The ULP must use wr->wr_cqe instead of wr->wr_id
* to use this CQ abstraction.
*/
struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx)
{
struct ib_cq_init_attr cq_attr = {
.cqe = nr_cqe,
.comp_vector = comp_vector,
};
struct ib_cq *cq;
int ret = -ENOMEM;
cq = dev->create_cq(dev, &cq_attr, NULL, NULL);
if (IS_ERR(cq))
return cq;
cq->device = dev;
cq->uobject = NULL;
cq->event_handler = NULL;
cq->cq_context = private;
cq->poll_ctx = poll_ctx;
atomic_set(&cq->usecnt, 0);
cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL);
if (!cq->wc)
goto out_destroy_cq;
switch (cq->poll_ctx) {
case IB_POLL_DIRECT:
cq->comp_handler = ib_cq_completion_direct;
break;
case IB_POLL_SOFTIRQ:
cq->comp_handler = ib_cq_completion_softirq;
irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler);
ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
break;
case IB_POLL_WORKQUEUE:
cq->comp_handler = ib_cq_completion_workqueue;
INIT_WORK(&cq->work, ib_cq_poll_work);
ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
break;
default:
ret = -EINVAL;
goto out_free_wc;
}
return cq;
out_free_wc:
kfree(cq->wc);
out_destroy_cq:
cq->device->destroy_cq(cq);
return ERR_PTR(ret);
}
EXPORT_SYMBOL(ib_alloc_cq);
/**
* ib_free_cq - free a completion queue
* @cq: completion queue to free.
*/
void ib_free_cq(struct ib_cq *cq)
{
int ret;
if (WARN_ON_ONCE(atomic_read(&cq->usecnt)))
return;
switch (cq->poll_ctx) {
case IB_POLL_DIRECT:
break;
case IB_POLL_SOFTIRQ:
irq_poll_disable(&cq->iop);
break;
case IB_POLL_WORKQUEUE:
flush_work(&cq->work);
break;
default:
WARN_ON_ONCE(1);
}
kfree(cq->wc);
ret = cq->device->destroy_cq(cq);
WARN_ON_ONCE(ret);
}
EXPORT_SYMBOL(ib_free_cq);
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
@@ -58,6 +58,7 @@ struct ib_client_data {
 	bool going_down;
 };
 
+struct workqueue_struct *ib_comp_wq;
 struct workqueue_struct *ib_wq;
 EXPORT_SYMBOL_GPL(ib_wq);
 
@@ -954,10 +955,18 @@ static int __init ib_core_init(void)
 	if (!ib_wq)
 		return -ENOMEM;
 
+	ib_comp_wq = alloc_workqueue("ib-comp-wq",
+			WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM,
+			WQ_UNBOUND_MAX_ACTIVE);
+	if (!ib_comp_wq) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
 	ret = class_register(&ib_class);
 	if (ret) {
 		printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
-		goto err;
+		goto err_comp;
 	}
 
 	ret = ibnl_init();
@@ -972,7 +981,8 @@ static int __init ib_core_init(void)
 
 err_sysfs:
 	class_unregister(&ib_class);
-
+err_comp:
+	destroy_workqueue(ib_comp_wq);
 err:
 	destroy_workqueue(ib_wq);
 	return ret;
@@ -983,6 +993,7 @@ static void __exit ib_core_cleanup(void)
 	ib_cache_cleanup();
 	ibnl_cleanup();
 	class_unregister(&ib_class);
+	destroy_workqueue(ib_comp_wq);
 	/* Make sure that any pending umem accounting work is done. */
 	destroy_workqueue(ib_wq);
 }
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -70,7 +70,6 @@ static struct ib_qp_attr ipoib_cm_err_attr = {
 #define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
 
 static struct ib_send_wr ipoib_cm_rx_drain_wr = {
-	.wr_id = IPOIB_CM_RX_DRAIN_WRID,
 	.opcode = IB_WR_SEND,
 };
 
@@ -223,6 +222,7 @@ static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
 	 * error" WC will be immediately generated for each WR we post.
 	 */
 	p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
+	ipoib_cm_rx_drain_wr.wr_id = IPOIB_CM_RX_DRAIN_WRID;
 	if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
 		ipoib_warn(priv, "failed to post drain wr\n");
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -457,10 +457,11 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
 static void srp_destroy_qp(struct srp_rdma_ch *ch)
 {
 	static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
-	static struct ib_recv_wr wr = { .wr_id = SRP_LAST_WR_ID };
+	static struct ib_recv_wr wr = { 0 };
 	struct ib_recv_wr *bad_wr;
 	int ret;
 
+	wr.wr_id = SRP_LAST_WR_ID;
 	/* Destroying a QP and reusing ch->done is only safe if not connected */
 	WARN_ON_ONCE(ch->connected);
 
@@ -1042,13 +1043,14 @@ static int srp_inv_rkey(struct srp_rdma_ch *ch, u32 rkey)
 	struct ib_send_wr *bad_wr;
 	struct ib_send_wr wr = {
 		.opcode		    = IB_WR_LOCAL_INV,
-		.wr_id		    = LOCAL_INV_WR_ID_MASK,
 		.next		    = NULL,
 		.num_sge	    = 0,
 		.send_flags	    = 0,
 		.ex.invalidate_rkey = rkey,
 	};
 
+	wr.wr_id = LOCAL_INV_WR_ID_MASK;
 	return ib_post_send(ch->qp, &wr, &bad_wr);
 }
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
@@ -49,6 +49,7 @@
 #include <linux/scatterlist.h>
 #include <linux/workqueue.h>
 #include <linux/socket.h>
+#include <linux/irq_poll.h>
 #include <uapi/linux/if_ether.h>
 
 #include <linux/atomic.h>
@@ -56,6 +57,7 @@
 #include <asm/uaccess.h>
 
 extern struct workqueue_struct *ib_wq;
+extern struct workqueue_struct *ib_comp_wq;
 
 union ib_gid {
 	u8	raw[16];
@@ -758,7 +760,10 @@ enum ib_wc_flags {
 };
 
 struct ib_wc {
-	u64			wr_id;
+	union {
+		u64		wr_id;
+		struct ib_cqe	*wr_cqe;
+	};
 	enum ib_wc_status	status;
 	enum ib_wc_opcode	opcode;
 	u32			vendor_err;
@@ -1079,9 +1084,16 @@ struct ib_mw_bind_info {
 	int			mw_access_flags;
 };
 
+struct ib_cqe {
+	void (*done)(struct ib_cq *cq, struct ib_wc *wc);
+};
+
 struct ib_send_wr {
 	struct ib_send_wr      *next;
-	u64			wr_id;
+	union {
+		u64		wr_id;
+		struct ib_cqe	*wr_cqe;
+	};
 	struct ib_sge	       *sg_list;
 	int			num_sge;
 	enum ib_wr_opcode	opcode;
@@ -1175,7 +1187,10 @@ static inline struct ib_sig_handover_wr *sig_handover_wr(struct ib_send_wr *wr)
 
 struct ib_recv_wr {
 	struct ib_recv_wr      *next;
-	u64			wr_id;
+	union {
+		u64		wr_id;
+		struct ib_cqe	*wr_cqe;
+	};
 	struct ib_sge	       *sg_list;
 	int			num_sge;
 };
@@ -1306,6 +1321,12 @@ struct ib_ah {
 
 typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context);
 
+enum ib_poll_context {
+	IB_POLL_DIRECT,		/* caller context, no hw completions */
+	IB_POLL_SOFTIRQ,	/* poll from softirq context */
+	IB_POLL_WORKQUEUE,	/* poll from workqueue */
+};
+
 struct ib_cq {
 	struct ib_device       *device;
 	struct ib_uobject      *uobject;
@@ -1314,6 +1335,12 @@ struct ib_cq {
 	void                   *cq_context;
 	int			cqe;
 	atomic_t		usecnt; /* count number of work queues */
+	enum ib_poll_context	poll_ctx;
+	struct ib_wc		*wc;
+	union {
+		struct irq_poll		iop;
+		struct work_struct	work;
+	};
 };
 
 struct ib_srq {
@@ -2453,6 +2480,11 @@ static inline int ib_post_recv(struct ib_qp *qp,
 	return qp->device->post_recv(qp, recv_wr, bad_recv_wr);
 }
 
+struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
+		int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx);
+void ib_free_cq(struct ib_cq *cq);
+int ib_process_cq_direct(struct ib_cq *cq, int budget);
+
 /**
  * ib_create_cq - Creates a CQ on the specified device.
  * @device: The device on which to create the CQ.