Commit 6aabfa76 authored by Sagi Grimberg's avatar Sagi Grimberg Committed by Roland Dreier

IB/iser: Use single CQ for RX and TX

This will solve a possible condition where we might miss TX completion
(flush error) during session teardown.  Since we are using a single
CQ, we don't need to actively drain the TX CQ, instead just wait for
flush_completion (when counters reach zero) and remove iser_poll_for_flush_errors().

This patch might introduce a minor performance regression on its own,
but the next patches will enhance performance using a single CQ for RX
and TX.
Signed-off-by: default avatarSagi Grimberg <sagig@mellanox.com>
Signed-off-by: default avatarOr Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: default avatarRoland Dreier <roland@purestorage.com>
parent 183cfa43
...@@ -271,16 +271,14 @@ struct iscsi_iser_task; ...@@ -271,16 +271,14 @@ struct iscsi_iser_task;
* struct iser_comp - iSER completion context * struct iser_comp - iSER completion context
* *
* @device: pointer to device handle * @device: pointer to device handle
* @rx_cq: RX completion queue * @cq: completion queue
* @tx_cq: TX completion queue
* @tasklet: Tasklet handle * @tasklet: Tasklet handle
* @active_qps: Number of active QPs attached * @active_qps: Number of active QPs attached
* to completion context * to completion context
*/ */
struct iser_comp { struct iser_comp {
struct iser_device *device; struct iser_device *device;
struct ib_cq *rx_cq; struct ib_cq *cq;
struct ib_cq *tx_cq;
struct tasklet_struct tasklet; struct tasklet_struct tasklet;
int active_qps; int active_qps;
}; };
...@@ -342,6 +340,7 @@ struct fast_reg_descriptor { ...@@ -342,6 +340,7 @@ struct fast_reg_descriptor {
* @device: reference to iser device * @device: reference to iser device
* @comp: iser completion context * @comp: iser completion context
* @pi_support: Indicate device T10-PI support * @pi_support: Indicate device T10-PI support
* @flush_comp: completes when all connection completions consumed
* @lock: protects fmr/fastreg pool * @lock: protects fmr/fastreg pool
* @union.fmr: * @union.fmr:
* @pool: FMR pool for fast registrations * @pool: FMR pool for fast registrations
...@@ -361,6 +360,7 @@ struct ib_conn { ...@@ -361,6 +360,7 @@ struct ib_conn {
struct iser_device *device; struct iser_device *device;
struct iser_comp *comp; struct iser_comp *comp;
bool pi_support; bool pi_support;
struct completion flush_comp;
spinlock_t lock; spinlock_t lock;
union { union {
struct { struct {
...@@ -395,6 +395,7 @@ struct iser_conn { ...@@ -395,6 +395,7 @@ struct iser_conn {
u64 login_req_dma, login_resp_dma; u64 login_req_dma, login_resp_dma;
unsigned int rx_desc_head; unsigned int rx_desc_head;
struct iser_rx_desc *rx_descs; struct iser_rx_desc *rx_descs;
u32 num_rx_descs;
}; };
struct iscsi_iser_task { struct iscsi_iser_task {
......
...@@ -272,7 +272,8 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, ...@@ -272,7 +272,8 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
if (iser_alloc_login_buf(iser_conn)) if (iser_alloc_login_buf(iser_conn))
goto alloc_login_buf_fail; goto alloc_login_buf_fail;
iser_conn->rx_descs = kmalloc(session->cmds_max * iser_conn->num_rx_descs = session->cmds_max;
iser_conn->rx_descs = kmalloc(iser_conn->num_rx_descs *
sizeof(struct iser_rx_desc), GFP_KERNEL); sizeof(struct iser_rx_desc), GFP_KERNEL);
if (!iser_conn->rx_descs) if (!iser_conn->rx_descs)
goto rx_desc_alloc_fail; goto rx_desc_alloc_fail;
......
...@@ -39,14 +39,14 @@ ...@@ -39,14 +39,14 @@
#include "iscsi_iser.h" #include "iscsi_iser.h"
#define ISCSI_ISER_MAX_CONN 8 #define ISCSI_ISER_MAX_CONN 8
#define ISER_MAX_RX_CQ_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN) #define ISER_MAX_RX_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
#define ISER_MAX_TX_CQ_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN) #define ISER_MAX_TX_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN)
#define ISER_MAX_CQ_LEN (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN)
static int iser_cq_poll_limit = 512; static int iser_cq_poll_limit = 512;
static void iser_cq_tasklet_fn(unsigned long data); static void iser_cq_tasklet_fn(unsigned long data);
static void iser_cq_callback(struct ib_cq *cq, void *cq_context); static void iser_cq_callback(struct ib_cq *cq, void *cq_context);
static int iser_drain_tx_cq(struct iser_comp *comp);
static void iser_cq_event_callback(struct ib_event *cause, void *context) static void iser_cq_event_callback(struct ib_event *cause, void *context)
{ {
...@@ -117,26 +117,17 @@ static int iser_create_device_ib_res(struct iser_device *device) ...@@ -117,26 +117,17 @@ static int iser_create_device_ib_res(struct iser_device *device)
struct iser_comp *comp = &device->comps[i]; struct iser_comp *comp = &device->comps[i];
comp->device = device; comp->device = device;
comp->rx_cq = ib_create_cq(device->ib_device, comp->cq = ib_create_cq(device->ib_device,
iser_cq_callback, iser_cq_callback,
iser_cq_event_callback, iser_cq_event_callback,
(void *)comp, (void *)comp,
ISER_MAX_RX_CQ_LEN, i); ISER_MAX_CQ_LEN, i);
if (IS_ERR(comp->rx_cq)) { if (IS_ERR(comp->cq)) {
comp->rx_cq = NULL; comp->cq = NULL;
goto cq_err; goto cq_err;
} }
comp->tx_cq = ib_create_cq(device->ib_device, NULL, if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP))
iser_cq_event_callback,
(void *)comp,
ISER_MAX_TX_CQ_LEN, i);
if (IS_ERR(comp->tx_cq)) {
comp->tx_cq = NULL;
goto cq_err;
}
if (ib_req_notify_cq(comp->rx_cq, IB_CQ_NEXT_COMP))
goto cq_err; goto cq_err;
tasklet_init(&comp->tasklet, iser_cq_tasklet_fn, tasklet_init(&comp->tasklet, iser_cq_tasklet_fn,
...@@ -165,10 +156,8 @@ static int iser_create_device_ib_res(struct iser_device *device) ...@@ -165,10 +156,8 @@ static int iser_create_device_ib_res(struct iser_device *device)
for (i = 0; i < device->comps_used; i++) { for (i = 0; i < device->comps_used; i++) {
struct iser_comp *comp = &device->comps[i]; struct iser_comp *comp = &device->comps[i];
if (comp->tx_cq) if (comp->cq)
ib_destroy_cq(comp->tx_cq); ib_destroy_cq(comp->cq);
if (comp->rx_cq)
ib_destroy_cq(comp->rx_cq);
} }
ib_dealloc_pd(device->pd); ib_dealloc_pd(device->pd);
pd_err: pd_err:
...@@ -189,10 +178,8 @@ static void iser_free_device_ib_res(struct iser_device *device) ...@@ -189,10 +178,8 @@ static void iser_free_device_ib_res(struct iser_device *device)
struct iser_comp *comp = &device->comps[i]; struct iser_comp *comp = &device->comps[i];
tasklet_kill(&comp->tasklet); tasklet_kill(&comp->tasklet);
ib_destroy_cq(comp->tx_cq); ib_destroy_cq(comp->cq);
ib_destroy_cq(comp->rx_cq); comp->cq = NULL;
comp->tx_cq = NULL;
comp->rx_cq = NULL;
} }
(void)ib_unregister_event_handler(&device->event_handler); (void)ib_unregister_event_handler(&device->event_handler);
...@@ -462,8 +449,8 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) ...@@ -462,8 +449,8 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
init_attr.event_handler = iser_qp_event_callback; init_attr.event_handler = iser_qp_event_callback;
init_attr.qp_context = (void *)ib_conn; init_attr.qp_context = (void *)ib_conn;
init_attr.send_cq = ib_conn->comp->tx_cq; init_attr.send_cq = ib_conn->comp->cq;
init_attr.recv_cq = ib_conn->comp->rx_cq; init_attr.recv_cq = ib_conn->comp->cq;
init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS; init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS;
init_attr.cap.max_send_sge = 2; init_attr.cap.max_send_sge = 2;
init_attr.cap.max_recv_sge = 1; init_attr.cap.max_recv_sge = 1;
...@@ -640,33 +627,6 @@ void iser_conn_release(struct iser_conn *iser_conn) ...@@ -640,33 +627,6 @@ void iser_conn_release(struct iser_conn *iser_conn)
kfree(iser_conn); kfree(iser_conn);
} }
/**
* iser_poll_for_flush_errors - Don't settle for less than all.
* @struct ib_conn: IB context of the connection
*
* This routine is called when the QP is in error state
* It polls the send CQ until all flush errors are consumed and
* returns when all flush errors were processed.
*/
static void iser_poll_for_flush_errors(struct ib_conn *ib_conn)
{
int count = 0;
while (ib_conn->post_recv_buf_count > 0 ||
atomic_read(&ib_conn->post_send_buf_count) > 0) {
msleep(100);
if (atomic_read(&ib_conn->post_send_buf_count) > 0)
iser_drain_tx_cq(ib_conn->comp);
count++;
/* Don't flood with prints */
if (count % 30 == 0)
iser_dbg("post_recv %d post_send %d",
ib_conn->post_recv_buf_count,
atomic_read(&ib_conn->post_send_buf_count));
}
}
/** /**
* triggers start of the disconnect procedures and wait for them to be done * triggers start of the disconnect procedures and wait for them to be done
* Called with state mutex held * Called with state mutex held
...@@ -698,7 +658,7 @@ int iser_conn_terminate(struct iser_conn *iser_conn) ...@@ -698,7 +658,7 @@ int iser_conn_terminate(struct iser_conn *iser_conn)
iser_err("Failed to disconnect, conn: 0x%p err %d\n", iser_err("Failed to disconnect, conn: 0x%p err %d\n",
iser_conn, err); iser_conn, err);
iser_poll_for_flush_errors(ib_conn); wait_for_completion(&ib_conn->flush_comp);
} }
return 1; return 1;
...@@ -908,6 +868,7 @@ void iser_conn_init(struct iser_conn *iser_conn) ...@@ -908,6 +868,7 @@ void iser_conn_init(struct iser_conn *iser_conn)
iser_conn->state = ISER_CONN_INIT; iser_conn->state = ISER_CONN_INIT;
iser_conn->ib_conn.post_recv_buf_count = 0; iser_conn->ib_conn.post_recv_buf_count = 0;
atomic_set(&iser_conn->ib_conn.post_send_buf_count, 0); atomic_set(&iser_conn->ib_conn.post_send_buf_count, 0);
init_completion(&iser_conn->ib_conn.flush_comp);
init_completion(&iser_conn->stop_completion); init_completion(&iser_conn->stop_completion);
init_completion(&iser_conn->ib_completion); init_completion(&iser_conn->ib_completion);
init_completion(&iser_conn->up_completion); init_completion(&iser_conn->up_completion);
...@@ -1155,9 +1116,31 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc) ...@@ -1155,9 +1116,31 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc)
return ib_ret; return ib_ret;
} }
/**
* is_iser_tx_desc - Indicate if the completion wr_id
* is a TX descriptor or not.
* @iser_conn: iser connection
* @wr_id: completion WR identifier
*
* Since we cannot rely on wc opcode in FLUSH errors
* we must work around it by checking if the wr_id address
* falls in the iser connection rx_descs buffer. If so
* it is an RX descriptor, otherwize it is a TX.
*/
static inline bool
is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id)
{
void *start = iser_conn->rx_descs;
int len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs);
if (wr_id >= start && wr_id < start + len)
return false;
return true;
}
/** /**
* iser_handle_comp_error() - Handle error completion * iser_handle_comp_error() - Handle error completion
* @desc: iser TX descriptor
* @ib_conn: connection RDMA resources * @ib_conn: connection RDMA resources
* @wc: work completion * @wc: work completion
* *
...@@ -1167,8 +1150,7 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc) ...@@ -1167,8 +1150,7 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc)
* connection is failed (in case we passed bind stage). * connection is failed (in case we passed bind stage).
*/ */
static void static void
iser_handle_comp_error(struct iser_tx_desc *desc, iser_handle_comp_error(struct ib_conn *ib_conn,
struct ib_conn *ib_conn,
struct ib_wc *wc) struct ib_wc *wc)
{ {
struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
...@@ -1179,85 +1161,90 @@ iser_handle_comp_error(struct iser_tx_desc *desc, ...@@ -1179,85 +1161,90 @@ iser_handle_comp_error(struct iser_tx_desc *desc,
iscsi_conn_failure(iser_conn->iscsi_conn, iscsi_conn_failure(iser_conn->iscsi_conn,
ISCSI_ERR_CONN_FAILED); ISCSI_ERR_CONN_FAILED);
if (desc && desc->type == ISCSI_TX_DATAOUT) if (is_iser_tx_desc(iser_conn, (void *)wc->wr_id)) {
struct iser_tx_desc *desc = (struct iser_tx_desc *)wc->wr_id;
atomic_dec(&ib_conn->post_send_buf_count);
if (desc->type == ISCSI_TX_DATAOUT)
kmem_cache_free(ig.desc_cache, desc); kmem_cache_free(ig.desc_cache, desc);
} else {
ib_conn->post_recv_buf_count--;
}
} }
static int iser_drain_tx_cq(struct iser_comp *comp) /**
* iser_handle_wc - handle a single work completion
* @wc: work completion
*
* Soft-IRQ context, work completion can be either
* SEND or RECV, and can turn out successful or
* with error (or flush error).
*/
static void iser_handle_wc(struct ib_wc *wc)
{ {
struct ib_cq *cq = comp->tx_cq;
struct ib_wc wc;
struct iser_tx_desc *tx_desc;
struct ib_conn *ib_conn; struct ib_conn *ib_conn;
int completed_tx = 0; struct iser_tx_desc *tx_desc;
struct iser_rx_desc *rx_desc;
while (ib_poll_cq(cq, 1, &wc) == 1) { ib_conn = wc->qp->qp_context;
tx_desc = (struct iser_tx_desc *) (unsigned long) wc.wr_id; if (wc->status == IB_WC_SUCCESS) {
ib_conn = wc.qp->qp_context; if (wc->opcode == IB_WC_RECV) {
if (wc.status == IB_WC_SUCCESS) { rx_desc = (struct iser_rx_desc *)wc->wr_id;
if (wc.opcode == IB_WC_SEND) iser_rcv_completion(rx_desc, wc->byte_len,
ib_conn);
} else
if (wc->opcode == IB_WC_SEND) {
tx_desc = (struct iser_tx_desc *)wc->wr_id;
iser_snd_completion(tx_desc, ib_conn); iser_snd_completion(tx_desc, ib_conn);
else
iser_err("expected opcode %d got %d\n",
IB_WC_SEND, wc.opcode);
} else {
iser_err("tx id %llx status %d vend_err %x\n",
wc.wr_id, wc.status, wc.vendor_err);
if (wc.wr_id != ISER_FASTREG_LI_WRID) {
atomic_dec(&ib_conn->post_send_buf_count); atomic_dec(&ib_conn->post_send_buf_count);
iser_handle_comp_error(tx_desc, ib_conn, &wc); } else {
} iser_err("Unknown wc opcode %d\n", wc->opcode);
} }
completed_tx++; } else {
if (wc->status != IB_WC_WR_FLUSH_ERR)
iser_err("wr id %llx status %d vend_err %x\n",
wc->wr_id, wc->status, wc->vendor_err);
else
iser_dbg("flush error: wr id %llx\n", wc->wr_id);
if (wc->wr_id != ISER_FASTREG_LI_WRID)
iser_handle_comp_error(ib_conn, wc);
/* complete in case all flush errors were consumed */
if (ib_conn->post_recv_buf_count == 0 &&
atomic_read(&ib_conn->post_send_buf_count) == 0)
complete(&ib_conn->flush_comp);
} }
return completed_tx;
} }
/**
* iser_cq_tasklet_fn - iSER completion polling loop
* @data: iSER completion context
*
* Soft-IRQ context, polling connection CQ until
* either CQ was empty or we exausted polling budget
*/
static void iser_cq_tasklet_fn(unsigned long data) static void iser_cq_tasklet_fn(unsigned long data)
{ {
struct iser_comp *comp = (struct iser_comp *)data; struct iser_comp *comp = (struct iser_comp *)data;
struct ib_cq *cq = comp->rx_cq; struct ib_cq *cq = comp->cq;
struct ib_wc wc; struct ib_wc wc;
struct iser_rx_desc *desc; int completed = 0;
unsigned long xfer_len;
struct ib_conn *ib_conn;
int completed_tx, completed_rx = 0;
/* First do tx drain, so in a case where we have rx flushes and a successful
* tx completion we will still go through completion error handling.
*/
completed_tx = iser_drain_tx_cq(comp);
while (ib_poll_cq(cq, 1, &wc) == 1) { while (ib_poll_cq(cq, 1, &wc) == 1) {
desc = (struct iser_rx_desc *) (unsigned long) wc.wr_id; iser_handle_wc(&wc);
BUG_ON(desc == NULL);
ib_conn = wc.qp->qp_context; if (++completed >= iser_cq_poll_limit)
if (wc.status == IB_WC_SUCCESS) {
if (wc.opcode == IB_WC_RECV) {
xfer_len = (unsigned long)wc.byte_len;
iser_rcv_completion(desc, xfer_len, ib_conn);
} else
iser_err("expected opcode %d got %d\n",
IB_WC_RECV, wc.opcode);
} else {
if (wc.status != IB_WC_WR_FLUSH_ERR)
iser_err("rx id %llx status %d vend_err %x\n",
wc.wr_id, wc.status, wc.vendor_err);
ib_conn->post_recv_buf_count--;
iser_handle_comp_error(NULL, ib_conn, &wc);
}
completed_rx++;
if (!(completed_rx & 63))
completed_tx += iser_drain_tx_cq(comp);
if (completed_rx >= iser_cq_poll_limit)
break; break;
} }
/* #warning "it is assumed here that arming CQ only once its empty" *
* " would not cause interrupts to be missed" */ /*
* It is assumed here that arming CQ only once its empty
* would not cause interrupts to be missed.
*/
ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
iser_dbg("got %d rx %d tx completions\n", completed_rx, completed_tx); iser_dbg("got %d completions\n", completed);
} }
static void iser_cq_callback(struct ib_cq *cq, void *cq_context) static void iser_cq_callback(struct ib_cq *cq, void *cq_context)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment