IB/iser: Fix RNR errors

Some users complain about RNR errors on the target, when heavy high-priority tasks run on the initiator. After the investigation, we found out that the receive WRs were exhausted, because the initiator could not post them on time. Receive work reqeusts are posted in chunks to reduce the number of hits to the HCA. The WRs are posted in the receive completion handler when the number of free receive buffers reaches the threshold. But on a high-loaded host, receive CQEs processing can be delayed and all receive WRs will be exhausted. In this case, the target will get an RNR error. To avoid this, we post receive WR, as soon as possible and not in a batch. This increases the number of hits to the HCA, but also the common implementation in most of Linux ULPs (e.g. NVMe-oF/RDMA). As a rule of thumb, performance improvements and heuristics are being added to the RDMA core layer or vendors low level drivers and it's about time to align iSER as well. Link: https://lore.kernel.org/r/20211215135721.3662-3-mgurtovoy@nvidia.comSigned-off-by: Sergey Gorenko <sergeygo@nvidia.com> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Israel Rukshin <israelr@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

IB/iser: Fix RNR errors
Some users complain about RNR errors on the target, when heavy high-priority tasks run on the initiator. After the investigation, we found out that the receive WRs were exhausted, because the initiator could not post them on time. Receive work reqeusts are posted in chunks to reduce the number of hits to the HCA. The WRs are posted in the receive completion handler when the number of free receive buffers reaches the threshold. But on a high-loaded host, receive CQEs processing can be delayed and all receive WRs will be exhausted. In this case, the target will get an RNR error. To avoid this, we post receive WR, as soon as possible and not in a batch. This increases the number of hits to the HCA, but also the common implementation in most of Linux ULPs (e.g. NVMe-oF/RDMA). As a rule of thumb, performance improvements and heuristics are being added to the RDMA core layer or vendors low level drivers and it's about time to align iSER as well. Link: https://lore.kernel.org/r/20211215135721.3662-3-mgurtovoy@nvidia.comSigned-off-by: Sergey Gorenko <sergeygo@nvidia.com> Signed-off-by: Max Gurtovoy <mgurtovoy@nvidia.com> Reviewed-by: Israel Rukshin <israelr@nvidia.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
39b169ea · Sergey Gorenko · Jason Gunthorpe · b28801a0 · 39b169ea · 39b169ea
Commit 39b169ea authored Dec 15, 2021 by Sergey Gorenko Committed by Jason Gunthorpe Jan 05, 2022
3 changed files
--- a/drivers/infiniband/ulp/iser/iscsi_iser.h
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -119,8 +119,6 @@

 #define ISER_QP_MAX_RECV_DTOS		(ISER_DEF_XMIT_CMDS_MAX)

-#define ISER_MIN_POSTED_RX		(ISER_DEF_XMIT_CMDS_MAX >> 2)
-
 /* the max TX (send) WR supported by the iSER QP is defined by                 *
 * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect   *
 * to have at max for SCSI command. The tx posting & completion handling code  *
@@ -366,9 +364,7 @@ struct iser_fr_pool {
 * @qp:                  Connection Queue-pair
 * @cq:                  Connection completion queue
 * @cq_size:             The number of max outstanding completions
- * @post_recv_buf_count: post receive counter
 * @sig_count:           send work request signal count
- * @rx_wr:               receive work request for batch posts
 * @device:              reference to iser device
 * @fr_pool:             connection fast registration poool
 * @pi_support:          Indicate device T10-PI support
@@ -379,9 +375,7 @@ struct ib_conn {
 	struct ib_qp	            *qp;
 	struct ib_cq		    *cq;
 	u32			    cq_size;
-	int                          post_recv_buf_count;
 	u8                           sig_count;
-	struct ib_recv_wr	     rx_wr[ISER_MIN_POSTED_RX];
 	struct iser_device          *device;
 	struct iser_fr_pool          fr_pool;
 	bool			     pi_support;
@@ -397,8 +391,6 @@ struct ib_conn {
 * @state:            connection logical state
 * @qp_max_recv_dtos: maximum number of data outs, corresponds
 *                    to max number of post recvs
- * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1)
- * @min_posted_rx:    (qp_max_recv_dtos >> 2)
 * @max_cmds:         maximum cmds allowed for this connection
 * @name:             connection peer portal
 * @release_work:     deffered work for release job
@@ -409,7 +401,6 @@ struct ib_conn {
 *                    (state is ISER_CONN_UP)
 * @conn_list:        entry in ig conn list
 * @login_desc:       login descriptor
- * @rx_desc_head:     head of rx_descs cyclic buffer
 * @rx_descs:         rx buffers array (cyclic buffer)
 * @num_rx_descs:     number of rx descriptors
 * @scsi_sg_tablesize: scsi host sg_tablesize
@@ -422,8 +413,6 @@ struct iser_conn {
 	struct iscsi_endpoint	     *ep;
 	enum iser_conn_state	     state;
 	unsigned		     qp_max_recv_dtos;
-	unsigned		     qp_max_recv_dtos_mask;
-	unsigned		     min_posted_rx;
 	u16                          max_cmds;
 	char 			     name[ISER_OBJECT_NAME_SIZE];
 	struct work_struct	     release_work;
@@ -433,7 +422,6 @@ struct iser_conn {
 	struct completion	     up_completion;
 	struct list_head	     conn_list;
 	struct iser_login_desc       login_desc;
-	unsigned int 		     rx_desc_head;
 	struct iser_rx_desc	     *rx_descs;
 	u32                          num_rx_descs;
 	unsigned short               scsi_sg_tablesize;
@@ -542,7 +530,8 @@ int  iser_connect(struct iser_conn *iser_conn,
 		  int non_blocking);

 int  iser_post_recvl(struct iser_conn *iser_conn);
-int  iser_post_recvm(struct iser_conn *iser_conn, int count);
+int  iser_post_recvm(struct iser_conn *iser_conn,
+		     struct iser_rx_desc *rx_desc);
 int  iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
 		    bool signal);


--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -247,8 +247,6 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
 	struct iser_device *device = ib_conn->device;

 	iser_conn->qp_max_recv_dtos = session->cmds_max;
-	iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */
-	iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2;

 	if (iser_alloc_fastreg_pool(ib_conn, session->scsi_cmds_max,
 				    iser_conn->pages_per_mr))
@@ -280,7 +278,6 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
 		rx_sg->lkey = device->pd->local_dma_lkey;
 	}

-	iser_conn->rx_desc_head = 0;
 	return 0;

 rx_desc_dma_map_failed:
@@ -322,32 +319,35 @@ void iser_free_rx_descriptors(struct iser_conn *iser_conn)
 static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req)
 {
 	struct iser_conn *iser_conn = conn->dd_data;
-	struct ib_conn *ib_conn = &iser_conn->ib_conn;
 	struct iscsi_session *session = conn->session;
+	int err = 0;
+	int i;

 	iser_dbg("req op %x flags %x\n", req->opcode, req->flags);
 	/* check if this is the last login - going to full feature phase */
 	if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE)
-		return 0;
-
-	/*
-	 * Check that there is one posted recv buffer
-	 * (for the last login response).
-	 */
-	WARN_ON(ib_conn->post_recv_buf_count != 1);
+		goto out;

 	if (session->discovery_sess) {
 		iser_info("Discovery session, re-using login RX buffer\n");
-		return 0;
-	} else
-		iser_info("Normal session, posting batch of RX %d buffers\n",
-			  iser_conn->min_posted_rx);
+		goto out;
+	}

-	/* Initial post receive buffers */
-	if (iser_post_recvm(iser_conn, iser_conn->min_posted_rx))
-		return -ENOMEM;
+	iser_info("Normal session, posting batch of RX %d buffers\n",
+		  iser_conn->qp_max_recv_dtos - 1);

-	return 0;
+	/*
+	 * Initial post receive buffers.
+	 * There is one already posted recv buffer (for the last login
+	 * response). Therefore, the first recv buffer is skipped here.
+	 */
+	for (i = 1; i < iser_conn->qp_max_recv_dtos; i++) {
+		err = iser_post_recvm(iser_conn, &iser_conn->rx_descs[i]);
+		if (err)
+			goto out;
+	}
+out:
+	return err;
 }

 static inline bool iser_signal_comp(u8 sig_count)
@@ -590,7 +590,11 @@ void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc)
 				      desc->rsp_dma, ISER_RX_LOGIN_SIZE,
 				      DMA_FROM_DEVICE);

-	ib_conn->post_recv_buf_count--;
+	if (iser_conn->iscsi_conn->session->discovery_sess)
+		return;
+
+	/* Post the first RX buffer that is skipped in iser_post_rx_bufs() */
+	iser_post_recvm(iser_conn, iser_conn->rx_descs);
 }

 static inline int
@@ -657,8 +661,7 @@ void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc)
 	struct iser_conn *iser_conn = to_iser_conn(ib_conn);
 	struct iser_rx_desc *desc = iser_rx(wc->wr_cqe);
 	struct iscsi_hdr *hdr;
-	int length;
-	int outstanding, count, err;
+	int length, err;

 	if (unlikely(wc->status != IB_WC_SUCCESS)) {
 		iser_err_comp(wc, "task_rsp");
@@ -687,20 +690,9 @@ void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc)
 				      desc->dma_addr, ISER_RX_PAYLOAD_SIZE,
 				      DMA_FROM_DEVICE);

-	/* decrementing conn->post_recv_buf_count only --after-- freeing the   *
-	 * task eliminates the need to worry on tasks which are completed in   *
-	 * parallel to the execution of iser_conn_term. So the code that waits *
-	 * for the posted rx bufs refcount to become zero handles everything   */
-	ib_conn->post_recv_buf_count--;
-
-	outstanding = ib_conn->post_recv_buf_count;
-	if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) {
-		count = min(iser_conn->qp_max_recv_dtos - outstanding,
-			    iser_conn->min_posted_rx);
-		err = iser_post_recvm(iser_conn, count);
-		if (err)
-			iser_err("posting %d rx bufs err %d\n", count, err);
-	}
+	err = iser_post_recvm(iser_conn, desc);
+	if (err)
+		iser_err("posting rx buffer err %d\n", err);
 }

 void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc)

--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -757,7 +757,6 @@ void iser_conn_init(struct iser_conn *iser_conn)
 	INIT_LIST_HEAD(&iser_conn->conn_list);
 	mutex_init(&iser_conn->state_mutex);

-	ib_conn->post_recv_buf_count = 0;
 	ib_conn->reg_cqe.done = iser_reg_comp;
 }

@@ -841,44 +840,28 @@ int iser_post_recvl(struct iser_conn *iser_conn)
 	wr.num_sge = 1;
 	wr.next = NULL;

-	ib_conn->post_recv_buf_count++;
 	ib_ret = ib_post_recv(ib_conn->qp, &wr, NULL);
-	if (ib_ret) {
-		iser_err("ib_post_recv failed ret=%d\n", ib_ret);
-		ib_conn->post_recv_buf_count--;
-	}
+	if (unlikely(ib_ret))
+		iser_err("ib_post_recv login failed ret=%d\n", ib_ret);

 	return ib_ret;
 }

-int iser_post_recvm(struct iser_conn *iser_conn, int count)
+int iser_post_recvm(struct iser_conn *iser_conn, struct iser_rx_desc *rx_desc)
 {
 	struct ib_conn *ib_conn = &iser_conn->ib_conn;
-	unsigned int my_rx_head = iser_conn->rx_desc_head;
-	struct iser_rx_desc *rx_desc;
-	struct ib_recv_wr *wr;
-	int i, ib_ret;
-
-	for (wr = ib_conn->rx_wr, i = 0; i < count; i++, wr++) {
-		rx_desc = &iser_conn->rx_descs[my_rx_head];
-		rx_desc->cqe.done = iser_task_rsp;
-		wr->wr_cqe = &rx_desc->cqe;
-		wr->sg_list = &rx_desc->rx_sg;
-		wr->num_sge = 1;
-		wr->next = wr + 1;
-		my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask;
-	}
+	struct ib_recv_wr wr;
+	int ib_ret;

-	wr--;
-	wr->next = NULL; /* mark end of work requests list */
+	rx_desc->cqe.done = iser_task_rsp;
+	wr.wr_cqe = &rx_desc->cqe;
+	wr.sg_list = &rx_desc->rx_sg;
+	wr.num_sge = 1;
+	wr.next = NULL;

-	ib_conn->post_recv_buf_count += count;
-	ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, NULL);
-	if (unlikely(ib_ret)) {
+	ib_ret = ib_post_recv(ib_conn->qp, &wr, NULL);
+	if (unlikely(ib_ret))
 		iser_err("ib_post_recv failed ret=%d\n", ib_ret);
-		ib_conn->post_recv_buf_count -= count;
-	} else
-		iser_conn->rx_desc_head = my_rx_head;

 	return ib_ret;
 }