IB/hfi1: Remove cache and embed txreq in ring

This patch removes kmem cache allocation and deallocation in favor of having the ipoib_txreq in the ring. The consumer is now the packet sending side allocating tx descriptors from ring and the producer is the napi interrupt handling freeing tx descriptors. The locks are now eliminated because the napi tx lock insures a single consumer and the napi handling insures a single producer. The napi poll is converted to memory poll looking for items that have been marked completed. Fixes: d99dc602 ("IB/hfi1: Add functions to transmit datagram ipoib packets") Link: https://lore.kernel.org/r/20210913132826.131370.4397.stgit@awfm-01.cornelisnetworks.comSigned-off-by: Mike Marciniszyn <mike.marciniszyn@cornelisnetworks.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>

IB/hfi1: Remove cache and embed txreq in ring
This patch removes kmem cache allocation and deallocation in favor of having the ipoib_txreq in the ring. The consumer is now the packet sending side allocating tx descriptors from ring and the producer is the napi interrupt handling freeing tx descriptors. The locks are now eliminated because the napi tx lock insures a single consumer and the napi handling insures a single producer. The napi poll is converted to memory poll looking for items that have been marked completed. Fixes: d99dc602 ("IB/hfi1: Add functions to transmit datagram ipoib packets") Link: https://lore.kernel.org/r/20210913132826.131370.4397.stgit@awfm-01.cornelisnetworks.comSigned-off-by: Mike Marciniszyn <mike.marciniszyn@cornelisnetworks.com> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com> Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
d47dfc2b · Mike Marciniszyn · Jason Gunthorpe · 450f4f6a · d47dfc2b · d47dfc2b
Commit d47dfc2b authored Sep 13, 2021 by Mike Marciniszyn Committed by Jason Gunthorpe Sep 27, 2021
Show whitespace changes
Inline Side-by-side

Showing with 101 additions and 128 deletions

drivers/infiniband/hw/hfi1/ipoib.h drivers/infiniband/hw/hfi1/ipoib.h +7 -10

drivers/infiniband/hw/hfi1/ipoib_tx.c drivers/infiniband/hw/hfi1/ipoib_tx.c +94 -118

No files found.
--- a/drivers/infiniband/hw/hfi1/ipoib.h
+++ b/drivers/infiniband/hw/hfi1/ipoib.h
@@ -45,21 +45,19 @@ union hfi1_ipoib_flow {

 /**
 * struct hfi1_ipoib_circ_buf - List of items to be processed
- * @items: ring of items
+ * @items: ring of items each a power of two size
 * @head: ring head
 * @tail: ring tail
 * @max_items: max items + 1 that the ring can contain
- * @producer_lock: producer sync lock
- * @consumer_lock: consumer sync lock
+ * @shift: log2 of size for getting txreq
 */
 struct ipoib_txreq;
 struct hfi1_ipoib_circ_buf {
-	struct ipoib_txreq **items;
-	unsigned long head;
-	unsigned long tail;
-	unsigned long max_items;
-	spinlock_t producer_lock; /* head sync lock */
-	spinlock_t consumer_lock; /* tail sync lock */
+	void *items;
+	u32 head;
+	u32 tail;
+	u32 max_items;
+	u32 shift;
 };

 /**
@@ -102,7 +100,6 @@ struct hfi1_ipoib_dev_priv {
 	struct net_device   *netdev;
 	struct ib_device    *device;
 	struct hfi1_ipoib_txq *txqs;
-	struct kmem_cache *txreq_cache;
 	struct napi_struct *tx_napis;
 	u16 pkey;
 	u16 pkey_index;

--- a/drivers/infiniband/hw/hfi1/ipoib_tx.c
+++ b/drivers/infiniband/hw/hfi1/ipoib_tx.c
@@ -27,6 +27,7 @@
 * @txreq: sdma transmit request
 * @sdma_hdr: 9b ib headers
 * @sdma_status: status returned by sdma engine
+ * @complete: non-zero implies complete
 * @priv: ipoib netdev private data
 * @txq: txq on which skb was output
 * @skb: skb to send
@@ -35,6 +36,7 @@ struct ipoib_txreq {
 	struct sdma_txreq           txreq;
 	struct hfi1_sdma_header     sdma_hdr;
 	int                         sdma_status;
+	int                         complete;
 	struct hfi1_ipoib_dev_priv *priv;
 	struct hfi1_ipoib_txq      *txq;
 	struct sk_buff             *skb;
@@ -51,7 +53,13 @@ struct ipoib_txparms {
 	u8                          entropy;
 };

-static u64 hfi1_ipoib_txreqs(const u64 sent, const u64 completed)
+static struct ipoib_txreq *
+hfi1_txreq_from_idx(struct hfi1_ipoib_circ_buf *r, u32 idx)
+{
+	return (struct ipoib_txreq *)(r->items + (idx << r->shift));
+}
+
+static u32 hfi1_ipoib_txreqs(const u64 sent, const u64 completed)
 {
 	return sent - completed;
 }
@@ -139,51 +147,55 @@ static void hfi1_ipoib_free_tx(struct ipoib_txreq *tx, int budget)
 	}

 	napi_consume_skb(tx->skb, budget);
+	tx->skb = NULL;
 	sdma_txclean(priv->dd, &tx->txreq);
-	kmem_cache_free(priv->txreq_cache, tx);
 }

-static int hfi1_ipoib_drain_tx_ring(struct hfi1_ipoib_txq *txq, int budget)
+static void hfi1_ipoib_drain_tx_ring(struct hfi1_ipoib_txq *txq)
 {
 	struct hfi1_ipoib_circ_buf *tx_ring = &txq->tx_ring;
-	unsigned long head;
-	unsigned long tail;
-	unsigned int max_tx;
-	int work_done;
-	int tx_count;
-
-	spin_lock_bh(&tx_ring->consumer_lock);
-
-	/* Read index before reading contents at that index. */
-	head = smp_load_acquire(&tx_ring->head);
-	tail = tx_ring->tail;
-	max_tx = tx_ring->max_items;
-
-	work_done = min_t(int, CIRC_CNT(head, tail, max_tx), budget);
+	int i;
+	struct ipoib_txreq *tx;

-	for (tx_count = work_done; tx_count; tx_count--) {
-		hfi1_ipoib_free_tx(tx_ring->items[tail], budget);
-		tail = CIRC_NEXT(tail, max_tx);
+	for (i = 0; i < tx_ring->max_items; i++) {
+		tx = hfi1_txreq_from_idx(tx_ring, i);
+		tx->complete = 0;
+		dev_kfree_skb_any(tx->skb);
+		tx->skb = NULL;
+		sdma_txclean(txq->priv->dd, &tx->txreq);
 	}
-
-	atomic64_add(work_done, &txq->complete_txreqs);
-
-	/* Finished freeing tx items so store the tail value. */
-	smp_store_release(&tx_ring->tail, tail);
-
-	spin_unlock_bh(&tx_ring->consumer_lock);
-
-	hfi1_ipoib_check_queue_stopped(txq);
-
-	return work_done;
+	tx_ring->head = 0;
+	tx_ring->tail = 0;
+	atomic64_set(&txq->complete_txreqs, 0);
+	txq->sent_txreqs = 0;
 }

-static int hfi1_ipoib_process_tx_ring(struct napi_struct *napi, int budget)
+static int hfi1_ipoib_poll_tx_ring(struct napi_struct *napi, int budget)
 {
 	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(napi->dev);
 	struct hfi1_ipoib_txq *txq = &priv->txqs[napi - priv->tx_napis];
+	struct hfi1_ipoib_circ_buf *tx_ring = &txq->tx_ring;
+	u32 head = tx_ring->head;
+	u32 max_tx = tx_ring->max_items;
+	int work_done;
+	struct ipoib_txreq *tx =  hfi1_txreq_from_idx(tx_ring, head);
+
+	trace_hfi1_txq_poll(txq);
+	for (work_done = 0; work_done < budget; work_done++) {
+		/* See hfi1_ipoib_sdma_complete() */
+		if (!smp_load_acquire(&tx->complete))
+			break;
+		tx->complete = 0;
+		hfi1_ipoib_free_tx(tx, budget);
+		head = CIRC_NEXT(head, max_tx);
+		tx =  hfi1_txreq_from_idx(tx_ring, head);
+	}
+	atomic64_add(work_done, &txq->complete_txreqs);

-	int work_done = hfi1_ipoib_drain_tx_ring(txq, budget);
+	/* Finished freeing tx items so store the head value. */
+	smp_store_release(&tx_ring->head, head);
+
+	hfi1_ipoib_check_queue_stopped(txq);

 	if (work_done < budget)
 		napi_complete_done(napi, work_done);
@@ -191,45 +203,15 @@ static int hfi1_ipoib_process_tx_ring(struct napi_struct *napi, int budget)
 	return work_done;
 }

-static void hfi1_ipoib_add_tx(struct ipoib_txreq *tx)
-{
-	struct hfi1_ipoib_circ_buf *tx_ring = &tx->txq->tx_ring;
-	unsigned long head;
-	unsigned long tail;
-	size_t max_tx;
-
-	spin_lock(&tx_ring->producer_lock);
-
-	head = tx_ring->head;
-	tail = READ_ONCE(tx_ring->tail);
-	max_tx = tx_ring->max_items;
-
-	if (likely(CIRC_SPACE(head, tail, max_tx))) {
-		tx_ring->items[head] = tx;
-
-		/* Finish storing txreq before incrementing head. */
-		smp_store_release(&tx_ring->head, CIRC_ADD(head, 1, max_tx));
-		napi_schedule_irqoff(tx->txq->napi);
-	} else {
-		struct hfi1_ipoib_txq *txq = tx->txq;
-		struct hfi1_ipoib_dev_priv *priv = tx->priv;
-
-		/* Ring was full */
-		hfi1_ipoib_free_tx(tx, 0);
-		atomic64_inc(&txq->complete_txreqs);
-		dd_dev_dbg(priv->dd, "txq %d full.\n", txq->q_idx);
-	}
-
-	spin_unlock(&tx_ring->producer_lock);
-}
-
 static void hfi1_ipoib_sdma_complete(struct sdma_txreq *txreq, int status)
 {
 	struct ipoib_txreq *tx = container_of(txreq, struct ipoib_txreq, txreq);

+	trace_hfi1_txq_complete(tx->txq);
 	tx->sdma_status = status;
-
-	hfi1_ipoib_add_tx(tx);
+	/* see hfi1_ipoib_poll_tx_ring */
+	smp_store_release(&tx->complete, 1);
+	napi_schedule_irqoff(tx->txq->napi);
 }

 static int hfi1_ipoib_build_ulp_payload(struct ipoib_txreq *tx,
@@ -385,19 +367,24 @@ static struct ipoib_txreq *hfi1_ipoib_send_dma_common(struct net_device *dev,
 						      struct ipoib_txparms *txp)
 {
 	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
+	struct hfi1_ipoib_txq *txq = txp->txq;
 	struct ipoib_txreq *tx;
+	struct hfi1_ipoib_circ_buf *tx_ring;
+	u32 tail;
 	int ret;

-	tx = kmem_cache_alloc_node(priv->txreq_cache,
-				   GFP_ATOMIC,
-				   priv->dd->node);
-	if (unlikely(!tx))
+	if (unlikely(hfi1_ipoib_used(txq) >= hfi1_ipoib_ring_hwat(txq)))
+		/* This shouldn't happen with a stopped queue */
 		return ERR_PTR(-ENOMEM);
+	tx_ring = &txq->tx_ring;
+	tail = tx_ring->tail;
+	tx = hfi1_txreq_from_idx(tx_ring, tx_ring->tail);
+	trace_hfi1_txq_alloc_tx(txq);

 	/* so that we can test if the sdma descriptors are there */
 	tx->txreq.num_desc = 0;
 	tx->priv = priv;
-	tx->txq = txp->txq;
+	tx->txq = txq;
 	tx->skb = skb;
 	INIT_LIST_HEAD(&tx->txreq.list);

@@ -405,21 +392,20 @@ static struct ipoib_txreq *hfi1_ipoib_send_dma_common(struct net_device *dev,

 	ret = hfi1_ipoib_build_tx_desc(tx, txp);
 	if (likely(!ret)) {
-		if (txp->txq->flow.as_int != txp->flow.as_int) {
-			txp->txq->flow.tx_queue = txp->flow.tx_queue;
-			txp->txq->flow.sc5 = txp->flow.sc5;
-			txp->txq->sde =
+		if (txq->flow.as_int != txp->flow.as_int) {
+			txq->flow.tx_queue = txp->flow.tx_queue;
+			txq->flow.sc5 = txp->flow.sc5;
+			txq->sde =
 				sdma_select_engine_sc(priv->dd,
 						      txp->flow.tx_queue,
 						      txp->flow.sc5);
-			trace_hfi1_flow_switch(txp->txq);
+			trace_hfi1_flow_switch(txq);
 		}

 		return tx;
 	}

 	sdma_txclean(priv->dd, &tx->txreq);
-	kmem_cache_free(priv->txreq_cache, tx);

 	return ERR_PTR(ret);
 }
@@ -480,8 +466,8 @@ static int hfi1_ipoib_send_dma_single(struct net_device *dev,
 				      struct sk_buff *skb,
 				      struct ipoib_txparms *txp)
 {
-	struct hfi1_ipoib_dev_priv *priv = hfi1_ipoib_priv(dev);
 	struct hfi1_ipoib_txq *txq = txp->txq;
+	struct hfi1_ipoib_circ_buf *tx_ring;
 	struct ipoib_txreq *tx;
 	int ret;

@@ -499,6 +485,9 @@ static int hfi1_ipoib_send_dma_single(struct net_device *dev,
 		return NETDEV_TX_OK;
 	}

+	tx_ring = &txq->tx_ring;
+	/* consume tx */
+	smp_store_release(&tx_ring->tail, CIRC_NEXT(tx_ring->tail, tx_ring->max_items));
 	ret = hfi1_ipoib_submit_tx(txq, tx);
 	if (likely(!ret)) {
 tx_ok:
@@ -514,9 +503,10 @@ static int hfi1_ipoib_send_dma_single(struct net_device *dev,
 	if (ret == -EBUSY || ret == -ECOMM)
 		goto tx_ok;

-	sdma_txclean(priv->dd, &tx->txreq);
-	dev_kfree_skb_any(skb);
-	kmem_cache_free(priv->txreq_cache, tx);
+	/* mark complete and kick napi tx */
+	smp_store_release(&tx->complete, 1);
+	napi_schedule(tx->txq->napi);
+
 	++dev->stats.tx_carrier_errors;

 	return NETDEV_TX_OK;
@@ -527,6 +517,7 @@ static int hfi1_ipoib_send_dma_list(struct net_device *dev,
 				    struct ipoib_txparms *txp)
 {
 	struct hfi1_ipoib_txq *txq = txp->txq;
+	struct hfi1_ipoib_circ_buf *tx_ring;
 	struct ipoib_txreq *tx;

 	/* Has the flow change ? */
@@ -556,6 +547,9 @@ static int hfi1_ipoib_send_dma_list(struct net_device *dev,
 		return NETDEV_TX_OK;
 	}

+	tx_ring = &txq->tx_ring;
+	/* consume tx */
+	smp_store_release(&tx_ring->tail, CIRC_NEXT(tx_ring->tail, tx_ring->max_items));
 	list_add_tail(&tx->txreq.list, &txq->tx_list);

 	hfi1_ipoib_check_queue_depth(txq);
@@ -696,31 +690,22 @@ static void hfi1_ipoib_flush_txq(struct work_struct *work)
 int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv)
 {
 	struct net_device *dev = priv->netdev;
-	char buf[HFI1_IPOIB_TXREQ_NAME_LEN];
-	unsigned long tx_ring_size;
+	u32 tx_ring_size, tx_item_size;
 	int i;

-	/*
-	 * Ring holds 1 less than tx_ring_size
-	 * Round up to next power of 2 in order to hold at least tx_queue_len
-	 */
-	tx_ring_size = roundup_pow_of_two((unsigned long)dev->tx_queue_len + 1);
-
-	snprintf(buf, sizeof(buf), "hfi1_%u_ipoib_txreq_cache", priv->dd->unit);
-	priv->txreq_cache = kmem_cache_create(buf,
-					      sizeof(struct ipoib_txreq),
-					      0,
-					      0,
-					      NULL);
-	if (!priv->txreq_cache)
-		return -ENOMEM;
-
 	priv->tx_napis = kcalloc_node(dev->num_tx_queues,
 				      sizeof(struct napi_struct),
 				      GFP_KERNEL,
 				      priv->dd->node);
 	if (!priv->tx_napis)
-		goto free_txreq_cache;
+		return -ENOMEM;
+
+	/*
+	 * Ring holds 1 less than tx_ring_size
+	 * Round up to next power of 2 in order to hold at least tx_queue_len
+	 */
+	tx_ring_size = roundup_pow_of_two(dev->tx_queue_len + 1);
+	tx_item_size = roundup_pow_of_two(sizeof(struct ipoib_txreq));

 	priv->txqs = kcalloc_node(dev->num_tx_queues,
 				  sizeof(struct hfi1_ipoib_txq),
@@ -756,19 +741,17 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv)
 					     priv->dd->node);

 		txq->tx_ring.items =
-			kcalloc_node(tx_ring_size,
-				     sizeof(struct ipoib_txreq *),
+			kcalloc_node(tx_ring_size, tx_item_size,
 				     GFP_KERNEL, priv->dd->node);
 		if (!txq->tx_ring.items)
 			goto free_txqs;

-		spin_lock_init(&txq->tx_ring.producer_lock);
-		spin_lock_init(&txq->tx_ring.consumer_lock);
 		txq->tx_ring.max_items = tx_ring_size;
+		txq->tx_ring.shift = ilog2(tx_ring_size);

 		txq->napi = &priv->tx_napis[i];
 		netif_tx_napi_add(dev, txq->napi,
-				  hfi1_ipoib_process_tx_ring,
+				  hfi1_ipoib_poll_tx_ring,
 				  NAPI_POLL_WEIGHT);
 	}

@@ -788,10 +771,6 @@ int hfi1_ipoib_txreq_init(struct hfi1_ipoib_dev_priv *priv)
 free_tx_napis:
 	kfree(priv->tx_napis);
 	priv->tx_napis = NULL;
-
-free_txreq_cache:
-	kmem_cache_destroy(priv->txreq_cache);
-	priv->txreq_cache = NULL;
 	return -ENOMEM;
 }

@@ -808,13 +787,13 @@ static void hfi1_ipoib_drain_tx_list(struct hfi1_ipoib_txq *txq)
 		list_del(&txreq->list);
 		sdma_txclean(txq->priv->dd, &tx->txreq);
 		dev_kfree_skb_any(tx->skb);
-		kmem_cache_free(txq->priv->txreq_cache, tx);
+		tx->skb = NULL;
 		atomic64_inc(complete_txreqs);
 	}

 	if (hfi1_ipoib_used(txq))
 		dd_dev_warn(txq->priv->dd,
-			    "txq %d not empty found %llu requests\n",
+			    "txq %d not empty found %u requests\n",
 			    txq->q_idx,
 			    hfi1_ipoib_txreqs(txq->sent_txreqs,
 					      atomic64_read(complete_txreqs)));
@@ -831,7 +810,7 @@ void hfi1_ipoib_txreq_deinit(struct hfi1_ipoib_dev_priv *priv)
 		iowait_sdma_drain(&txq->wait);
 		hfi1_ipoib_drain_tx_list(txq);
 		netif_napi_del(txq->napi);
-		(void)hfi1_ipoib_drain_tx_ring(txq, txq->tx_ring.max_items);
+		hfi1_ipoib_drain_tx_ring(txq);
 		kfree(txq->tx_ring.items);
 	}

@@ -840,9 +819,6 @@ void hfi1_ipoib_txreq_deinit(struct hfi1_ipoib_dev_priv *priv)

 	kfree(priv->tx_napis);
 	priv->tx_napis = NULL;
-
-	kmem_cache_destroy(priv->txreq_cache);
-	priv->txreq_cache = NULL;
 }

 void hfi1_ipoib_napi_tx_enable(struct net_device *dev)
@@ -866,7 +842,7 @@ void hfi1_ipoib_napi_tx_disable(struct net_device *dev)
 		struct hfi1_ipoib_txq *txq = &priv->txqs[i];

 		napi_disable(txq->napi);
-		(void)hfi1_ipoib_drain_tx_ring(txq, txq->tx_ring.max_items);
+		hfi1_ipoib_drain_tx_ring(txq);
 	}
 }

@@ -888,9 +864,9 @@ void hfi1_ipoib_tx_timeout(struct net_device *dev, unsigned int q)
 	dd_dev_info(priv->dd, "flow %x\n", txq->flow.as_int);
 	dd_dev_info(priv->dd, "sent %llu completed %llu used %llu\n",
 		    txq->sent_txreqs, completed, hfi1_ipoib_used(txq));
-	dd_dev_info(priv->dd, "tx_queue_len %u max_items %lu\n",
+	dd_dev_info(priv->dd, "tx_queue_len %u max_items %u\n",
 		    dev->tx_queue_len, txq->tx_ring.max_items);
-	dd_dev_info(priv->dd, "head %lu tail %lu\n",
+	dd_dev_info(priv->dd, "head %u tail %u\n",
 		    txq->tx_ring.head, txq->tx_ring.tail);
 	dd_dev_info(priv->dd, "wait queued %u\n",
 		    !list_empty(&txq->wait.list));