mlxsw: pci: Use page pool for Rx buffers allocation

As part of driver init, all Rx queues are filled with buffers for hardware usage. Later, when a packet is received, a new buffer should be allocated to be used by hardware instead of the received buffer. Packet's processing time includes allocation time, which can be improved using page pool. Using page pool, DMA mapping is done only for first allocation of buffers. As subsequent buffers allocation avoid DMA mapping, it results in performance improvement. The purpose of page pool is to allocate pages fast from cache without locking. This lockless guarantee naturally comes from running under a NAPI. Use page pool to allocate the data buffer only, so hardware will use it to fill the packet. At completion time, attach the data buffer (now filled with packet payload) to new SKB which is allocated around the received buffer. SKB building at completion time prevents cache miss for each packet, as now the SKB is allocated right before packets will be handled by networking stack. Page pool for each Rx queue enhances Rx side performance by reclaiming buffers back to each queue specific pool. This change significantly improves driver performance, CPU can handle about 345% of the packets per second it previously handled. Signed-off-by: Amit Cohen <amcohen@nvidia.com> Reviewed-by: Petr Machata <petrm@nvidia.com> Reviewed-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: Petr Machata <petrm@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Link: https://lore.kernel.org/r/1cf788a8f43c70aae6d526018ef77becb27ad6d3.1718709196.git.petrm@nvidia.comSigned-off-by: Jakub Kicinski <kuba@kernel.org>

mlxsw: pci: Use page pool for Rx buffers allocation
As part of driver init, all Rx queues are filled with buffers for hardware usage. Later, when a packet is received, a new buffer should be allocated to be used by hardware instead of the received buffer. Packet's processing time includes allocation time, which can be improved using page pool. Using page pool, DMA mapping is done only for first allocation of buffers. As subsequent buffers allocation avoid DMA mapping, it results in performance improvement. The purpose of page pool is to allocate pages fast from cache without locking. This lockless guarantee naturally comes from running under a NAPI. Use page pool to allocate the data buffer only, so hardware will use it to fill the packet. At completion time, attach the data buffer (now filled with packet payload) to new SKB which is allocated around the received buffer. SKB building at completion time prevents cache miss for each packet, as now the SKB is allocated right before packets will be handled by networking stack. Page pool for each Rx queue enhances Rx side performance by reclaiming buffers back to each queue specific pool. This change significantly improves driver performance, CPU can handle about 345% of the packets per second it previously handled. Signed-off-by: Amit Cohen <amcohen@nvidia.com> Reviewed-by: Petr Machata <petrm@nvidia.com> Reviewed-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: Petr Machata <petrm@nvidia.com> Reviewed-by: Jiri Pirko <jiri@nvidia.com> Link: https://lore.kernel.org/r/1cf788a8f43c70aae6d526018ef77becb27ad6d3.1718709196.git.petrm@nvidia.comSigned-off-by: Jakub Kicinski <kuba@kernel.org>
b5b60bb4 · Amit Cohen · Jakub Kicinski · 5642c6a0 · b5b60bb4
Commit b5b60bb4 authored Jun 18, 2024 by Amit Cohen Committed by Jakub Kicinski Jun 19, 2024
Show whitespace changes
Inline Side-by-side

Showing with 64 additions and 39 deletions

drivers/net/ethernet/mellanox/mlxsw/pci.c drivers/net/ethernet/mellanox/mlxsw/pci.c +64 -39

No files found.
--- a/drivers/net/ethernet/mellanox/mlxsw/pci.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -62,6 +62,7 @@ struct mlxsw_pci_mem_item {
 };

 struct mlxsw_pci_queue_elem_info {
+	struct page *page;
 	char *elem; /* pointer to actual dma mapped element mem chunk */
 	union {
 		struct {
@@ -346,6 +347,19 @@ static void mlxsw_pci_sdq_fini(struct mlxsw_pci *mlxsw_pci,
 		(MLXSW_PCI_SKB_HEADROOM +	\
 		SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

+static void
+mlxsw_pci_wqe_rx_frag_set(struct mlxsw_pci *mlxsw_pci, struct page *page,
+			  char *wqe, int index, size_t frag_len)
+{
+	dma_addr_t mapaddr;
+
+	mapaddr = page_pool_get_dma_addr(page);
+	mapaddr += MLXSW_PCI_SKB_HEADROOM;
+
+	mlxsw_pci_wqe_address_set(wqe, index, mapaddr);
+	mlxsw_pci_wqe_byte_count_set(wqe, index, frag_len);
+}
+
 static int mlxsw_pci_wqe_frag_map(struct mlxsw_pci *mlxsw_pci, char *wqe,
 				  int index, char *frag_data, size_t frag_len,
 				  int direction)
@@ -375,43 +389,46 @@ static void mlxsw_pci_wqe_frag_unmap(struct mlxsw_pci *mlxsw_pci, char *wqe,
 	dma_unmap_single(&pdev->dev, mapaddr, frag_len, direction);
 }

-static int mlxsw_pci_rdq_skb_alloc(struct mlxsw_pci *mlxsw_pci,
-				   struct mlxsw_pci_queue_elem_info *elem_info,
-				   gfp_t gfp)
+static struct sk_buff *mlxsw_pci_rdq_build_skb(struct page *page,
+					       u16 byte_count)
 {
+	void *data = page_address(page);
+	unsigned int allocated_size;
+	struct sk_buff *skb;
+
+	allocated_size = page_size(page);
+	skb = napi_build_skb(data, allocated_size);
+	if (unlikely(!skb))
+		return ERR_PTR(-ENOMEM);
+
+	skb_reserve(skb, MLXSW_PCI_SKB_HEADROOM);
+	skb_put(skb, byte_count);
+	return skb;
+}
+
+static int mlxsw_pci_rdq_page_alloc(struct mlxsw_pci_queue *q,
+				    struct mlxsw_pci_queue_elem_info *elem_info)
+{
+	struct mlxsw_pci_queue *cq = q->u.rdq.cq;
 	size_t buf_len = MLXSW_PORT_MAX_MTU;
 	char *wqe = elem_info->elem;
-	struct sk_buff *skb;
-	int err;
+	struct page *page;

-	skb = __netdev_alloc_skb_ip_align(NULL, buf_len, gfp);
-	if (!skb)
+	page = page_pool_dev_alloc_pages(cq->u.cq.page_pool);
+	if (unlikely(!page))
 		return -ENOMEM;

-	err = mlxsw_pci_wqe_frag_map(mlxsw_pci, wqe, 0, skb->data,
-				     buf_len, DMA_FROM_DEVICE);
-	if (err)
-		goto err_frag_map;
-
-	elem_info->u.rdq.skb = skb;
+	mlxsw_pci_wqe_rx_frag_set(q->pci, page, wqe, 0, buf_len);
+	elem_info->page = page;
 	return 0;
-
-err_frag_map:
-	dev_kfree_skb_any(skb);
-	return err;
 }

-static void mlxsw_pci_rdq_skb_free(struct mlxsw_pci *mlxsw_pci,
+static void mlxsw_pci_rdq_page_free(struct mlxsw_pci_queue *q,
 				    struct mlxsw_pci_queue_elem_info *elem_info)
 {
-	struct sk_buff *skb;
-	char *wqe;
+	struct mlxsw_pci_queue *cq = q->u.rdq.cq;

-	skb = elem_info->u.rdq.skb;
-	wqe = elem_info->elem;
-
-	mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE);
-	dev_kfree_skb_any(skb);
+	page_pool_put_page(cq->u.cq.page_pool, elem_info->page, -1, false);
 }

 static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
@@ -452,7 +469,7 @@ static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 	for (i = 0; i < q->count; i++) {
 		elem_info = mlxsw_pci_queue_elem_info_producer_get(q);
 		BUG_ON(!elem_info);
-		err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info, GFP_KERNEL);
+		err = mlxsw_pci_rdq_page_alloc(q, elem_info);
 		if (err)
 			goto rollback;
 		/* Everything is set up, ring doorbell to pass elem to HW */
@@ -465,7 +482,7 @@ static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
 rollback:
 	for (i--; i >= 0; i--) {
 		elem_info = mlxsw_pci_queue_elem_info_get(q, i);
-		mlxsw_pci_rdq_skb_free(mlxsw_pci, elem_info);
+		mlxsw_pci_rdq_page_free(q, elem_info);
 	}
 	q->u.rdq.cq = NULL;
 	cq->u.cq.dq = NULL;
@@ -483,7 +500,7 @@ static void mlxsw_pci_rdq_fini(struct mlxsw_pci *mlxsw_pci,
 	mlxsw_cmd_hw2sw_rdq(mlxsw_pci->core, q->num);
 	for (i = 0; i < q->count; i++) {
 		elem_info = mlxsw_pci_queue_elem_info_get(q, i);
-		mlxsw_pci_rdq_skb_free(mlxsw_pci, elem_info);
+		mlxsw_pci_rdq_page_free(q, elem_info);
 	}
 }

@@ -618,26 +635,38 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
 {
 	struct pci_dev *pdev = mlxsw_pci->pdev;
 	struct mlxsw_pci_queue_elem_info *elem_info;
+	struct mlxsw_pci_queue *cq = q->u.rdq.cq;
 	struct mlxsw_rx_info rx_info = {};
-	char wqe[MLXSW_PCI_WQE_SIZE];
 	struct sk_buff *skb;
+	struct page *page;
 	u16 byte_count;
 	int err;

 	elem_info = mlxsw_pci_queue_elem_info_consumer_get(q);
-	skb = elem_info->u.rdq.skb;
-	memcpy(wqe, elem_info->elem, MLXSW_PCI_WQE_SIZE);

 	if (q->consumer_counter++ != consumer_counter_limit)
 		dev_dbg_ratelimited(&pdev->dev, "Consumer counter does not match limit in RDQ\n");

-	err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info, GFP_ATOMIC);
+	byte_count = mlxsw_pci_cqe_byte_count_get(cqe);
+	if (mlxsw_pci_cqe_crc_get(cqe_v, cqe))
+		byte_count -= ETH_FCS_LEN;
+
+	page = elem_info->page;
+
+	err = mlxsw_pci_rdq_page_alloc(q, elem_info);
 	if (err) {
-		dev_err_ratelimited(&pdev->dev, "Failed to alloc skb for RDQ\n");
+		dev_err_ratelimited(&pdev->dev, "Failed to alloc page\n");
+		goto out;
+	}
+
+	skb = mlxsw_pci_rdq_build_skb(page, byte_count);
+	if (IS_ERR(skb)) {
+		dev_err_ratelimited(&pdev->dev, "Failed to build skb for RDQ\n");
+		page_pool_recycle_direct(cq->u.cq.page_pool, page);
 		goto out;
 	}

-	mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE);
+	skb_mark_for_recycle(skb);

 	if (mlxsw_pci_cqe_lag_get(cqe_v, cqe)) {
 		rx_info.is_lag = true;
@@ -670,10 +699,6 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,

 	mlxsw_pci_skb_cb_ts_set(mlxsw_pci, skb, cqe_v, cqe);

-	byte_count = mlxsw_pci_cqe_byte_count_get(cqe);
-	if (mlxsw_pci_cqe_crc_get(cqe_v, cqe))
-		byte_count -= ETH_FCS_LEN;
-	skb_put(skb, byte_count);
 	mlxsw_core_skb_receive(mlxsw_pci->core, skb, &rx_info);

 out: