Commit e52fcb24 authored by Eric Dumazet, committed by David S. Miller

bnx2x: uses build_skb() in receive path

bnx2x uses the following formula to compute its rx_buf_sz:

dev->mtu + 2*L1_CACHE_BYTES + 14 + 8 + 8 + 2

Then the core network stack adds NET_SKB_PAD and
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)).

The final allocated size for the skb head on x86_64 (L1_CACHE_BYTES = 64,
MTU = 1500) is 2112 bytes; SLUB/SLAB rounds this up to 4096 bytes.
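
For reference, a worked breakdown of that 2112 (x86_64; the 384-byte
figure assumes SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) comes to
384 bytes on this kernel):

	1500 + 2*64 + 14 + 8 + 8 + 2		= 1660	(driver rx_buf_sz)
	1660 + NET_SKB_PAD (64)			= 1724
	SKB_DATA_ALIGN(1724)			= 1728
	1728 + 384 (skb_shared_info)		= 2112	-> kmalloc: 4096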

Since skb truesize is then bigger than SK_MEM_QUANTUM, we get a lot of
false sharing because of mem_reclaim in the UDP stack.

One possible way to halve truesize is to reduce the needed size by 64
bytes (2112 -> 2048 bytes).

Instead of allocating a full cache line at the end of the packet for
alignment, we can exploit the fact that skb_shared_info sits at the end
of skb->head and reuse that room, if we convert bnx2x to the new
build_skb() infrastructure.

skb_shared_info will be initialized after the hardware has finished its
transfer, so we can safely overwrite the final padding.
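
As an illustration (a sketch, not the literal patch below), the
converted RX completion path now roughly works as follows, using the
single-argument build_skb() added by the parent commit; the helper name
bnx2x_rx_to_skb() and its pad/len parameters are hypothetical, for
exposition only:

static struct sk_buff *bnx2x_rx_to_skb(u8 *data, unsigned int pad,
					unsigned int len)
{
	struct sk_buff *skb;

	/* Wrap the kmalloc()ed, already-DMA-filled buffer; no copy. */
	skb = build_skb(data);
	if (unlikely(!skb))
		return NULL;	/* caller keeps/recycles the raw buffer */

	/* skb_shared_info is written only now, after the hardware is
	 * done, so it can safely reuse what used to be tail padding.
	 */
	skb_reserve(skb, pad);	/* NET_SKB_PAD + CQE placement offset */
	skb_put(skb, len);	/* frame length reported by the CQE */
	return skb;
}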

Using build_skb() also reduces cache line misses in the driver, since we
use cache-hot skbs instead of cold ones. The number of in-flight sk_buff
structures is lower, and they are recycled while still hot.

Performance results:

820,000 pps on an RX UDP single-threaded benchmark, instead of
720,000 pps.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Eilon Greenstein <eilong@broadcom.com>
CC: Ben Hutchings <bhutchings@solarflare.com>
CC: Tom Herbert <therbert@google.com>
CC: Jamal Hadi Salim <hadi@mojatatu.com>
CC: Stephen Hemminger <shemminger@vyatta.com>
CC: Thomas Graf <tgraf@infradead.org>
CC: Herbert Xu <herbert@gondor.apana.org.au>
CC: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
Acked-by: Eilon Greenstein <eilong@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent b2b5ce9d
@@ -293,8 +293,13 @@ enum {
 #define FCOE_TXQ_IDX(bp)	(MAX_ETH_TXQ_IDX(bp))
 
 /* fast path */
 
+/*
+ * This driver uses new build_skb() API :
+ * RX ring buffer contains pointer to kmalloc() data only,
+ * skb are built only after Hardware filled the frame.
+ */
 struct sw_rx_bd {
-	struct sk_buff	*skb;
+	u8		*data;
 	DEFINE_DMA_UNMAP_ADDR(mapping);
 };

@@ -424,8 +429,8 @@ union host_hc_status_block {
 
 struct bnx2x_agg_info {
 	/*
-	 * First aggregation buffer is an skb, the following - are pages.
-	 * We will preallocate the skbs for each aggregation when
+	 * First aggregation buffer is a data buffer, the following - are pages.
+	 * We will preallocate the data buffer for each aggregation when
 	 * we open the interface and will replace the BD at the consumer
 	 * with this one when we receive the TPA_START CQE in order to
 	 * keep the Rx BD ring consistent.

@@ -439,6 +444,7 @@ struct bnx2x_agg_info {
 	u16			parsing_flags;
 	u16			vlan_tag;
 	u16			len_on_bd;
+	u32			rxhash;
 };
 
 #define Q_STATS_OFFSET32(stat_name) \

@@ -1187,10 +1193,20 @@ struct bnx2x {
 #define ETH_MAX_JUMBO_PACKET_SIZE	9600
 
 /* Max supported alignment is 256 (8 shift) */
-#define BNX2X_RX_ALIGN_SHIFT		((L1_CACHE_SHIFT < 8) ? \
-					 L1_CACHE_SHIFT : 8)
+#define BNX2X_RX_ALIGN_SHIFT		min(8, L1_CACHE_SHIFT)
 
-/* FW use 2 Cache lines Alignment for start packet and size */
-#define BNX2X_FW_RX_ALIGN		(2 << BNX2X_RX_ALIGN_SHIFT)
+/* FW uses 2 Cache lines Alignment for start packet and size
+ *
+ * We assume build_skb() uses sizeof(struct skb_shared_info) bytes
+ * at the end of skb->data, to avoid wasting a full cache line.
+ * This reduces memory use (skb->truesize).
+ */
+#define BNX2X_FW_RX_ALIGN_START	(1UL << BNX2X_RX_ALIGN_SHIFT)
+#define BNX2X_FW_RX_ALIGN_END					\
+	max(1UL << BNX2X_RX_ALIGN_SHIFT,			\
+	    SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
 
 #define BNX2X_PXP_DRAM_ALIGN		(BNX2X_RX_ALIGN_SHIFT - 5)
 
 	struct host_sp_status_block *def_status_blk;
...
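
For intuition, a sketch of what the new alignment macros evaluate to on
x86_64 (assuming L1_CACHE_SHIFT = 6 and the 384-byte skb_shared_info
figure from above):

	BNX2X_RX_ALIGN_SHIFT	= min(8, 6)	= 6
	BNX2X_FW_RX_ALIGN_START	= 1UL << 6	= 64
	BNX2X_FW_RX_ALIGN_END	= max(64, 384)	= 384

So the end alignment now reserves exactly the room skb_shared_info
needs, instead of a dedicated padding cache line.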
@@ -910,26 +910,27 @@ static inline int bnx2x_alloc_rx_sge(struct bnx2x *bp,
 	return 0;
 }
 
-static inline int bnx2x_alloc_rx_skb(struct bnx2x *bp,
-				     struct bnx2x_fastpath *fp, u16 index)
+static inline int bnx2x_alloc_rx_data(struct bnx2x *bp,
+				      struct bnx2x_fastpath *fp, u16 index)
 {
-	struct sk_buff *skb;
+	u8 *data;
 	struct sw_rx_bd *rx_buf = &fp->rx_buf_ring[index];
 	struct eth_rx_bd *rx_bd = &fp->rx_desc_ring[index];
 	dma_addr_t mapping;
 
-	skb = netdev_alloc_skb(bp->dev, fp->rx_buf_size);
-	if (unlikely(skb == NULL))
+	data = kmalloc(fp->rx_buf_size + NET_SKB_PAD, GFP_ATOMIC);
+	if (unlikely(data == NULL))
 		return -ENOMEM;
 
-	mapping = dma_map_single(&bp->pdev->dev, skb->data, fp->rx_buf_size,
+	mapping = dma_map_single(&bp->pdev->dev, data + NET_SKB_PAD,
+				 fp->rx_buf_size,
 				 DMA_FROM_DEVICE);
 	if (unlikely(dma_mapping_error(&bp->pdev->dev, mapping))) {
-		dev_kfree_skb_any(skb);
+		kfree(data);
 		return -ENOMEM;
 	}
 
-	rx_buf->skb = skb;
+	rx_buf->data = data;
 	dma_unmap_addr_set(rx_buf, mapping, mapping);
 
 	rx_bd->addr_hi = cpu_to_le32(U64_HI(mapping));

@@ -938,12 +939,12 @@ static inline int bnx2x_alloc_rx_skb(struct bnx2x *bp,
 	return 0;
 }
 
-/* note that we are not allocating a new skb,
+/* note that we are not allocating a new buffer,
  * we are just moving one from cons to prod
  * we are not creating a new mapping,
  * so there is no need to check for dma_mapping_error().
  */
-static inline void bnx2x_reuse_rx_skb(struct bnx2x_fastpath *fp,
+static inline void bnx2x_reuse_rx_data(struct bnx2x_fastpath *fp,
 				       u16 cons, u16 prod)
 {
 	struct sw_rx_bd *cons_rx_buf = &fp->rx_buf_ring[cons];

@@ -953,7 +954,7 @@ static inline void bnx2x_reuse_rx_skb(struct bnx2x_fastpath *fp,
 
 	dma_unmap_addr_set(prod_rx_buf, mapping,
 			   dma_unmap_addr(cons_rx_buf, mapping));
-	prod_rx_buf->skb = cons_rx_buf->skb;
+	prod_rx_buf->data = cons_rx_buf->data;
 	*prod_bd = *cons_bd;
 }

@@ -1029,9 +1030,9 @@ static inline void bnx2x_free_tpa_pool(struct bnx2x *bp,
 	for (i = 0; i < last; i++) {
 		struct bnx2x_agg_info *tpa_info = &fp->tpa_info[i];
 		struct sw_rx_bd *first_buf = &tpa_info->first_buf;
-		struct sk_buff *skb = first_buf->skb;
+		u8 *data = first_buf->data;
 
-		if (skb == NULL) {
+		if (data == NULL) {
 			DP(NETIF_MSG_IFDOWN, "tpa bin %d empty on free\n", i);
 			continue;
 		}

@@ -1039,8 +1040,8 @@ static inline void bnx2x_free_tpa_pool(struct bnx2x *bp,
 		dma_unmap_single(&bp->pdev->dev,
 				 dma_unmap_addr(first_buf, mapping),
 				 fp->rx_buf_size, DMA_FROM_DEVICE);
-		dev_kfree_skb(skb);
-		first_buf->skb = NULL;
+		kfree(data);
+		first_buf->data = NULL;
 	}
 }

@@ -1148,7 +1149,7 @@ static inline int bnx2x_alloc_rx_bds(struct bnx2x_fastpath *fp,
 	 * fp->eth_q_stats.rx_skb_alloc_failed = 0
 	 */
 	for (i = 0; i < rx_ring_size; i++) {
-		if (bnx2x_alloc_rx_skb(bp, fp, ring_prod) < 0) {
+		if (bnx2x_alloc_rx_data(bp, fp, ring_prod) < 0) {
 			fp->eth_q_stats.rx_skb_alloc_failed++;
 			continue;
 		}
...
@@ -1740,6 +1740,7 @@ static int bnx2x_run_loopback(struct bnx2x *bp, int loopback_mode)
 	struct sw_rx_bd *rx_buf;
 	u16 len;
 	int rc = -ENODEV;
+	u8 *data;
 
 	/* check the loopback mode */
 	switch (loopback_mode) {

@@ -1865,10 +1866,9 @@ static int bnx2x_run_loopback(struct bnx2x *bp, int loopback_mode)
 	dma_sync_single_for_cpu(&bp->pdev->dev,
 				dma_unmap_addr(rx_buf, mapping),
 				fp_rx->rx_buf_size, DMA_FROM_DEVICE);
-	skb = rx_buf->skb;
-	skb_reserve(skb, cqe->fast_path_cqe.placement_offset);
+	data = rx_buf->data + NET_SKB_PAD + cqe->fast_path_cqe.placement_offset;
 	for (i = ETH_HLEN; i < pkt_size; i++)
-		if (*(skb->data + i) != (unsigned char) (i & 0xff))
+		if (*(data + i) != (unsigned char) (i & 0xff))
 			goto test_loopback_rx_exit;
 
 	rc = 0;
...
@@ -2789,8 +2789,8 @@ static void bnx2x_pf_rx_q_prep(struct bnx2x *bp,
 	/* This should be a maximum number of data bytes that may be
 	 * placed on the BD (not including paddings).
 	 */
-	rxq_init->buf_sz = fp->rx_buf_size - BNX2X_FW_RX_ALIGN -
-		IP_HEADER_ALIGNMENT_PADDING;
+	rxq_init->buf_sz = fp->rx_buf_size - BNX2X_FW_RX_ALIGN_START -
+		BNX2X_FW_RX_ALIGN_END - IP_HEADER_ALIGNMENT_PADDING;
 
 	rxq_init->cl_qzone_id = fp->cl_qzone_id;
 	rxq_init->tpa_agg_sz = tpa_agg_size;
...