Commit a8db76d4 authored by Sven Van Asbroeck, committed by David S. Miller

lan743x: boost performance on cpu archs w/o dma cache snooping

The buffers in the lan743x driver's receive ring are always 9K,
even when the largest packet that can be received (the mtu) is
much smaller. This performs particularly badly on cpu archs
without dma cache snooping (such as ARM): each received packet
results in a 9K dma_{map|unmap} operation, which is very expensive
because cpu caches need to be invalidated.

Careful measurement of the driver rx path on armv7 reveals that
the cpu spends the majority of its time waiting for cache
invalidation.

Optimize by keeping the rx ring buffer size as close as possible
to the mtu. This limits the amount of cache that requires
invalidation.
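
To make the sizing difference concrete, here is a standalone sketch comparing
the two buffer-size policies. The constants are assumptions for illustration
only; the driver's actual computation (netdev->mtu + ETH_HLEN + 4 +
RX_HEAD_PADDING) appears in the lan743x_main.c hunk below.

#include <stdio.h>

/* Illustrative values only: ETH_HLEN is 14, the +4 covers the FCS, and
 * RX_HEAD_PADDING stands in for the driver's receive head padding.
 * LAN743X_MAX_FRAME_SIZE is assumed to be the 9K jumbo size mentioned above.
 */
#define LAN743X_MAX_FRAME_SIZE 9216
#define ETH_HLEN               14
#define FCS_LEN                4
#define RX_HEAD_PADDING        2

/* old policy: every rx buffer is sized for the largest possible frame */
static int rx_buffer_len_old(int mtu)
{
	(void)mtu;
	return LAN743X_MAX_FRAME_SIZE + ETH_HLEN + FCS_LEN + RX_HEAD_PADDING;
}

/* new policy: size each rx buffer for the configured mtu */
static int rx_buffer_len_new(int mtu)
{
	return mtu + ETH_HLEN + FCS_LEN + RX_HEAD_PADDING;
}

int main(void)
{
	int mtu = 1500;

	printf("mtu %d: old buffer %d bytes, new buffer %d bytes\n",
	       mtu, rx_buffer_len_old(mtu), rx_buffer_len_new(mtu));
	return 0;
}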

This optimization would normally force us to re-allocate all
ring buffers when the mtu is changed - a disruptive event,
because it can only happen when the network interface is down.

Remove the need to re-allocate all ring buffers by adding support
for multi-buffer frames. Now any combination of mtu and ring
buffer size will work. When the mtu changes from mtu1 to mtu2,
consumed buffers of size mtu1 are lazily replaced by newly
allocated buffers of size mtu2.
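
Below is a minimal user-space model of the lazy replacement idea, not the
driver's code; in the driver, the per-buffer skbs of one frame are additionally
chained together via skb_shinfo()->frag_list, as shown in the lan743x_main.c
hunks further down.

#include <stdio.h>
#include <stdlib.h>

/* Toy model of one rx ring slot: it remembers the size it was allocated
 * with, and is only re-allocated (at the current mtu) once its buffer has
 * been consumed.
 */
struct ring_slot {
	void *buf;
	int buf_len;
};

static int rx_buf_len(int mtu)
{
	return mtu + 14 /* ETH_HLEN */ + 4 /* FCS */ + 2 /* head padding */;
}

/* Called after the slot's buffer has been handed up the stack: replace it
 * with a buffer sized for the *current* mtu; on allocation failure keep
 * (and reuse) the old buffer, as the driver does.
 */
static int refill_slot(struct ring_slot *slot, int cur_mtu)
{
	int len = rx_buf_len(cur_mtu);
	void *buf = malloc(len);

	if (!buf)
		return -1;
	free(slot->buf);
	slot->buf = buf;
	slot->buf_len = len;
	return 0;
}

int main(void)
{
	struct ring_slot slot = {
		.buf = malloc(rx_buf_len(1500)),
		.buf_len = rx_buf_len(1500),
	};

	printf("slot sized for mtu 1500: %d bytes\n", slot.buf_len);
	/* mtu changes to 9000: the slot is only re-sized when it is refilled */
	if (refill_slot(&slot, 9000) == 0)
		printf("slot re-sized for mtu 9000: %d bytes\n", slot.buf_len);
	free(slot.buf);
	return 0;
}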

These optimizations double the rx performance on armv7.
Third parties report 3x rx speedup on armv8.

Tested with iperf3 on a freescale imx6qp + lan7430, with both sides
set to an mtu of 1500 bytes, measuring rx performance:

Before:
[ ID] Interval           Transfer     Bandwidth       Retr
[  4]   0.00-20.00  sec   550 MBytes   231 Mbits/sec    0
After:
[ ID] Interval           Transfer     Bandwidth       Retr
[  4]   0.00-20.00  sec  1.33 GBytes   570 Mbits/sec    0
Signed-off-by: Sven Van Asbroeck <thesven73@gmail.com>
Reviewed-by: Bryan Whitehead <Bryan.Whitehead@microchip.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 80fea53d
drivers/net/ethernet/microchip/lan743x_main.c
@@ -1926,15 +1926,6 @@ static int lan743x_rx_next_index(struct lan743x_rx *rx, int index)
 	return ((++index) % rx->ring_size);
 }
 
-static struct sk_buff *lan743x_rx_allocate_skb(struct lan743x_rx *rx)
-{
-	int length = 0;
-
-	length = (LAN743X_MAX_FRAME_SIZE + ETH_HLEN + 4 + RX_HEAD_PADDING);
-	return __netdev_alloc_skb(rx->adapter->netdev,
-				  length, GFP_ATOMIC | GFP_DMA);
-}
-
 static void lan743x_rx_update_tail(struct lan743x_rx *rx, int index)
 {
 	/* update the tail once per 8 descriptors */
@@ -1943,36 +1934,40 @@ static void lan743x_rx_update_tail(struct lan743x_rx *rx, int index)
 		  index);
 }
 
-static int lan743x_rx_init_ring_element(struct lan743x_rx *rx, int index,
-					struct sk_buff *skb)
+static int lan743x_rx_init_ring_element(struct lan743x_rx *rx, int index)
 {
+	struct net_device *netdev = rx->adapter->netdev;
+	struct device *dev = &rx->adapter->pdev->dev;
 	struct lan743x_rx_buffer_info *buffer_info;
 	struct lan743x_rx_descriptor *descriptor;
-	int length = 0;
+	struct sk_buff *skb;
+	dma_addr_t dma_ptr;
+	int length;
+
+	length = netdev->mtu + ETH_HLEN + 4 + RX_HEAD_PADDING;
 
-	length = (LAN743X_MAX_FRAME_SIZE + ETH_HLEN + 4 + RX_HEAD_PADDING);
 	descriptor = &rx->ring_cpu_ptr[index];
 	buffer_info = &rx->buffer_info[index];
-	buffer_info->skb = skb;
-	if (!(buffer_info->skb))
+	skb = __netdev_alloc_skb(netdev, length, GFP_ATOMIC | GFP_DMA);
+	if (!skb)
 		return -ENOMEM;
-	buffer_info->dma_ptr = dma_map_single(&rx->adapter->pdev->dev,
-					      buffer_info->skb->data,
-					      length,
-					      DMA_FROM_DEVICE);
-	if (dma_mapping_error(&rx->adapter->pdev->dev,
-			      buffer_info->dma_ptr)) {
-		buffer_info->dma_ptr = 0;
+	dma_ptr = dma_map_single(dev, skb->data, length, DMA_FROM_DEVICE);
+	if (dma_mapping_error(dev, dma_ptr)) {
+		dev_kfree_skb_any(skb);
 		return -ENOMEM;
 	}
+	if (buffer_info->dma_ptr)
+		dma_unmap_single(dev, buffer_info->dma_ptr,
+				 buffer_info->buffer_length, DMA_FROM_DEVICE);
 
+	buffer_info->skb = skb;
+	buffer_info->dma_ptr = dma_ptr;
 	buffer_info->buffer_length = length;
 	descriptor->data1 = cpu_to_le32(DMA_ADDR_LOW32(buffer_info->dma_ptr));
 	descriptor->data2 = cpu_to_le32(DMA_ADDR_HIGH32(buffer_info->dma_ptr));
 	descriptor->data3 = 0;
 	descriptor->data0 = cpu_to_le32((RX_DESC_DATA0_OWN_ |
 			    (length & RX_DESC_DATA0_BUF_LENGTH_MASK_)));
-	skb_reserve(buffer_info->skb, RX_HEAD_PADDING);
 	lan743x_rx_update_tail(rx, index);
 
 	return 0;
@@ -2021,16 +2016,32 @@ static void lan743x_rx_release_ring_element(struct lan743x_rx *rx, int index)
 	memset(buffer_info, 0, sizeof(*buffer_info));
 }
 
-static int lan743x_rx_process_packet(struct lan743x_rx *rx)
+static struct sk_buff *
+lan743x_rx_trim_skb(struct sk_buff *skb, int frame_length)
+{
+	if (skb_linearize(skb)) {
+		dev_kfree_skb_irq(skb);
+		return NULL;
+	}
+	frame_length = max_t(int, 0, frame_length - RX_HEAD_PADDING - 2);
+	if (skb->len > frame_length) {
+		skb->tail -= skb->len - frame_length;
+		skb->len = frame_length;
+	}
+	return skb;
+}
+
+static int lan743x_rx_process_buffer(struct lan743x_rx *rx)
 {
-	struct skb_shared_hwtstamps *hwtstamps = NULL;
-	int result = RX_PROCESS_RESULT_NOTHING_TO_DO;
 	int current_head_index = le32_to_cpu(*rx->head_cpu_ptr);
+	struct lan743x_rx_descriptor *descriptor, *desc_ext;
+	struct net_device *netdev = rx->adapter->netdev;
+	int result = RX_PROCESS_RESULT_NOTHING_TO_DO;
 	struct lan743x_rx_buffer_info *buffer_info;
-	struct lan743x_rx_descriptor *descriptor;
+	int frame_length, buffer_length;
 	int extension_index = -1;
-	int first_index = -1;
-	int last_index = -1;
+	bool is_last, is_first;
+	struct sk_buff *skb;
 
 	if (current_head_index < 0 || current_head_index >= rx->ring_size)
 		goto done;
@@ -2038,163 +2049,120 @@ static int lan743x_rx_process_packet(struct lan743x_rx *rx)
 	if (rx->last_head < 0 || rx->last_head >= rx->ring_size)
 		goto done;
 
-	if (rx->last_head != current_head_index) {
-		descriptor = &rx->ring_cpu_ptr[rx->last_head];
-		if (le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_OWN_)
-			goto done;
-
-		if (!(le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_FS_))
-			goto done;
-
-		first_index = rx->last_head;
-		if (le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_LS_) {
-			last_index = rx->last_head;
-		} else {
-			int index;
-
-			index = lan743x_rx_next_index(rx, first_index);
-			while (index != current_head_index) {
-				descriptor = &rx->ring_cpu_ptr[index];
-				if (le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_OWN_)
-					goto done;
-
-				if (le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_LS_) {
-					last_index = index;
-					break;
-				}
-				index = lan743x_rx_next_index(rx, index);
-			}
-		}
-		if (last_index >= 0) {
-			descriptor = &rx->ring_cpu_ptr[last_index];
-			if (le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_EXT_) {
-				/* extension is expected to follow */
-				int index = lan743x_rx_next_index(rx,
-								  last_index);
-				if (index != current_head_index) {
-					descriptor = &rx->ring_cpu_ptr[index];
-					if (le32_to_cpu(descriptor->data0) &
-					    RX_DESC_DATA0_OWN_) {
-						goto done;
-					}
-					if (le32_to_cpu(descriptor->data0) &
-					    RX_DESC_DATA0_EXT_) {
-						extension_index = index;
-					} else {
-						goto done;
-					}
-				} else {
-					/* extension is not yet available */
-					/* prevent processing of this packet */
-					first_index = -1;
-					last_index = -1;
-				}
-			}
-		}
-	}
-	if (first_index >= 0 && last_index >= 0) {
-		int real_last_index = last_index;
-		struct sk_buff *skb = NULL;
-		u32 ts_sec = 0;
-		u32 ts_nsec = 0;
-
-		/* packet is available */
-		if (first_index == last_index) {
-			/* single buffer packet */
-			struct sk_buff *new_skb = NULL;
-			int packet_length;
-
-			new_skb = lan743x_rx_allocate_skb(rx);
-			if (!new_skb) {
-				/* failed to allocate next skb.
-				 * Memory is very low.
-				 * Drop this packet and reuse buffer.
-				 */
-				lan743x_rx_reuse_ring_element(rx, first_index);
-				goto process_extension;
-			}
-
-			buffer_info = &rx->buffer_info[first_index];
-			skb = buffer_info->skb;
-			descriptor = &rx->ring_cpu_ptr[first_index];
-
-			/* unmap from dma */
-			if (buffer_info->dma_ptr) {
-				dma_unmap_single(&rx->adapter->pdev->dev,
-						 buffer_info->dma_ptr,
-						 buffer_info->buffer_length,
-						 DMA_FROM_DEVICE);
-				buffer_info->dma_ptr = 0;
-				buffer_info->buffer_length = 0;
-			}
-			buffer_info->skb = NULL;
-			packet_length = RX_DESC_DATA0_FRAME_LENGTH_GET_
-					(le32_to_cpu(descriptor->data0));
-			skb_put(skb, packet_length - 4);
-			skb->protocol = eth_type_trans(skb,
-						       rx->adapter->netdev);
-			lan743x_rx_init_ring_element(rx, first_index, new_skb);
-		} else {
-			int index = first_index;
-
-			/* multi buffer packet not supported */
-			/* this should not happen since
-			 * buffers are allocated to be at least jumbo size
-			 */
-
-			/* clean up buffers */
-			if (first_index <= last_index) {
-				while ((index >= first_index) &&
-				       (index <= last_index)) {
-					lan743x_rx_reuse_ring_element(rx,
-								      index);
-					index = lan743x_rx_next_index(rx,
-								      index);
-				}
-			} else {
-				while ((index >= first_index) ||
-				       (index <= last_index)) {
-					lan743x_rx_reuse_ring_element(rx,
-								      index);
-					index = lan743x_rx_next_index(rx,
-								      index);
-				}
-			}
-		}
-
-process_extension:
-		if (extension_index >= 0) {
-			descriptor = &rx->ring_cpu_ptr[extension_index];
-			buffer_info = &rx->buffer_info[extension_index];
-
-			ts_sec = le32_to_cpu(descriptor->data1);
-			ts_nsec = (le32_to_cpu(descriptor->data2) &
-				  RX_DESC_DATA2_TS_NS_MASK_);
-			lan743x_rx_reuse_ring_element(rx, extension_index);
-			real_last_index = extension_index;
-		}
-
-		if (!skb) {
-			result = RX_PROCESS_RESULT_PACKET_DROPPED;
-			goto move_forward;
-		}
-
-		if (extension_index < 0)
-			goto pass_packet_to_os;
-		hwtstamps = skb_hwtstamps(skb);
-		if (hwtstamps)
-			hwtstamps->hwtstamp = ktime_set(ts_sec, ts_nsec);
-
-pass_packet_to_os:
-		/* pass packet to OS */
-		napi_gro_receive(&rx->napi, skb);
-		result = RX_PROCESS_RESULT_PACKET_RECEIVED;
-
-move_forward:
-		/* push tail and head forward */
-		rx->last_tail = real_last_index;
-		rx->last_head = lan743x_rx_next_index(rx, real_last_index);
-	}
+	if (rx->last_head == current_head_index)
+		goto done;
+
+	descriptor = &rx->ring_cpu_ptr[rx->last_head];
+	if (le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_OWN_)
+		goto done;
+	buffer_info = &rx->buffer_info[rx->last_head];
+
+	is_last = le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_LS_;
+	is_first = le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_FS_;
+
+	if (is_last && le32_to_cpu(descriptor->data0) & RX_DESC_DATA0_EXT_) {
+		/* extension is expected to follow */
+		int index = lan743x_rx_next_index(rx, rx->last_head);
+
+		if (index == current_head_index)
+			/* extension not yet available */
+			goto done;
+		desc_ext = &rx->ring_cpu_ptr[index];
+		if (le32_to_cpu(desc_ext->data0) & RX_DESC_DATA0_OWN_)
+			/* extension not yet available */
+			goto done;
+		if (!(le32_to_cpu(desc_ext->data0) & RX_DESC_DATA0_EXT_))
+			goto move_forward;
+		extension_index = index;
+	}
+
+	/* Only the last buffer in a multi-buffer frame contains the total frame
+	 * length. The chip occasionally sends more buffers than strictly
+	 * required to reach the total frame length.
+	 * Handle this by adding all buffers to the skb in their entirety.
+	 * Once the real frame length is known, trim the skb.
+	 */
+	frame_length =
+		RX_DESC_DATA0_FRAME_LENGTH_GET_(le32_to_cpu(descriptor->data0));
+	buffer_length = buffer_info->buffer_length;
+
+	netdev_dbg(netdev, "%s%schunk: %d/%d",
+		   is_first ? "first " : "      ",
+		   is_last  ? "last  " : "      ",
+		   frame_length, buffer_length);
+
+	/* save existing skb, allocate new skb and map to dma */
+	skb = buffer_info->skb;
+	if (lan743x_rx_init_ring_element(rx, rx->last_head)) {
+		/* failed to allocate next skb.
+		 * Memory is very low.
+		 * Drop this packet and reuse buffer.
+		 */
+		lan743x_rx_reuse_ring_element(rx, rx->last_head);
+		/* drop packet that was being assembled */
+		dev_kfree_skb_irq(rx->skb_head);
+		rx->skb_head = NULL;
+		goto process_extension;
+	}
+
+	/* add buffers to skb via skb->frag_list */
+	if (is_first) {
+		skb_reserve(skb, RX_HEAD_PADDING);
+		skb_put(skb, buffer_length - RX_HEAD_PADDING);
+		if (rx->skb_head)
+			dev_kfree_skb_irq(rx->skb_head);
+		rx->skb_head = skb;
+	} else if (rx->skb_head) {
+		skb_put(skb, buffer_length);
+		if (skb_shinfo(rx->skb_head)->frag_list)
+			rx->skb_tail->next = skb;
+		else
+			skb_shinfo(rx->skb_head)->frag_list = skb;
+		rx->skb_tail = skb;
+		rx->skb_head->len += skb->len;
+		rx->skb_head->data_len += skb->len;
+		rx->skb_head->truesize += skb->truesize;
+	} else {
+		/* packet to assemble has already been dropped because one or
+		 * more of its buffers could not be allocated
+		 */
+		netdev_dbg(netdev, "drop buffer intended for dropped packet");
+		dev_kfree_skb_irq(skb);
+	}
+
+process_extension:
+	if (extension_index >= 0) {
+		u32 ts_sec;
+		u32 ts_nsec;
+
+		ts_sec = le32_to_cpu(desc_ext->data1);
+		ts_nsec = (le32_to_cpu(desc_ext->data2) &
+			  RX_DESC_DATA2_TS_NS_MASK_);
+		if (rx->skb_head)
+			skb_hwtstamps(rx->skb_head)->hwtstamp =
+				ktime_set(ts_sec, ts_nsec);
+		lan743x_rx_reuse_ring_element(rx, extension_index);
+		rx->last_head = extension_index;
+		netdev_dbg(netdev, "process extension");
+	}
+
+	if (is_last && rx->skb_head)
+		rx->skb_head = lan743x_rx_trim_skb(rx->skb_head, frame_length);
+
+	if (is_last && rx->skb_head) {
+		rx->skb_head->protocol = eth_type_trans(rx->skb_head,
+							rx->adapter->netdev);
+		netdev_dbg(netdev, "sending %d byte frame to OS",
+			   rx->skb_head->len);
+		napi_gro_receive(&rx->napi, rx->skb_head);
+		rx->skb_head = NULL;
+	}
+
+move_forward:
+	/* push tail and head forward */
+	rx->last_tail = rx->last_head;
+	rx->last_head = lan743x_rx_next_index(rx, rx->last_head);
+	result = RX_PROCESS_RESULT_BUFFER_RECEIVED;
 done:
 	return result;
 }
@@ -2213,12 +2181,12 @@ static int lan743x_rx_napi_poll(struct napi_struct *napi, int weight)
 				       DMAC_INT_BIT_RXFRM_(rx->channel_number));
 	}
 	for (count = 0; count < weight; count++) {
-		result = lan743x_rx_process_packet(rx);
+		result = lan743x_rx_process_buffer(rx);
 		if (result == RX_PROCESS_RESULT_NOTHING_TO_DO)
 			break;
 	}
 	rx->frame_count += count;
-	if (count == weight || result == RX_PROCESS_RESULT_PACKET_RECEIVED)
+	if (count == weight || result == RX_PROCESS_RESULT_BUFFER_RECEIVED)
 		return weight;
 
 	if (!napi_complete_done(napi, count))
@@ -2330,9 +2298,7 @@ static int lan743x_rx_ring_init(struct lan743x_rx *rx)
 	rx->last_head = 0;
 	for (index = 0; index < rx->ring_size; index++) {
-		struct sk_buff *new_skb = lan743x_rx_allocate_skb(rx);
-
-		ret = lan743x_rx_init_ring_element(rx, index, new_skb);
+		ret = lan743x_rx_init_ring_element(rx, index);
 		if (ret)
 			goto cleanup;
 	}
drivers/net/ethernet/microchip/lan743x.h
@@ -699,6 +699,8 @@ struct lan743x_rx {
 	struct napi_struct napi;
 
 	u32 frame_count;
+
+	struct sk_buff *skb_head, *skb_tail;
 };
 
 struct lan743x_adapter {
@@ -831,8 +833,7 @@ struct lan743x_rx_buffer_info {
 #define LAN743X_RX_RING_SIZE		(65)
 
 #define RX_PROCESS_RESULT_NOTHING_TO_DO     (0)
-#define RX_PROCESS_RESULT_PACKET_RECEIVED   (1)
-#define RX_PROCESS_RESULT_PACKET_DROPPED    (2)
+#define RX_PROCESS_RESULT_BUFFER_RECEIVED   (1)
 
 u32 lan743x_csr_read(struct lan743x_adapter *adapter, int offset);
 void lan743x_csr_write(struct lan743x_adapter *adapter, int offset, u32 data);