Commit cb7db356 authored by Brett Creeley, committed by Jeff Kirsher

ice: Only bump Rx tail and release buffers once per napi_poll

Currently we bump the Rx tail and release/give buffers to hardware every
16 descriptors. This causes us to bump Rx tail up to 4 times per
napi_poll call. Also we are always bumping tail on an odd index and this
is a problem because hardware ignores the lower 3 bits in the QRX_TAIL
register. This is making it so hardware sees tail bumps only every 8
descriptors. Instead, let's only bump Rx tail once per napi_poll if
the value aligns with hardware's expectations of the lower 3 bits being
cleared. Also only release/give Rx buffers once per napi_poll call.
Signed-off-by: Brett Creeley <brett.creeley@intel.com>
Tested-by: Andrew Bowers <andrewx.bowers@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
parent c7aeb4d1
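
[Editor's note: for illustration only, here is a minimal user-space sketch of the alignment rule the patch relies on: because QRX_TAIL ignores the lower 3 bits, the new next_to_use value is masked with ~0x7 and the tail register is written only when that aligned value has actually moved. Everything below (fake_tail_reg, release_rx_desc_sketch, the sample values) is hypothetical scaffolding, not ice driver code; the real function issues wmb() and writel() instead of a plain assignment.]

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t fake_tail_reg;   /* stands in for the QRX_TAIL register */
    static uint32_t next_to_use;     /* stands in for rx_ring->next_to_use  */
    static unsigned int tail_writes; /* counts the tail writes we would do  */

    /* Mirrors the logic of the patched ice_release_rx_desc(): remember the
     * previous next_to_use, mask the new value down to an 8-descriptor
     * boundary, and skip the tail write when nothing meaningful changed.
     */
    static void release_rx_desc_sketch(uint32_t val)
    {
            uint32_t prev_ntu = next_to_use;

            next_to_use = val;

            /* hardware ignores the lower 3 bits of QRX_TAIL */
            val &= ~(uint32_t)0x7;
            if (prev_ntu != val) {
                    /* the driver does wmb() + writel() here */
                    fake_tail_reg = val;
                    tail_writes++;
            }
    }

    int main(void)
    {
            release_rx_desc_sketch(6);  /* 6 & ~7 == 0 == prev, no tail write   */
            release_rx_desc_sketch(16); /* 16 & ~7 == 16 != 6, tail moves to 16 */
            release_rx_desc_sketch(23); /* 23 & ~7 == 16 == prev, no tail write */
            release_rx_desc_sketch(32); /* 32 & ~7 == 32 != 23, tail moves to 32 */

            printf("tail=%u writes=%u\n", fake_tail_reg, tail_writes);
            return 0;
    }

The sample run performs only two simulated tail writes for four refill attempts, which is the behavior the patch aims for: at most one meaningful, 8-aligned tail bump per refill pass.
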
@@ -377,18 +377,28 @@ int ice_setup_rx_ring(struct ice_ring *rx_ring)
  */
 static void ice_release_rx_desc(struct ice_ring *rx_ring, u32 val)
 {
+	u16 prev_ntu = rx_ring->next_to_use;
+
 	rx_ring->next_to_use = val;
 
 	/* update next to alloc since we have filled the ring */
 	rx_ring->next_to_alloc = val;
 
-	/* Force memory writes to complete before letting h/w
-	 * know there are new descriptors to fetch. (Only
-	 * applicable for weak-ordered memory model archs,
-	 * such as IA-64).
+	/* QRX_TAIL will be updated with any tail value, but hardware ignores
+	 * the lower 3 bits. This makes it so we only bump tail on meaningful
+	 * boundaries. Also, this allows us to bump tail on intervals of 8 up to
+	 * the budget depending on the current traffic load.
 	 */
-	wmb();
-	writel(val, rx_ring->tail);
+	val &= ~0x7;
+	if (prev_ntu != val) {
+		/* Force memory writes to complete before letting h/w
+		 * know there are new descriptors to fetch. (Only
+		 * applicable for weak-ordered memory model archs,
+		 * such as IA-64).
+		 */
+		wmb();
+		writel(val, rx_ring->tail);
+	}
 }
 
 /**
@@ -445,7 +455,13 @@ ice_alloc_mapped_page(struct ice_ring *rx_ring, struct ice_rx_buf *bi)
  * @rx_ring: ring to place buffers on
  * @cleaned_count: number of buffers to replace
  *
- * Returns false if all allocations were successful, true if any fail
+ * Returns false if all allocations were successful, true if any fail. Returning
+ * true signals to the caller that we didn't replace cleaned_count buffers and
+ * there is more work to do.
+ *
+ * First, try to clean "cleaned_count" Rx buffers. Then refill the cleaned Rx
+ * buffers. Then bump tail at most one time. Grouping like this lets us avoid
+ * multiple tail writes per call.
  */
 bool ice_alloc_rx_bufs(struct ice_ring *rx_ring, u16 cleaned_count)
 {
@@ -990,7 +1006,7 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget)
 {
 	unsigned int total_rx_bytes = 0, total_rx_pkts = 0;
 	u16 cleaned_count = ICE_DESC_UNUSED(rx_ring);
-	bool failure = false;
+	bool failure;
 
 	/* start the loop to process Rx packets bounded by 'budget' */
 	while (likely(total_rx_pkts < (unsigned int)budget)) {
@@ -1002,13 +1018,6 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget)
 		u16 vlan_tag = 0;
 		u8 rx_ptype;
 
-		/* return some buffers to hardware, one at a time is too slow */
-		if (cleaned_count >= ICE_RX_BUF_WRITE) {
-			failure = failure ||
-				  ice_alloc_rx_bufs(rx_ring, cleaned_count);
-			cleaned_count = 0;
-		}
-
 		/* get the Rx desc from Rx ring based on 'next_to_clean' */
 		rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean);
@@ -1085,6 +1094,9 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget)
 		total_rx_pkts++;
 	}
 
+	/* return up to cleaned_count buffers to hardware */
+	failure = ice_alloc_rx_bufs(rx_ring, cleaned_count);
+
 	/* update queue and vector specific stats */
 	u64_stats_update_begin(&rx_ring->syncp);
 	rx_ring->stats.pkts += total_rx_pkts;