Commit 1943d8ba authored by Jesse Brandeburg, committed by Jeff Kirsher

i40e/i40evf: enable hardware feature head write back

The hardware supports a feature that avoids marking each descriptor
in the ring with a DD bit to report completion; instead, the hardware
writes the position up to which the driver should clean to a single
memory location (head write-back).  Enable this feature.

Change-ID: I5da4e0681f0b581a6401c950a81808792267fe57
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Mitch Williams <mitch.a.williams@intel.com>
Signed-off-by: Catherine Sullivan <catherine.sullivan@intel.com>
Tested-by: Kavindya Deegala <kavindya.s.deegala@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
parent 6c167f58
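
The diff below touches three areas: the Tx queue context (PF and VF paths) gains head_wb_ena and head_wb_addr, the ring allocation grows by one u32 to hold the written-back head, and the clean routine compares against that head instead of checking per-descriptor DD bits. As a rough sketch of the resulting memory layout, here is a standalone toy model (illustrative only; the toy_* names are made up and the kernel's le32_to_cpu byte-swap is omitted):

/* Toy model of the head write-back layout: the DMA area holds
 * 'count' descriptors followed by one 32-bit slot that the hardware
 * overwrites with the index it has completed up to.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_tx_desc {
	uint64_t buffer_addr;
	uint64_t cmd_type_offset_bsz;
};

struct toy_ring {
	void *desc;		/* descriptors + trailing head slot */
	uint32_t count;		/* number of descriptors */
};

/* mirrors i40e_get_head() below: the head slot sits just past the
 * last descriptor (the driver additionally converts from LE)
 */
static uint32_t toy_get_head(const struct toy_ring *r)
{
	const volatile uint32_t *head = (const volatile uint32_t *)
		((const struct toy_tx_desc *)r->desc + r->count);

	return *head;
}

int main(void)
{
	struct toy_ring r = { .count = 512 };
	size_t size = r.count * sizeof(struct toy_tx_desc) + sizeof(uint32_t);

	r.desc = calloc(1, size);
	/* pretend the hardware reports it has consumed up to index 5 */
	*(uint32_t *)((struct toy_tx_desc *)r.desc + r.count) = 5;
	printf("head = %u\n", toy_get_head(&r));	/* prints: head = 5 */
	free(r.desc);
	return 0;
}

The single trailing u32 is why the descriptor setup routines below add sizeof(u32) before rounding the allocation up to 4K.
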
@@ -2181,6 +2181,11 @@ static int i40e_configure_tx_ring(struct i40e_ring *ring)
 	tx_ctx.fd_ena = !!(vsi->back->flags & (I40E_FLAG_FD_SB_ENABLED |
 					       I40E_FLAG_FD_ATR_ENABLED));
 	tx_ctx.timesync_ena = !!(vsi->back->flags & I40E_FLAG_PTP);
+	/* FDIR VSI tx ring can still use RS bit and writebacks */
+	if (vsi->type != I40E_VSI_FDIR)
+		tx_ctx.head_wb_ena = 1;
+	tx_ctx.head_wb_addr = ring->dma +
+			      (ring->count * sizeof(struct i40e_tx_desc));
 
 	/* As part of VSI creation/update, FW allocates certain
 	 * Tx arbitration queue sets for each TC enabled for
@@ -618,6 +618,20 @@ static bool i40e_check_tx_hang(struct i40e_ring *tx_ring)
 	return ret;
 }
 
+/**
+ * i40e_get_head - Retrieve head from head writeback
+ * @tx_ring: tx ring to fetch head of
+ *
+ * Returns value of Tx ring head based on value stored
+ * in head write-back location
+ **/
+static inline u32 i40e_get_head(struct i40e_ring *tx_ring)
+{
+	void *head = (struct i40e_tx_desc *)tx_ring->desc + tx_ring->count;
+
+	return le32_to_cpu(*(volatile __le32 *)head);
+}
+
 /**
  * i40e_clean_tx_irq - Reclaim resources after transmit completes
  * @tx_ring: tx ring to clean
@@ -629,6 +643,7 @@ static bool i40e_clean_tx_irq(struct i40e_ring *tx_ring, int budget)
 {
 	u16 i = tx_ring->next_to_clean;
 	struct i40e_tx_buffer *tx_buf;
+	struct i40e_tx_desc *tx_head;
 	struct i40e_tx_desc *tx_desc;
 	unsigned int total_packets = 0;
 	unsigned int total_bytes = 0;
@@ -637,6 +652,8 @@ static bool i40e_clean_tx_irq(struct i40e_ring *tx_ring, int budget)
 	tx_desc = I40E_TX_DESC(tx_ring, i);
 	i -= tx_ring->count;
 
+	tx_head = I40E_TX_DESC(tx_ring, i40e_get_head(tx_ring));
+
 	do {
 		struct i40e_tx_desc *eop_desc = tx_buf->next_to_watch;
@@ -647,9 +664,8 @@ static bool i40e_clean_tx_irq(struct i40e_ring *tx_ring, int budget)
 		/* prevent any other reads prior to eop_desc */
 		read_barrier_depends();
 
-		/* if the descriptor isn't done, no work yet to do */
-		if (!(eop_desc->cmd_type_offset_bsz &
-		      cpu_to_le64(I40E_TX_DESC_DTYPE_DESC_DONE)))
+		/* we have caught up to head, no work left to do */
+		if (tx_head == tx_desc)
 			break;
 
 		/* clear next_to_watch to prevent false hangs */
@@ -905,6 +921,10 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
 	/* round up to nearest 4K */
 	tx_ring->size = tx_ring->count * sizeof(struct i40e_tx_desc);
+	/* add u32 for head writeback, align after this takes care of
+	 * guaranteeing this is at least one cache line in size
+	 */
+	tx_ring->size += sizeof(u32);
 	tx_ring->size = ALIGN(tx_ring->size, 4096);
 	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
 					   &tx_ring->dma, GFP_KERNEL);
@@ -2042,9 +2062,23 @@ static void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
 		tx_bi = &tx_ring->tx_bi[i];
 	}
 
-	tx_desc->cmd_type_offset_bsz =
-		build_ctob(td_cmd, td_offset, size, td_tag) |
-		cpu_to_le64((u64)I40E_TXD_CMD << I40E_TXD_QW1_CMD_SHIFT);
+	/* Place RS bit on last descriptor of any packet that spans across the
+	 * 4th descriptor (WB_STRIDE aka 0x3) in a 64B cacheline.
+	 */
+#define WB_STRIDE 0x3
+	if (((i & WB_STRIDE) != WB_STRIDE) &&
+	    (first <= &tx_ring->tx_bi[i]) &&
+	    (first >= &tx_ring->tx_bi[i & ~WB_STRIDE])) {
+		tx_desc->cmd_type_offset_bsz =
+			build_ctob(td_cmd, td_offset, size, td_tag) |
+			cpu_to_le64((u64)I40E_TX_DESC_CMD_EOP <<
+				    I40E_TXD_QW1_CMD_SHIFT);
+	} else {
+		tx_desc->cmd_type_offset_bsz =
+			build_ctob(td_cmd, td_offset, size, td_tag) |
+			cpu_to_le64((u64)I40E_TXD_CMD <<
+				    I40E_TXD_QW1_CMD_SHIFT);
+	}
 
 	netdev_tx_sent_queue(netdev_get_tx_queue(tx_ring->netdev,
 						 tx_ring->queue_index),
@@ -230,6 +230,9 @@ static int i40e_config_vsi_tx_queue(struct i40e_vf *vf, u16 vsi_idx,
 	tx_ctx.qlen = info->ring_len;
 	tx_ctx.rdylist = le16_to_cpu(pf->vsi[vsi_idx]->info.qs_handle[0]);
 	tx_ctx.rdylist_act = 0;
+	tx_ctx.head_wb_ena = 1;
+	tx_ctx.head_wb_addr = info->dma_ring_addr +
+			      (info->ring_len * sizeof(struct i40e_tx_desc));
 
 	/* clear the context in the HMC */
 	ret = i40e_clear_lan_tx_queue_context(hw, pf_queue_id);
@@ -169,6 +169,20 @@ static bool i40e_check_tx_hang(struct i40e_ring *tx_ring)
 	return ret;
 }
 
+/**
+ * i40e_get_head - Retrieve head from head writeback
+ * @tx_ring: tx ring to fetch head of
+ *
+ * Returns value of Tx ring head based on value stored
+ * in head write-back location
+ **/
+static inline u32 i40e_get_head(struct i40e_ring *tx_ring)
+{
+	void *head = (struct i40e_tx_desc *)tx_ring->desc + tx_ring->count;
+
+	return le32_to_cpu(*(volatile __le32 *)head);
+}
+
 /**
  * i40e_clean_tx_irq - Reclaim resources after transmit completes
  * @tx_ring: tx ring to clean
@@ -180,6 +194,7 @@ static bool i40e_clean_tx_irq(struct i40e_ring *tx_ring, int budget)
 {
 	u16 i = tx_ring->next_to_clean;
 	struct i40e_tx_buffer *tx_buf;
+	struct i40e_tx_desc *tx_head;
 	struct i40e_tx_desc *tx_desc;
 	unsigned int total_packets = 0;
 	unsigned int total_bytes = 0;
@@ -188,6 +203,8 @@ static bool i40e_clean_tx_irq(struct i40e_ring *tx_ring, int budget)
 	tx_desc = I40E_TX_DESC(tx_ring, i);
 	i -= tx_ring->count;
 
+	tx_head = I40E_TX_DESC(tx_ring, i40e_get_head(tx_ring));
+
 	do {
 		struct i40e_tx_desc *eop_desc = tx_buf->next_to_watch;
@@ -198,9 +215,8 @@ static bool i40e_clean_tx_irq(struct i40e_ring *tx_ring, int budget)
 		/* prevent any other reads prior to eop_desc */
 		read_barrier_depends();
 
-		/* if the descriptor isn't done, no work yet to do */
-		if (!(eop_desc->cmd_type_offset_bsz &
-		      cpu_to_le64(I40E_TX_DESC_DTYPE_DESC_DONE)))
+		/* we have caught up to head, no work left to do */
+		if (tx_head == tx_desc)
 			break;
 
 		/* clear next_to_watch to prevent false hangs */
@@ -432,6 +448,10 @@ int i40evf_setup_tx_descriptors(struct i40e_ring *tx_ring)
 	/* round up to nearest 4K */
 	tx_ring->size = tx_ring->count * sizeof(struct i40e_tx_desc);
+	/* add u32 for head writeback, align after this takes care of
+	 * guaranteeing this is at least one cache line in size
+	 */
+	tx_ring->size += sizeof(u32);
 	tx_ring->size = ALIGN(tx_ring->size, 4096);
 	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
 					   &tx_ring->dma, GFP_KERNEL);
@@ -1377,9 +1397,23 @@ static void i40e_tx_map(struct i40e_ring *tx_ring, struct sk_buff *skb,
 		tx_bi = &tx_ring->tx_bi[i];
 	}
 
-	tx_desc->cmd_type_offset_bsz =
-		build_ctob(td_cmd, td_offset, size, td_tag) |
-		cpu_to_le64((u64)I40E_TXD_CMD << I40E_TXD_QW1_CMD_SHIFT);
+	/* Place RS bit on last descriptor of any packet that spans across the
+	 * 4th descriptor (WB_STRIDE aka 0x3) in a 64B cacheline.
+	 */
+#define WB_STRIDE 0x3
+	if (((i & WB_STRIDE) != WB_STRIDE) &&
+	    (first <= &tx_ring->tx_bi[i]) &&
+	    (first >= &tx_ring->tx_bi[i & ~WB_STRIDE])) {
+		tx_desc->cmd_type_offset_bsz =
+			build_ctob(td_cmd, td_offset, size, td_tag) |
+			cpu_to_le64((u64)I40E_TX_DESC_CMD_EOP <<
+				    I40E_TXD_QW1_CMD_SHIFT);
+	} else {
+		tx_desc->cmd_type_offset_bsz =
+			build_ctob(td_cmd, td_offset, size, td_tag) |
+			cpu_to_le64((u64)I40E_TXD_CMD <<
+				    I40E_TXD_QW1_CMD_SHIFT);
+	}
 
 	netdev_tx_sent_queue(netdev_get_tx_queue(tx_ring->netdev,
 						 tx_ring->queue_index),
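
The RS-bit placement added in the i40e_tx_map() hunks above can be read as a predicate on the packet's first and last descriptor indexes. Below is a hypothetical restatement (toy_needs_rs() is not driver code; the driver compares pointers into tx_bi[], which handles the ring-wrap case the same way these unsigned index comparisons do):

/* Hypothetical restatement of the RS-bit rule: with head write-back
 * enabled, RS is only requested when the packet's last descriptor
 * lands on the 4th slot of a 4-descriptor (64B cacheline) group, or
 * when the packet spans a group boundary; otherwise EOP alone is set.
 */
#include <stdbool.h>
#include <stdio.h>

#define WB_STRIDE 0x3

static bool toy_needs_rs(unsigned int first, unsigned int last)
{
	/* skip RS only when the whole packet sits inside one group of
	 * four descriptors and does not end on the group's last slot
	 */
	bool inside_one_group = (first >= (last & ~WB_STRIDE)) &&
				(first <= last);
	bool ends_on_4th_slot = (last & WB_STRIDE) == WB_STRIDE;

	return ends_on_4th_slot || !inside_one_group;
}

int main(void)
{
	printf("%d\n", toy_needs_rs(4, 5));	/* 0: slots 4..5, ends mid-group */
	printf("%d\n", toy_needs_rs(4, 7));	/* 1: ends on the group's 4th slot */
	printf("%d\n", toy_needs_rs(2, 5));	/* 1: spans the 3/4 group boundary */
	return 0;
}

Setting RS more sparsely fits the rest of the patch: completion is now read from the head write-back location via i40e_get_head() rather than from per-descriptor DD write-backs.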