Merge branch 'sfc-encap-offloads-on-EF10'

Edward Cree says: ==================== sfc: encap offloads on EF10 EF10 NICs from the 8000 series onwards support TX offloads (checksumming, TSO) on VXLAN- and NVGRE-encapsulated packets. This series adds driver support for these offloads. Changes from v1: * Fix 'no TXQ of type' error handling in patch #1 (and clear up the misleading comment that inspired the wrong version) * Add comment in patch #5 explaining what the deal with TSOv3 is ==================== Signed-off-by: David S. Miller <davem@davemloft.net>

Merge branch 'sfc-encap-offloads-on-EF10'
Edward Cree says: ==================== sfc: encap offloads on EF10 EF10 NICs from the 8000 series onwards support TX offloads (checksumming, TSO) on VXLAN- and NVGRE-encapsulated packets. This series adds driver support for these offloads. Changes from v1: * Fix 'no TXQ of type' error handling in patch #1 (and clear up the misleading comment that inspired the wrong version) * Add comment in patch #5 explaining what the deal with TSOv3 is ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
6b5525c8 · David S. Miller · 4a681bf3 · 24b2c375 · 6b5525c8 · 6b5525c8
Commit 6b5525c8 authored Sep 11, 2020 by David S. Miller
18 changed files
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -601,9 +601,13 @@ static int efx_ef10_probe(struct efx_nic *efx)
 	efx_ef10_read_licensed_features(efx);

 	/* We can have one VI for each vi_stride-byte region.
-	 * However, until we use TX option descriptors we need two TX queues
-	 * per channel.
+	 * However, until we use TX option descriptors we need up to four
+	 * TX queues per channel for different checksumming combinations.
 	 */
+	if (nic_data->datapath_caps &
+	    (1 << MC_CMD_GET_CAPABILITIES_OUT_VXLAN_NVGRE_LBN))
+		efx->tx_queues_per_channel = 4;
+	else
 		efx->tx_queues_per_channel = 2;
 	efx->max_vis = efx_ef10_mem_map_size(efx) / efx->vi_stride;
 	if (!efx->max_vis) {
@@ -1300,6 +1304,7 @@ static void efx_ef10_fini_nic(struct efx_nic *efx)
 static int efx_ef10_init_nic(struct efx_nic *efx)
 {
 	struct efx_ef10_nic_data *nic_data = efx->nic_data;
+	netdev_features_t hw_enc_features = 0;
 	int rc;

 	if (nic_data->must_check_datapath_caps) {
@@ -1344,6 +1349,21 @@ static int efx_ef10_init_nic(struct efx_nic *efx)
 		nic_data->must_restore_piobufs = false;
 	}

+	/* add encapsulated checksum offload features */
+	if (efx_has_cap(efx, VXLAN_NVGRE) && !efx_ef10_is_vf(efx))
+		hw_enc_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
+	/* add encapsulated TSO features */
+	if (efx_has_cap(efx, TX_TSO_V2_ENCAP)) {
+		netdev_features_t encap_tso_features;
+
+		encap_tso_features = NETIF_F_GSO_UDP_TUNNEL | NETIF_F_GSO_GRE |
+			NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM;
+
+		hw_enc_features |= encap_tso_features | NETIF_F_TSO;
+		efx->net_dev->features |= encap_tso_features;
+	}
+	efx->net_dev->hw_enc_features = hw_enc_features;
+
 	/* don't fail init if RSS setup doesn't work */
 	rc = efx->type->rx_push_rss_config(efx, false,
 					   efx->rss_context.rx_indir_table, NULL);
@@ -2146,6 +2166,9 @@ static int efx_ef10_irq_test_generate(struct efx_nic *efx)

 static int efx_ef10_tx_probe(struct efx_tx_queue *tx_queue)
 {
+	/* low two bits of label are what we want for type */
+	BUILD_BUG_ON((EFX_TXQ_TYPE_OUTER_CSUM | EFX_TXQ_TYPE_INNER_CSUM) != 3);
+	tx_queue->type = tx_queue->label & 3;
 	return efx_nic_alloc_buffer(tx_queue->efx, &tx_queue->txd.buf,
 				    (tx_queue->ptr_mask + 1) *
 				    sizeof(efx_qword_t),
@@ -2168,15 +2191,15 @@ static inline void efx_ef10_push_tx_desc(struct efx_tx_queue *tx_queue,

 /* Add Firmware-Assisted TSO v2 option descriptors to a queue.
 */
-static int efx_ef10_tx_tso_desc(struct efx_tx_queue *tx_queue,
-				struct sk_buff *skb,
+int efx_ef10_tx_tso_desc(struct efx_tx_queue *tx_queue, struct sk_buff *skb,
 			 bool *data_mapped)
 {
 	struct efx_tx_buffer *buffer;
+	u16 inner_ipv4_id = 0;
+	u16 outer_ipv4_id = 0;
 	struct tcphdr *tcp;
 	struct iphdr *ip;
-
-	u16 ipv4_id;
+	u16 ip_tot_len;
 	u32 seqnum;
 	u32 mss;

@@ -2189,21 +2212,43 @@ static int efx_ef10_tx_tso_desc(struct efx_tx_queue *tx_queue,
 		return -EINVAL;
 	}

+	if (skb->encapsulation) {
+		if (!tx_queue->tso_encap)
+			return -EINVAL;
+		ip = ip_hdr(skb);
+		if (ip->version == 4)
+			outer_ipv4_id = ntohs(ip->id);
+
+		ip = inner_ip_hdr(skb);
+		tcp = inner_tcp_hdr(skb);
+	} else {
 		ip = ip_hdr(skb);
+		tcp = tcp_hdr(skb);
+	}
+
+	/* 8000-series EF10 hardware requires that IP Total Length be
+	 * greater than or equal to the value it will have in each segment
+	 * (which is at most mss + 208 + TCP header length), but also less
+	 * than (0x10000 - inner_network_header).  Otherwise the TCP
+	 * checksum calculation will be broken for encapsulated packets.
+	 * We fill in ip->tot_len with 0xff30, which should satisfy the
+	 * first requirement unless the MSS is ridiculously large (which
+	 * should be impossible as the driver max MTU is 9216); it is
+	 * guaranteed to satisfy the second as we only attempt TSO if
+	 * inner_network_header <= 208.
+	 */
+	ip_tot_len = -EFX_TSO2_MAX_HDRLEN;
+	EFX_WARN_ON_ONCE_PARANOID(mss + EFX_TSO2_MAX_HDRLEN +
+				  (tcp->doff << 2u) > ip_tot_len);
+
 	if (ip->version == 4) {
-		/* Modify IPv4 header if needed. */
-		ip->tot_len = 0;
+		ip->tot_len = htons(ip_tot_len);
 		ip->check = 0;
-		ipv4_id = ntohs(ip->id);
+		inner_ipv4_id = ntohs(ip->id);
 	} else {
-		/* Modify IPv6 header if needed. */
-		struct ipv6hdr *ipv6 = ipv6_hdr(skb);
-
-		ipv6->payload_len = 0;
-		ipv4_id = 0;
+		((struct ipv6hdr *)ip)->payload_len = htons(ip_tot_len);
 	}

-	tcp = tcp_hdr(skb);
 	seqnum = ntohl(tcp->seq);

 	buffer = efx_tx_queue_get_insert_buffer(tx_queue);
@@ -2216,7 +2261,7 @@ static int efx_ef10_tx_tso_desc(struct efx_tx_queue *tx_queue,
 			ESF_DZ_TX_OPTION_TYPE, ESE_DZ_TX_OPTION_DESC_TSO,
 			ESF_DZ_TX_TSO_OPTION_TYPE,
 			ESE_DZ_TX_TSO_OPTION_DESC_FATSO2A,
-			ESF_DZ_TX_TSO_IP_ID, ipv4_id,
+			ESF_DZ_TX_TSO_IP_ID, inner_ipv4_id,
 			ESF_DZ_TX_TSO_TCP_SEQNO, seqnum
 			);
 	++tx_queue->insert_count;
@@ -2226,11 +2271,12 @@ static int efx_ef10_tx_tso_desc(struct efx_tx_queue *tx_queue,
 	buffer->flags = EFX_TX_BUF_OPTION;
 	buffer->len = 0;
 	buffer->unmap_len = 0;
-	EFX_POPULATE_QWORD_4(buffer->option,
+	EFX_POPULATE_QWORD_5(buffer->option,
 			ESF_DZ_TX_DESC_IS_OPT, 1,
 			ESF_DZ_TX_OPTION_TYPE, ESE_DZ_TX_OPTION_DESC_TSO,
 			ESF_DZ_TX_TSO_OPTION_TYPE,
 			ESE_DZ_TX_TSO_OPTION_DESC_FATSO2B,
+			ESF_DZ_TX_TSO_OUTER_IPID, outer_ipv4_id,
 			ESF_DZ_TX_TSO_TCP_MSS, mss
 			);
 	++tx_queue->insert_count;
@@ -2254,11 +2300,11 @@ static u32 efx_ef10_tso_versions(struct efx_nic *efx)

 static void efx_ef10_tx_init(struct efx_tx_queue *tx_queue)
 {
-	bool csum_offload = tx_queue->label & EFX_TXQ_TYPE_OFFLOAD;
+	bool csum_offload = tx_queue->type & EFX_TXQ_TYPE_OUTER_CSUM;
+	bool inner_csum = tx_queue->type & EFX_TXQ_TYPE_INNER_CSUM;
 	struct efx_channel *channel = tx_queue->channel;
 	struct efx_nic *efx = tx_queue->efx;
 	struct efx_ef10_nic_data *nic_data;
-	bool tso_v2 = false;
 	efx_qword_t *txd;
 	int rc;

@@ -2281,15 +2327,18 @@ static void efx_ef10_tx_init(struct efx_tx_queue *tx_queue)
 	 * TSOv2 cannot be used with Hardware timestamping, and is never needed
 	 * for XDP tx.
 	 */
-	if (csum_offload && (nic_data->datapath_caps2 &
-			(1 << MC_CMD_GET_CAPABILITIES_V2_OUT_TX_TSO_V2_LBN)) &&
+	if (efx_has_cap(efx, TX_TSO_V2)) {
+		if ((csum_offload || inner_csum) &&
 		    !tx_queue->timestamping && !tx_queue->xdp_tx) {
-		tso_v2 = true;
+			tx_queue->tso_version = 2;
 			netif_dbg(efx, hw, efx->net_dev, "Using TSOv2 for channel %u\n",
 				  channel->channel);
 		}
+	} else if (efx_has_cap(efx, TX_TSO)) {
+		tx_queue->tso_version = 1;
+	}

-	rc = efx_mcdi_tx_init(tx_queue, tso_v2);
+	rc = efx_mcdi_tx_init(tx_queue);
 	if (rc)
 		goto fail;

@@ -2302,22 +2351,19 @@ static void efx_ef10_tx_init(struct efx_tx_queue *tx_queue)
 	tx_queue->buffer[0].flags = EFX_TX_BUF_OPTION;
 	tx_queue->insert_count = 1;
 	txd = efx_tx_desc(tx_queue, 0);
-	EFX_POPULATE_QWORD_5(*txd,
+	EFX_POPULATE_QWORD_7(*txd,
 			     ESF_DZ_TX_DESC_IS_OPT, true,
 			     ESF_DZ_TX_OPTION_TYPE,
 			     ESE_DZ_TX_OPTION_DESC_CRC_CSUM,
 			     ESF_DZ_TX_OPTION_UDP_TCP_CSUM, csum_offload,
-			     ESF_DZ_TX_OPTION_IP_CSUM, csum_offload,
+			     ESF_DZ_TX_OPTION_IP_CSUM, csum_offload && tx_queue->tso_version != 2,
+			     ESF_DZ_TX_OPTION_INNER_UDP_TCP_CSUM, inner_csum,
+			     ESF_DZ_TX_OPTION_INNER_IP_CSUM, inner_csum && tx_queue->tso_version != 2,
 			     ESF_DZ_TX_TIMESTAMP, tx_queue->timestamping);
 	tx_queue->write_count = 1;

-	if (tso_v2) {
-		tx_queue->handle_tso = efx_ef10_tx_tso_desc;
-		tx_queue->tso_version = 2;
-	} else if (nic_data->datapath_caps &
-			(1 << MC_CMD_GET_CAPABILITIES_OUT_TX_TSO_LBN)) {
-		tx_queue->tso_version = 1;
-	}
+	if (tx_queue->tso_version == 2 && efx_has_cap(efx, TX_TSO_V2_ENCAP))
+		tx_queue->tso_encap = true;

 	wmb();
 	efx_ef10_push_tx_desc(tx_queue, txd);
@@ -2880,7 +2926,7 @@ efx_ef10_handle_tx_event(struct efx_channel *channel, efx_qword_t *event)
 	/* Get the transmit queue */
 	tx_ev_q_label = EFX_QWORD_FIELD(*event, ESF_DZ_TX_QLABEL);
 	tx_queue = efx_channel_get_tx_queue(channel,
-					    tx_ev_q_label % EFX_TXQ_TYPES);
+					    tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL);

 	if (!tx_queue->timestamping) {
 		/* Transmit completion */

--- a/drivers/net/ethernet/sfc/ef100_tx.c
+++ b/drivers/net/ethernet/sfc/ef100_tx.c
@@ -37,7 +37,14 @@ void ef100_tx_init(struct efx_tx_queue *tx_queue)
 				    tx_queue->channel->channel -
 				    tx_queue->efx->tx_channel_offset);

-	if (efx_mcdi_tx_init(tx_queue, false))
+	/* This value is purely documentational; as EF100 never passes through
+	 * the switch statement in tx.c:__efx_enqueue_skb(), that switch does
+	 * not handle case 3.  EF100's TSOv3 descriptors are generated by
+	 * ef100_make_tso_desc().
+	 * Meanwhile, all efx_mcdi_tx_init() cares about is that it's not 2.
+	 */
+	tx_queue->tso_version = 3;
+	if (efx_mcdi_tx_init(tx_queue))
 		netdev_WARN(tx_queue->efx->net_dev,
 			    "failed to initialise TXQ %d\n", tx_queue->queue);
 }

--- a/drivers/net/ethernet/sfc/efx.c
+++ b/drivers/net/ethernet/sfc/efx.c
@@ -596,6 +596,7 @@ static const struct net_device_ops efx_netdev_ops = {
 	.ndo_set_mac_address	= efx_set_mac_address,
 	.ndo_set_rx_mode	= efx_set_rx_mode,
 	.ndo_set_features	= efx_set_features,
+	.ndo_features_check	= efx_features_check,
 	.ndo_vlan_rx_add_vid	= efx_vlan_rx_add_vid,
 	.ndo_vlan_rx_kill_vid	= efx_vlan_rx_kill_vid,
 #ifdef CONFIG_SFC_SRIOV

--- a/drivers/net/ethernet/sfc/efx_channels.c
+++ b/drivers/net/ethernet/sfc/efx_channels.c
@@ -151,7 +151,7 @@ static int efx_allocate_msix_channels(struct efx_nic *efx,
 	 */

 	n_xdp_tx = num_possible_cpus();
-	n_xdp_ev = DIV_ROUND_UP(n_xdp_tx, EFX_TXQ_TYPES);
+	n_xdp_ev = DIV_ROUND_UP(n_xdp_tx, EFX_MAX_TXQ_PER_CHANNEL);

 	vec_count = pci_msix_vec_count(efx->pci_dev);
 	if (vec_count < 0)
@@ -179,7 +179,7 @@ static int efx_allocate_msix_channels(struct efx_nic *efx,
 		efx->xdp_tx_queue_count = 0;
 	} else {
 		efx->n_xdp_channels = n_xdp_ev;
-		efx->xdp_tx_per_channel = EFX_TXQ_TYPES;
+		efx->xdp_tx_per_channel = EFX_MAX_TXQ_PER_CHANNEL;
 		efx->xdp_tx_queue_count = n_xdp_tx;
 		n_channels += n_xdp_ev;
 		netif_dbg(efx, drv, efx->net_dev,
@@ -520,7 +520,7 @@ static struct efx_channel *efx_alloc_channel(struct efx_nic *efx, int i)
 	channel->channel = i;
 	channel->type = &efx_default_channel_type;

-	for (j = 0; j < EFX_TXQ_TYPES; j++) {
+	for (j = 0; j < EFX_MAX_TXQ_PER_CHANNEL; j++) {
 		tx_queue = &channel->tx_queue[j];
 		tx_queue->efx = efx;
 		tx_queue->queue = -1;
@@ -594,7 +594,7 @@ struct efx_channel *efx_copy_channel(const struct efx_channel *old_channel)
 	channel->napi_str.state = 0;
 	memset(&channel->eventq, 0, sizeof(channel->eventq));

-	for (j = 0; j < EFX_TXQ_TYPES; j++) {
+	for (j = 0; j < EFX_MAX_TXQ_PER_CHANNEL; j++) {
 		tx_queue = &channel->tx_queue[j];
 		if (tx_queue->channel)
 			tx_queue->channel = channel;
@@ -894,7 +894,7 @@ int efx_set_channels(struct efx_nic *efx)
 						  xdp_queue_number, tx_queue->queue);
 					/* We may have a few left-over XDP TX
 					 * queues owing to xdp_tx_queue_count
-					 * not dividing evenly by EFX_TXQ_TYPES.
+					 * not dividing evenly by EFX_MAX_TXQ_PER_CHANNEL.
 					 * We still allocate and probe those
 					 * TXQs, but never use them.
 					 */

--- a/drivers/net/ethernet/sfc/efx_common.c
+++ b/drivers/net/ethernet/sfc/efx_common.c
@@ -11,6 +11,7 @@
 #include "net_driver.h"
 #include <linux/module.h>
 #include <linux/netdevice.h>
+#include <net/gre.h>
 #include "efx_common.h"
 #include "efx_channels.h"
 #include "efx.h"
@@ -1287,6 +1288,89 @@ const struct pci_error_handlers efx_err_handlers = {
 	.resume		= efx_io_resume,
 };

+/* Determine whether the NIC will be able to handle TX offloads for a given
+ * encapsulated packet.
+ */
+static bool efx_can_encap_offloads(struct efx_nic *efx, struct sk_buff *skb)
+{
+	struct gre_base_hdr *greh;
+	__be16 dst_port;
+	u8 ipproto;
+
+	/* Does the NIC support encap offloads?
+	 * If not, we should never get here, because we shouldn't have
+	 * advertised encap offload feature flags in the first place.
+	 */
+	if (WARN_ON_ONCE(!efx->type->udp_tnl_has_port))
+		return false;
+
+	/* Determine encapsulation protocol in use */
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		ipproto = ip_hdr(skb)->protocol;
+		break;
+	case htons(ETH_P_IPV6):
+		/* If there are extension headers, this will cause us to
+		 * think we can't offload something that we maybe could have.
+		 */
+		ipproto = ipv6_hdr(skb)->nexthdr;
+		break;
+	default:
+		/* Not IP, so can't offload it */
+		return false;
+	}
+	switch (ipproto) {
+	case IPPROTO_GRE:
+		/* We support NVGRE but not IP over GRE or random gretaps.
+		 * Specifically, the NIC will accept GRE as encapsulated if
+		 * the inner protocol is Ethernet, but only handle it
+		 * correctly if the GRE header is 8 bytes long.  Moreover,
+		 * it will not update the Checksum or Sequence Number fields
+		 * if they are present.  (The Routing Present flag,
+		 * GRE_ROUTING, cannot be set else the header would be more
+		 * than 8 bytes long; so we don't have to worry about it.)
+		 */
+		if (skb->inner_protocol_type != ENCAP_TYPE_ETHER)
+			return false;
+		if (ntohs(skb->inner_protocol) != ETH_P_TEB)
+			return false;
+		if (skb_inner_mac_header(skb) - skb_transport_header(skb) != 8)
+			return false;
+		greh = (struct gre_base_hdr *)skb_transport_header(skb);
+		return !(greh->flags & (GRE_CSUM | GRE_SEQ));
+	case IPPROTO_UDP:
+		/* If the port is registered for a UDP tunnel, we assume the
+		 * packet is for that tunnel, and the NIC will handle it as
+		 * such.  If not, the NIC won't know what to do with it.
+		 */
+		dst_port = udp_hdr(skb)->dest;
+		return efx->type->udp_tnl_has_port(efx, dst_port);
+	default:
+		return false;
+	}
+}
+
+netdev_features_t efx_features_check(struct sk_buff *skb, struct net_device *dev,
+				     netdev_features_t features)
+{
+	struct efx_nic *efx = netdev_priv(dev);
+
+	if (skb->encapsulation) {
+		if (features & NETIF_F_GSO_MASK)
+			/* Hardware can only do TSO with at most 208 bytes
+			 * of headers.
+			 */
+			if (skb_inner_transport_offset(skb) >
+			    EFX_TSO2_MAX_HDRLEN)
+				features &= ~(NETIF_F_GSO_MASK);
+		if (features & (NETIF_F_GSO_MASK | NETIF_F_CSUM_MASK))
+			if (!efx_can_encap_offloads(efx, skb))
+				features &= ~(NETIF_F_GSO_MASK |
+					      NETIF_F_CSUM_MASK);
+	}
+	return features;
+}
+
 int efx_get_phys_port_id(struct net_device *net_dev,
 			 struct netdev_phys_item_id *ppid)
 {

--- a/drivers/net/ethernet/sfc/efx_common.h
+++ b/drivers/net/ethernet/sfc/efx_common.h
@@ -105,6 +105,9 @@ int efx_change_mtu(struct net_device *net_dev, int new_mtu);

 extern const struct pci_error_handlers efx_err_handlers;

+netdev_features_t efx_features_check(struct sk_buff *skb, struct net_device *dev,
+				     netdev_features_t features);
+
 int efx_get_phys_port_id(struct net_device *net_dev,
 			 struct netdev_phys_item_id *ppid);


--- a/drivers/net/ethernet/sfc/ethtool_common.c
+++ b/drivers/net/ethernet/sfc/ethtool_common.c
@@ -407,7 +407,7 @@ static size_t efx_describe_per_queue_stats(struct efx_nic *efx, u8 *strings)
 				snprintf(strings, ETH_GSTRING_LEN,
 					 "tx-%u.tx_packets",
 					 channel->tx_queue[0].queue /
-					 EFX_TXQ_TYPES);
+					 EFX_MAX_TXQ_PER_CHANNEL);

 				strings += ETH_GSTRING_LEN;
 			}

--- a/drivers/net/ethernet/sfc/farch.c
+++ b/drivers/net/ethernet/sfc/farch.c
@@ -372,6 +372,8 @@ int efx_farch_tx_probe(struct efx_tx_queue *tx_queue)
 	struct efx_nic *efx = tx_queue->efx;
 	unsigned entries;

+	tx_queue->type = ((tx_queue->label & 1) ? EFX_TXQ_TYPE_OUTER_CSUM : 0) |
+			 ((tx_queue->label & 2) ? EFX_TXQ_TYPE_HIGHPRI : 0);
 	entries = tx_queue->ptr_mask + 1;
 	return efx_alloc_special_buffer(efx, &tx_queue->txd,
 					entries * sizeof(efx_qword_t));
@@ -379,7 +381,7 @@ int efx_farch_tx_probe(struct efx_tx_queue *tx_queue)

 void efx_farch_tx_init(struct efx_tx_queue *tx_queue)
 {
-	int csum = tx_queue->label & EFX_TXQ_TYPE_OFFLOAD;
+	int csum = tx_queue->type & EFX_TXQ_TYPE_OUTER_CSUM;
 	struct efx_nic *efx = tx_queue->efx;
 	efx_oword_t reg;

@@ -409,10 +411,12 @@ void efx_farch_tx_init(struct efx_tx_queue *tx_queue)

 	EFX_POPULATE_OWORD_1(reg,
 			     FRF_BZ_TX_PACE,
-			     (tx_queue->label & EFX_TXQ_TYPE_HIGHPRI) ?
+			     (tx_queue->type & EFX_TXQ_TYPE_HIGHPRI) ?
 			     FFE_BZ_TX_PACE_OFF :
 			     FFE_BZ_TX_PACE_RESERVED);
 	efx_writeo_table(efx, &reg, FR_BZ_TX_PACE_TBL, tx_queue->queue);
+
+	tx_queue->tso_version = 1;
 }

 static void efx_farch_flush_tx_queue(struct efx_tx_queue *tx_queue)
@@ -832,13 +836,13 @@ efx_farch_handle_tx_event(struct efx_channel *channel, efx_qword_t *event)
 		tx_ev_desc_ptr = EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_DESC_PTR);
 		tx_ev_q_label = EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_Q_LABEL);
 		tx_queue = efx_channel_get_tx_queue(
-			channel, tx_ev_q_label % EFX_TXQ_TYPES);
+			channel, tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL);
 		efx_xmit_done(tx_queue, tx_ev_desc_ptr);
 	} else if (EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_WQ_FF_FULL)) {
 		/* Rewrite the FIFO write pointer */
 		tx_ev_q_label = EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_Q_LABEL);
 		tx_queue = efx_channel_get_tx_queue(
-			channel, tx_ev_q_label % EFX_TXQ_TYPES);
+			channel, tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL);

 		netif_tx_lock(efx->net_dev);
 		efx_farch_notify_tx_desc(tx_queue);
@@ -1080,9 +1084,9 @@ efx_farch_handle_tx_flush_done(struct efx_nic *efx, efx_qword_t *event)
 	int qid;

 	qid = EFX_QWORD_FIELD(*event, FSF_AZ_DRIVER_EV_SUBDATA);
-	if (qid < EFX_TXQ_TYPES * (efx->n_tx_channels + efx->n_extra_tx_channels)) {
-		tx_queue = efx_get_tx_queue(efx, qid / EFX_TXQ_TYPES,
-					    qid % EFX_TXQ_TYPES);
+	if (qid < EFX_MAX_TXQ_PER_CHANNEL * (efx->n_tx_channels + efx->n_extra_tx_channels)) {
+		tx_queue = efx_get_tx_queue(efx, qid / EFX_MAX_TXQ_PER_CHANNEL,
+					    qid % EFX_MAX_TXQ_PER_CHANNEL);
 		if (atomic_cmpxchg(&tx_queue->flush_outstanding, 1, 0)) {
 			efx_farch_magic_event(tx_queue->channel,
 					      EFX_CHANNEL_MAGIC_TX_DRAIN(tx_queue));
@@ -1675,10 +1679,10 @@ void efx_farch_dimension_resources(struct efx_nic *efx, unsigned sram_lim_qw)
 	 * and the descriptor caches for those channels.
 	 */
 	buftbl_min = ((efx->n_rx_channels * EFX_MAX_DMAQ_SIZE +
-		       total_tx_channels * EFX_TXQ_TYPES * EFX_MAX_DMAQ_SIZE +
+		       total_tx_channels * EFX_MAX_TXQ_PER_CHANNEL * EFX_MAX_DMAQ_SIZE +
 		       efx->n_channels * EFX_MAX_EVQ_SIZE)
 		      * sizeof(efx_qword_t) / EFX_BUF_SIZE);
-	vi_count = max(efx->n_channels, total_tx_channels * EFX_TXQ_TYPES);
+	vi_count = max(efx->n_channels, total_tx_channels * EFX_MAX_TXQ_PER_CHANNEL);

 #ifdef CONFIG_SFC_SRIOV
 	if (efx->type->sriov_wanted) {

--- a/drivers/net/ethernet/sfc/mcdi_functions.c
+++ b/drivers/net/ethernet/sfc/mcdi_functions.c
@@ -160,11 +160,12 @@ void efx_mcdi_ev_fini(struct efx_channel *channel)
 			       outbuf, outlen, rc);
 }

-int efx_mcdi_tx_init(struct efx_tx_queue *tx_queue, bool tso_v2)
+int efx_mcdi_tx_init(struct efx_tx_queue *tx_queue)
 {
 	MCDI_DECLARE_BUF(inbuf, MC_CMD_INIT_TXQ_IN_LEN(EFX_MAX_DMAQ_SIZE * 8 /
 						       EFX_BUF_SIZE));
-	bool csum_offload = tx_queue->label & EFX_TXQ_TYPE_OFFLOAD;
+	bool csum_offload = tx_queue->type & EFX_TXQ_TYPE_OUTER_CSUM;
+	bool inner_csum = tx_queue->type & EFX_TXQ_TYPE_INNER_CSUM;
 	size_t entries = tx_queue->txd.buf.len / EFX_BUF_SIZE;
 	struct efx_channel *channel = tx_queue->channel;
 	struct efx_nic *efx = tx_queue->efx;
@@ -194,22 +195,31 @@ int efx_mcdi_tx_init(struct efx_tx_queue *tx_queue, bool tso_v2)
 	inlen = MC_CMD_INIT_TXQ_IN_LEN(entries);

 	do {
-		MCDI_POPULATE_DWORD_4(inbuf, INIT_TXQ_IN_FLAGS,
+		bool tso_v2 = tx_queue->tso_version == 2;
+
+		/* TSOv2 implies IP header checksum offload for TSO frames,
+		 * so we can safely disable IP header checksum offload for
+		 * everything else.  If we don't have TSOv2, then we have to
+		 * enable IP header checksum offload, which is strictly
+		 * incorrect but better than breaking TSO.
+		 */
+		MCDI_POPULATE_DWORD_6(inbuf, INIT_TXQ_IN_FLAGS,
 				/* This flag was removed from mcdi_pcol.h for
 				 * the non-_EXT version of INIT_TXQ.  However,
 				 * firmware still honours it.
 				 */
 				INIT_TXQ_EXT_IN_FLAG_TSOV2_EN, tso_v2,
-				INIT_TXQ_IN_FLAG_IP_CSUM_DIS, !csum_offload,
+				INIT_TXQ_IN_FLAG_IP_CSUM_DIS, !(csum_offload && tso_v2),
 				INIT_TXQ_IN_FLAG_TCP_CSUM_DIS, !csum_offload,
-				INIT_TXQ_EXT_IN_FLAG_TIMESTAMP,
-						tx_queue->timestamping);
+				INIT_TXQ_EXT_IN_FLAG_TIMESTAMP, tx_queue->timestamping,
+				INIT_TXQ_IN_FLAG_INNER_IP_CSUM_EN, inner_csum && !tso_v2,
+				INIT_TXQ_IN_FLAG_INNER_TCP_CSUM_EN, inner_csum);

 		rc = efx_mcdi_rpc_quiet(efx, MC_CMD_INIT_TXQ, inbuf, inlen,
 					NULL, 0, NULL);
 		if (rc == -ENOSPC && tso_v2) {
 			/* Retry without TSOv2 if we're short on contexts. */
-			tso_v2 = false;
+			tx_queue->tso_version = 0;
 			netif_warn(efx, probe, efx->net_dev,
 				   "TSOv2 context not available to segment in "
 				   "hardware. TCP performance may be reduced.\n"

--- a/drivers/net/ethernet/sfc/mcdi_functions.h
+++ b/drivers/net/ethernet/sfc/mcdi_functions.h
@@ -19,7 +19,7 @@ int efx_mcdi_ev_probe(struct efx_channel *channel);
 int efx_mcdi_ev_init(struct efx_channel *channel, bool v1_cut_thru, bool v2);
 void efx_mcdi_ev_remove(struct efx_channel *channel);
 void efx_mcdi_ev_fini(struct efx_channel *channel);
-int efx_mcdi_tx_init(struct efx_tx_queue *tx_queue, bool tso_v2);
+int efx_mcdi_tx_init(struct efx_tx_queue *tx_queue);
 void efx_mcdi_tx_remove(struct efx_tx_queue *tx_queue);
 void efx_mcdi_tx_fini(struct efx_tx_queue *tx_queue);
 int efx_mcdi_rx_probe(struct efx_rx_queue *rx_queue);

--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -63,10 +63,13 @@
 * queues. */
 #define EFX_MAX_TX_TC		2
 #define EFX_MAX_CORE_TX_QUEUES	(EFX_MAX_TX_TC * EFX_MAX_CHANNELS)
-#define EFX_TXQ_TYPE_OFFLOAD	1	/* flag */
-#define EFX_TXQ_TYPE_HIGHPRI	2	/* flag */
-#define EFX_TXQ_TYPES		4
-#define EFX_MAX_TX_QUEUES	(EFX_TXQ_TYPES * EFX_MAX_CHANNELS)
+#define EFX_TXQ_TYPE_OUTER_CSUM	1	/* Outer checksum offload */
+#define EFX_TXQ_TYPE_INNER_CSUM	2	/* Inner checksum offload */
+#define EFX_TXQ_TYPE_HIGHPRI	4	/* High-priority (for TC) */
+#define EFX_TXQ_TYPES		8
+/* HIGHPRI is Siena-only, and INNER_CSUM is EF10, so no need for both */
+#define EFX_MAX_TXQ_PER_CHANNEL	4
+#define EFX_MAX_TX_QUEUES	(EFX_MAX_TXQ_PER_CHANNEL * EFX_MAX_CHANNELS)

 /* Maximum possible MTU the driver supports */
 #define EFX_MAX_MTU (9 * 1024)
@@ -74,6 +77,9 @@
 /* Minimum MTU, from RFC791 (IP) */
 #define EFX_MIN_MTU 68

+/* Maximum total header length for TSOv2 */
+#define EFX_TSO2_MAX_HDRLEN	208
+
 /* Size of an RX scatter buffer.  Small enough to pack 2 into a 4K page,
 * and should be a multiple of the cache line size.
 */
@@ -190,7 +196,9 @@ struct efx_tx_buffer {
 * @queue: DMA queue number
 * @label: Label for TX completion events.
 *	Is our index within @channel->tx_queue array.
+ * @type: configuration type of this TX queue.  A bitmask of %EFX_TXQ_TYPE_* flags.
 * @tso_version: Version of TSO in use for this queue.
+ * @tso_encap: Is encapsulated TSO supported? Supported in TSOv2 on 8000 series.
 * @channel: The associated channel
 * @core_txq: The networking core TX queue structure
 * @buffer: The software buffer ring
@@ -204,8 +212,6 @@ struct efx_tx_buffer {
 * @initialised: Has hardware queue been initialised?
 * @timestamping: Is timestamping enabled for this channel?
 * @xdp_tx: Is this an XDP tx queue?
- * @handle_tso: TSO xmit preparation handler.  Sets up the TSO metadata and
- *	may also map tx data, depending on the nature of the TSO implementation.
 * @read_count: Current read pointer.
 *	This is the number of buffers that have been removed from both rings.
 * @old_write_count: The value of @write_count when last checked.
@@ -254,7 +260,9 @@ struct efx_tx_queue {
 	struct efx_nic *efx ____cacheline_aligned_in_smp;
 	unsigned int queue;
 	unsigned int label;
+	unsigned int type;
 	unsigned int tso_version;
+	bool tso_encap;
 	struct efx_channel *channel;
 	struct netdev_queue *core_txq;
 	struct efx_tx_buffer *buffer;
@@ -267,9 +275,6 @@ struct efx_tx_queue {
 	bool timestamping;
 	bool xdp_tx;

-	/* Function pointers used in the fast path. */
-	int (*handle_tso)(struct efx_tx_queue*, struct sk_buff*, bool *);
-
 	/* Members used mainly on the completion path */
 	unsigned int read_count ____cacheline_aligned_in_smp;
 	unsigned int old_write_count;
@@ -479,6 +484,7 @@ enum efx_sync_events_state {
 * @rx_list: list of SKBs from current RX, awaiting processing
 * @rx_queue: RX queue for this channel
 * @tx_queue: TX queues for this channel
+ * @tx_queue_by_type: pointers into @tx_queue, or %NULL, indexed by txq type
 * @sync_events_state: Current state of sync events on this channel
 * @sync_timestamp_major: Major part of the last ptp sync event
 * @sync_timestamp_minor: Minor part of the last ptp sync event
@@ -540,7 +546,8 @@ struct efx_channel {
 	struct list_head *rx_list;

 	struct efx_rx_queue rx_queue;
-	struct efx_tx_queue tx_queue[EFX_TXQ_TYPES];
+	struct efx_tx_queue tx_queue[EFX_MAX_TXQ_PER_CHANNEL];
+	struct efx_tx_queue *tx_queue_by_type[EFX_TXQ_TYPES];

 	enum efx_sync_events_state sync_events_state;
 	u32 sync_timestamp_major;
@@ -1200,7 +1207,7 @@ struct efx_udp_tunnel {
 *	a pointer to the &struct efx_msi_context for the channel.
 * @irq_handle_legacy: Handle legacy interrupt.  The @dev_id argument
 *	is a pointer to the &struct efx_nic.
- * @tx_probe: Allocate resources for TX queue
+ * @tx_probe: Allocate resources for TX queue (and select TXQ type)
 * @tx_init: Initialise TX queue on the NIC
 * @tx_remove: Free resources for TX queue
 * @tx_write: Write TX descriptors and doorbell
@@ -1495,14 +1502,6 @@ efx_get_tx_channel(struct efx_nic *efx, unsigned int index)
 	return efx->channel[efx->tx_channel_offset + index];
 }

-static inline struct efx_tx_queue *
-efx_get_tx_queue(struct efx_nic *efx, unsigned index, unsigned type)
-{
-	EFX_WARN_ON_ONCE_PARANOID(index >= efx->n_tx_channels ||
-				  type >= efx->tx_queues_per_channel);
-	return &efx->channel[efx->tx_channel_offset + index]->tx_queue[type];
-}
-
 static inline struct efx_channel *
 efx_get_xdp_channel(struct efx_nic *efx, unsigned int index)
 {
@@ -1529,10 +1528,18 @@ static inline unsigned int efx_channel_num_tx_queues(struct efx_channel *channel
 }

 static inline struct efx_tx_queue *
-efx_channel_get_tx_queue(struct efx_channel *channel, unsigned type)
+efx_channel_get_tx_queue(struct efx_channel *channel, unsigned int type)
 {
-	EFX_WARN_ON_ONCE_PARANOID(type >= efx_channel_num_tx_queues(channel));
-	return &channel->tx_queue[type];
+	EFX_WARN_ON_ONCE_PARANOID(type >= EFX_TXQ_TYPES);
+	return channel->tx_queue_by_type[type];
+}
+
+static inline struct efx_tx_queue *
+efx_get_tx_queue(struct efx_nic *efx, unsigned int index, unsigned int type)
+{
+	struct efx_channel *channel = efx_get_tx_channel(efx, index);
+
+	return efx_channel_get_tx_queue(channel, type);
 }

 /* Iterate over all TX queues belonging to a channel */

--- a/drivers/net/ethernet/sfc/nic.h
+++ b/drivers/net/ethernet/sfc/nic.h
@@ -297,6 +297,10 @@ struct efx_ef10_nic_data {
 	u64 licensed_features;
 };

+/* TSOv2 */
+int efx_ef10_tx_tso_desc(struct efx_tx_queue *tx_queue, struct sk_buff *skb,
+			 bool *data_mapped);
+
 int efx_init_sriov(void);
 void efx_fini_sriov(void);


--- a/drivers/net/ethernet/sfc/ptp.c
+++ b/drivers/net/ethernet/sfc/ptp.c
@@ -43,6 +43,7 @@
 #include "mcdi_pcol.h"
 #include "io.h"
 #include "farch_regs.h"
+#include "tx.h"
 #include "nic.h" /* indirectly includes ptp.h */

 /* Maximum number of events expected to make up a PTP event */
@@ -1082,10 +1083,10 @@ static int efx_ptp_synchronize(struct efx_nic *efx, unsigned int num_readings)
 static void efx_ptp_xmit_skb_queue(struct efx_nic *efx, struct sk_buff *skb)
 {
 	struct efx_ptp_data *ptp_data = efx->ptp_data;
+	u8 type = efx_tx_csum_type_skb(skb);
 	struct efx_tx_queue *tx_queue;
-	u8 type = skb->ip_summed == CHECKSUM_PARTIAL ? EFX_TXQ_TYPE_OFFLOAD : 0;

-	tx_queue = &ptp_data->channel->tx_queue[type];
+	tx_queue = efx_channel_get_tx_queue(ptp_data->channel, type);
 	if (tx_queue && tx_queue->timestamping) {
 		efx_enqueue_skb(tx_queue, skb);
 	} else {

--- a/drivers/net/ethernet/sfc/selftest.c
+++ b/drivers/net/ethernet/sfc/selftest.c
@@ -656,8 +656,8 @@ static int efx_test_loopbacks(struct efx_nic *efx, struct efx_self_tests *tests,

 		/* Test all enabled types of TX queue */
 		efx_for_each_channel_tx_queue(tx_queue, channel) {
-			state->offload_csum = (tx_queue->label &
-					       EFX_TXQ_TYPE_OFFLOAD);
+			state->offload_csum = (tx_queue->type &
+					       EFX_TXQ_TYPE_OUTER_CSUM);
 			rc = efx_test_loopback(tx_queue,
 					       &tests->loopback[mode]);
 			if (rc)

--- a/drivers/net/ethernet/sfc/selftest.h
+++ b/drivers/net/ethernet/sfc/selftest.h
@@ -15,8 +15,8 @@
 */

 struct efx_loopback_self_tests {
-	int tx_sent[EFX_TXQ_TYPES];
-	int tx_done[EFX_TXQ_TYPES];
+	int tx_sent[EFX_MAX_TXQ_PER_CHANNEL];
+	int tx_done[EFX_MAX_TXQ_PER_CHANNEL];
 	int rx_good;
 	int rx_bad;
 };

--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -338,8 +338,18 @@ netdev_tx_t __efx_enqueue_skb(struct efx_tx_queue *tx_queue, struct sk_buff *skb
 	 * size limit.
 	 */
 	if (segments) {
-		EFX_WARN_ON_ONCE_PARANOID(!tx_queue->handle_tso);
-		rc = tx_queue->handle_tso(tx_queue, skb, &data_mapped);
+		switch (tx_queue->tso_version) {
+		case 1:
+			rc = efx_enqueue_skb_tso(tx_queue, skb, &data_mapped);
+			break;
+		case 2:
+			rc = efx_ef10_tx_tso_desc(tx_queue, skb, &data_mapped);
+			break;
+		case 0: /* No TSO on this queue, SW fallback needed */
+		default:
+			rc = -EINVAL;
+			break;
+		}
 		if (rc == -EINVAL) {
 			rc = efx_tx_tso_fallback(tx_queue, skb);
 			tx_queue->tso_fallbacks++;
@@ -491,13 +501,10 @@ int efx_xdp_tx_buffers(struct efx_nic *efx, int n, struct xdp_frame **xdpfs,
 }

 /* Initiate a packet transmission.  We use one channel per CPU
- * (sharing when we have more CPUs than channels).  On Falcon, the TX
- * completion events will be directed back to the CPU that transmitted
- * the packet, which should be cache-efficient.
+ * (sharing when we have more CPUs than channels).
 *
 * Context: non-blocking.
- * Note that returning anything other than NETDEV_TX_OK will cause the
- * OS to free the skb.
+ * Should always return NETDEV_TX_OK and consume the skb.
 */
 netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb,
 				struct net_device *net_dev)
@@ -509,7 +516,7 @@ netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb,
 	EFX_WARN_ON_PARANOID(!netif_device_present(net_dev));

 	index = skb_get_queue_mapping(skb);
-	type = skb->ip_summed == CHECKSUM_PARTIAL ? EFX_TXQ_TYPE_OFFLOAD : 0;
+	type = efx_tx_csum_type_skb(skb);
 	if (index >= efx->n_tx_channels) {
 		index -= efx->n_tx_channels;
 		type |= EFX_TXQ_TYPE_HIGHPRI;
@@ -527,6 +534,20 @@ netdev_tx_t efx_hard_start_xmit(struct sk_buff *skb,
 	}

 	tx_queue = efx_get_tx_queue(efx, index, type);
+	if (WARN_ON_ONCE(!tx_queue)) {
+		/* We don't have a TXQ of the right type.
+		 * This should never happen, as we don't advertise offload
+		 * features unless we can support them.
+		 */
+		dev_kfree_skb_any(skb);
+		/* If we're not expecting another transmit and we had something to push
+		 * on this queue or a partner queue then we need to push here to get the
+		 * previous packets out.
+		 */
+		if (!netdev_xmit_more())
+			efx_tx_send_pending(tx_queue->channel);
+		return NETDEV_TX_OK;
+	}

 	return __efx_enqueue_skb(tx_queue, skb);
 }
@@ -577,7 +598,7 @@ void efx_init_tx_queue_core_txq(struct efx_tx_queue *tx_queue)
 	tx_queue->core_txq =
 		netdev_get_tx_queue(efx->net_dev,
 				    tx_queue->channel->channel +
-				    ((tx_queue->label & EFX_TXQ_TYPE_HIGHPRI) ?
+				    ((tx_queue->type & EFX_TXQ_TYPE_HIGHPRI) ?
 				     efx->n_tx_channels : 0));
 }


--- a/drivers/net/ethernet/sfc/tx.h
+++ b/drivers/net/ethernet/sfc/tx.h
@@ -18,4 +18,30 @@ unsigned int efx_tx_limit_len(struct efx_tx_queue *tx_queue,
 u8 *efx_tx_get_copy_buffer_limited(struct efx_tx_queue *tx_queue,
 				   struct efx_tx_buffer *buffer, size_t len);

+/* What TXQ type will satisfy the checksum offloads required for this skb? */
+static inline unsigned int efx_tx_csum_type_skb(struct sk_buff *skb)
+{
+	if (skb->ip_summed != CHECKSUM_PARTIAL)
+		return 0; /* no checksum offload */
+
+	if (skb->encapsulation &&
+	    skb_checksum_start_offset(skb) == skb_inner_transport_offset(skb)) {
+		/* we only advertise features for IPv4 and IPv6 checksums on
+		 * encapsulated packets, so if the checksum is for the inner
+		 * packet, it must be one of them; no further checking required.
+		 */
+
+		/* Do we also need to offload the outer header checksum? */
+		if (skb_shinfo(skb)->gso_segs > 1 &&
+		    !(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL) &&
+		    (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM))
+			return EFX_TXQ_TYPE_OUTER_CSUM | EFX_TXQ_TYPE_INNER_CSUM;
+		return EFX_TXQ_TYPE_INNER_CSUM;
+	}
+
+	/* similarly, we only advertise features for IPv4 and IPv6 checksums,
+	 * so it must be one of them. No need for further checks.
+	 */
+	return EFX_TXQ_TYPE_OUTER_CSUM;
+}
 #endif /* EFX_TX_H */
--- a/drivers/net/ethernet/sfc/tx_common.c
+++ b/drivers/net/ethernet/sfc/tx_common.c
@@ -47,11 +47,12 @@ int efx_probe_tx_queue(struct efx_tx_queue *tx_queue)
 		goto fail1;
 	}

-	/* Allocate hardware ring */
+	/* Allocate hardware ring, determine TXQ type */
 	rc = efx_nic_probe_tx(tx_queue);
 	if (rc)
 		goto fail2;

+	tx_queue->channel->tx_queue_by_type[tx_queue->type] = tx_queue;
 	return 0;

 fail2:
@@ -85,11 +86,7 @@ void efx_init_tx_queue(struct efx_tx_queue *tx_queue)
 	tx_queue->completed_timestamp_minor = 0;

 	tx_queue->xdp_tx = efx_channel_is_xdp_tx(tx_queue->channel);
-
-	/* Set up default function pointers. These may get replaced by
-	 * efx_nic_init_tx() based off NIC/queue capabilities.
-	 */
-	tx_queue->handle_tso = efx_enqueue_skb_tso;
+	tx_queue->tso_version = 0;

 	/* Set up TX descriptor ring */
 	efx_nic_init_tx(tx_queue);
@@ -141,6 +138,7 @@ void efx_remove_tx_queue(struct efx_tx_queue *tx_queue)

 	kfree(tx_queue->buffer);
 	tx_queue->buffer = NULL;
+	tx_queue->channel->tx_queue_by_type[tx_queue->type] = NULL;
 }

 void efx_dequeue_buffer(struct efx_tx_queue *tx_queue,