Commit fddb8872 authored by David S. Miller's avatar David S. Miller

Merge branch 'sw_tso'

Ezequiel Garcia says:

====================
net: Introduce a software TSO helper API

Here's a first proposal for a generic software TSO helper API, following
David's suggestion.

There are at least two drivers that currently implement some form of software
TSO: Solarflare network driver (drivers/net/ethernet/sfc) and Tilera GX
network driver (drivers/net/ethernet/tile/tilegx.c).

The rationale behind adding a generic API is to provide a boiler plate with the
segmentation and other common tasks, making this support easier to add in other
drivers.

When designing the API, I've considered mainly two design choices:

  1. Implement a series of callbacks that each driver would implement
     and the net core code would call to fill in descriptors and egress
     that data.

  2. Implement an API for drivers to use in a driver's specific tx_tso
     function. This functions would exhaust a sk_buff payload, and use the
     API as helper for building the headers and segmented data.

I've chosen (2), to avoid function pointers (which was Willy's concern) and
because it seemed less fragile. Of course, this is argueable.

The API is by no means complete, and lacks some features, however it allows
to support TSO in mv643xx_eth and mvneta network drivers with some very
good performance results. I've added this support as an example of the API
in action.

In particular the following needs some revisiting:

  1. IPv6 support is lacking.

  2. The required descriptor counting needs some verification. The current
     implementation might be too "sketchy". The tilegx one can be a good
     starting point.

  3. The implemenation assumes the hardware can compute the TCP and IP
     checksums for the built headers. However, some controllers may need
     some initial calculation (such as tilegx, for instance).

Despite this, I hope this proposal is good enough to trigger some discussion
and to check if I'm on the right track. Feedback is much appreciated!
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 8af750d7 3ae8f4e0
This diff is collapsed.
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include <net/ip.h> #include <net/ip.h>
#include <net/ipv6.h> #include <net/ipv6.h>
#include <linux/io.h> #include <linux/io.h>
#include <net/tso.h>
#include <linux/of.h> #include <linux/of.h>
#include <linux/of_irq.h> #include <linux/of_irq.h>
#include <linux/of_mdio.h> #include <linux/of_mdio.h>
...@@ -244,6 +245,9 @@ ...@@ -244,6 +245,9 @@
#define MVNETA_TX_MTU_MAX 0x3ffff #define MVNETA_TX_MTU_MAX 0x3ffff
/* TSO header size */
#define TSO_HEADER_SIZE 128
/* Max number of Rx descriptors */ /* Max number of Rx descriptors */
#define MVNETA_MAX_RXD 128 #define MVNETA_MAX_RXD 128
...@@ -413,6 +417,12 @@ struct mvneta_tx_queue { ...@@ -413,6 +417,12 @@ struct mvneta_tx_queue {
/* Index of the next TX DMA descriptor to process */ /* Index of the next TX DMA descriptor to process */
int next_desc_to_proc; int next_desc_to_proc;
/* DMA buffers for TSO headers */
char *tso_hdrs;
/* DMA address of TSO headers */
dma_addr_t tso_hdrs_phys;
}; };
struct mvneta_rx_queue { struct mvneta_rx_queue {
...@@ -1519,6 +1529,126 @@ static int mvneta_rx(struct mvneta_port *pp, int rx_todo, ...@@ -1519,6 +1529,126 @@ static int mvneta_rx(struct mvneta_port *pp, int rx_todo,
return rx_done; return rx_done;
} }
static inline void
mvneta_tso_put_hdr(struct sk_buff *skb,
struct mvneta_port *pp, struct mvneta_tx_queue *txq)
{
struct mvneta_tx_desc *tx_desc;
int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
txq->tx_skb[txq->txq_put_index] = NULL;
tx_desc = mvneta_txq_next_desc_get(txq);
tx_desc->data_size = hdr_len;
tx_desc->command = mvneta_skb_tx_csum(pp, skb);
tx_desc->command |= MVNETA_TXD_F_DESC;
tx_desc->buf_phys_addr = txq->tso_hdrs_phys +
txq->txq_put_index * TSO_HEADER_SIZE;
mvneta_txq_inc_put(txq);
}
static inline int
mvneta_tso_put_data(struct net_device *dev, struct mvneta_tx_queue *txq,
struct sk_buff *skb, char *data, int size,
bool last_tcp, bool is_last)
{
struct mvneta_tx_desc *tx_desc;
tx_desc = mvneta_txq_next_desc_get(txq);
tx_desc->data_size = size;
tx_desc->buf_phys_addr = dma_map_single(dev->dev.parent, data,
size, DMA_TO_DEVICE);
if (unlikely(dma_mapping_error(dev->dev.parent,
tx_desc->buf_phys_addr))) {
mvneta_txq_desc_put(txq);
return -ENOMEM;
}
tx_desc->command = 0;
txq->tx_skb[txq->txq_put_index] = NULL;
if (last_tcp) {
/* last descriptor in the TCP packet */
tx_desc->command = MVNETA_TXD_L_DESC;
/* last descriptor in SKB */
if (is_last)
txq->tx_skb[txq->txq_put_index] = skb;
}
mvneta_txq_inc_put(txq);
return 0;
}
static int mvneta_tx_tso(struct sk_buff *skb, struct net_device *dev,
struct mvneta_tx_queue *txq)
{
int total_len, data_left;
int desc_count = 0;
struct mvneta_port *pp = netdev_priv(dev);
struct tso_t tso;
int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
int i;
/* Count needed descriptors */
if ((txq->count + tso_count_descs(skb)) >= txq->size)
return 0;
if (skb_headlen(skb) < (skb_transport_offset(skb) + tcp_hdrlen(skb))) {
pr_info("*** Is this even possible???!?!?\n");
return 0;
}
/* Initialize the TSO handler, and prepare the first payload */
tso_start(skb, &tso);
total_len = skb->len - hdr_len;
while (total_len > 0) {
char *hdr;
data_left = min_t(int, skb_shinfo(skb)->gso_size, total_len);
total_len -= data_left;
desc_count++;
/* prepare packet headers: MAC + IP + TCP */
hdr = txq->tso_hdrs + txq->txq_put_index * TSO_HEADER_SIZE;
tso_build_hdr(skb, hdr, &tso, data_left, total_len == 0);
mvneta_tso_put_hdr(skb, pp, txq);
while (data_left > 0) {
int size;
desc_count++;
size = min_t(int, tso.size, data_left);
if (mvneta_tso_put_data(dev, txq, skb,
tso.data, size,
size == data_left,
total_len == 0))
goto err_release;
data_left -= size;
tso_build_data(skb, &tso, size);
}
}
return desc_count;
err_release:
/* Release all used data descriptors; header descriptors must not
* be DMA-unmapped.
*/
for (i = desc_count - 1; i >= 0; i--) {
struct mvneta_tx_desc *tx_desc = txq->descs + i;
if (!(tx_desc->command & MVNETA_TXD_F_DESC))
dma_unmap_single(pp->dev->dev.parent,
tx_desc->buf_phys_addr,
tx_desc->data_size,
DMA_TO_DEVICE);
mvneta_txq_desc_put(txq);
}
return 0;
}
/* Handle tx fragmentation processing */ /* Handle tx fragmentation processing */
static int mvneta_tx_frag_process(struct mvneta_port *pp, struct sk_buff *skb, static int mvneta_tx_frag_process(struct mvneta_port *pp, struct sk_buff *skb,
struct mvneta_tx_queue *txq) struct mvneta_tx_queue *txq)
...@@ -1584,15 +1714,18 @@ static int mvneta_tx(struct sk_buff *skb, struct net_device *dev) ...@@ -1584,15 +1714,18 @@ static int mvneta_tx(struct sk_buff *skb, struct net_device *dev)
u16 txq_id = skb_get_queue_mapping(skb); u16 txq_id = skb_get_queue_mapping(skb);
struct mvneta_tx_queue *txq = &pp->txqs[txq_id]; struct mvneta_tx_queue *txq = &pp->txqs[txq_id];
struct mvneta_tx_desc *tx_desc; struct mvneta_tx_desc *tx_desc;
struct netdev_queue *nq;
int frags = 0; int frags = 0;
u32 tx_cmd; u32 tx_cmd;
if (!netif_running(dev)) if (!netif_running(dev))
goto out; goto out;
if (skb_is_gso(skb)) {
frags = mvneta_tx_tso(skb, dev, txq);
goto out;
}
frags = skb_shinfo(skb)->nr_frags + 1; frags = skb_shinfo(skb)->nr_frags + 1;
nq = netdev_get_tx_queue(dev, txq_id);
/* Get a descriptor for the first part of the packet */ /* Get a descriptor for the first part of the packet */
tx_desc = mvneta_txq_next_desc_get(txq); tx_desc = mvneta_txq_next_desc_get(txq);
...@@ -1635,16 +1768,17 @@ static int mvneta_tx(struct sk_buff *skb, struct net_device *dev) ...@@ -1635,16 +1768,17 @@ static int mvneta_tx(struct sk_buff *skb, struct net_device *dev)
} }
} }
out:
if (frags > 0) {
struct mvneta_pcpu_stats *stats = this_cpu_ptr(pp->stats);
struct netdev_queue *nq = netdev_get_tx_queue(dev, txq_id);
txq->count += frags; txq->count += frags;
mvneta_txq_pend_desc_add(pp, txq, frags); mvneta_txq_pend_desc_add(pp, txq, frags);
if (txq->size - txq->count < MAX_SKB_FRAGS + 1) if (txq->size - txq->count < MAX_SKB_FRAGS + 1)
netif_tx_stop_queue(nq); netif_tx_stop_queue(nq);
out:
if (frags > 0) {
struct mvneta_pcpu_stats *stats = this_cpu_ptr(pp->stats);
u64_stats_update_begin(&stats->syncp); u64_stats_update_begin(&stats->syncp);
stats->tx_packets++; stats->tx_packets++;
stats->tx_bytes += skb->len; stats->tx_bytes += skb->len;
...@@ -2109,6 +2243,18 @@ static int mvneta_txq_init(struct mvneta_port *pp, ...@@ -2109,6 +2243,18 @@ static int mvneta_txq_init(struct mvneta_port *pp,
txq->descs, txq->descs_phys); txq->descs, txq->descs_phys);
return -ENOMEM; return -ENOMEM;
} }
/* Allocate DMA buffers for TSO MAC/IP/TCP headers */
txq->tso_hdrs = dma_alloc_coherent(pp->dev->dev.parent,
txq->size * TSO_HEADER_SIZE,
&txq->tso_hdrs_phys, GFP_KERNEL);
if (txq->tso_hdrs == NULL) {
kfree(txq->tx_skb);
dma_free_coherent(pp->dev->dev.parent,
txq->size * MVNETA_DESC_ALIGNED_SIZE,
txq->descs, txq->descs_phys);
return -ENOMEM;
}
mvneta_tx_done_pkts_coal_set(pp, txq, txq->done_pkts_coal); mvneta_tx_done_pkts_coal_set(pp, txq, txq->done_pkts_coal);
return 0; return 0;
...@@ -2120,6 +2266,10 @@ static void mvneta_txq_deinit(struct mvneta_port *pp, ...@@ -2120,6 +2266,10 @@ static void mvneta_txq_deinit(struct mvneta_port *pp,
{ {
kfree(txq->tx_skb); kfree(txq->tx_skb);
if (txq->tso_hdrs)
dma_free_coherent(pp->dev->dev.parent,
txq->size * TSO_HEADER_SIZE,
txq->tso_hdrs, txq->tso_hdrs_phys);
if (txq->descs) if (txq->descs)
dma_free_coherent(pp->dev->dev.parent, dma_free_coherent(pp->dev->dev.parent,
txq->size * MVNETA_DESC_ALIGNED_SIZE, txq->size * MVNETA_DESC_ALIGNED_SIZE,
...@@ -2895,9 +3045,9 @@ static int mvneta_probe(struct platform_device *pdev) ...@@ -2895,9 +3045,9 @@ static int mvneta_probe(struct platform_device *pdev)
netif_napi_add(dev, &pp->napi, mvneta_poll, pp->weight); netif_napi_add(dev, &pp->napi, mvneta_poll, pp->weight);
dev->features = NETIF_F_SG | NETIF_F_IP_CSUM; dev->features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
dev->hw_features |= NETIF_F_SG | NETIF_F_IP_CSUM; dev->hw_features |= dev->features;
dev->vlan_features |= NETIF_F_SG | NETIF_F_IP_CSUM; dev->vlan_features |= dev->features;
dev->priv_flags |= IFF_UNICAST_FLT; dev->priv_flags |= IFF_UNICAST_FLT;
err = register_netdev(dev); err = register_netdev(dev);
......
#ifndef _TSO_H
#define _TSO_H
#include <net/ip.h>
struct tso_t {
int next_frag_idx;
void *data;
size_t size;
u16 ip_id;
u32 tcp_seq;
};
int tso_count_descs(struct sk_buff *skb);
void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso,
int size, bool is_last);
void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size);
void tso_start(struct sk_buff *skb, struct tso_t *tso);
#endif /* _TSO_H */
...@@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o ...@@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
sock_diag.o dev_ioctl.o sock_diag.o dev_ioctl.o tso.o
obj-$(CONFIG_XFRM) += flow.o obj-$(CONFIG_XFRM) += flow.o
obj-y += net-sysfs.o obj-y += net-sysfs.o
......
#include <net/ip.h>
#include <net/tso.h>
/* Calculate expected number of TX descriptors */
int tso_count_descs(struct sk_buff *skb)
{
/* The Marvell Way */
return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags;
}
void tso_build_hdr(struct sk_buff *skb, char *hdr, struct tso_t *tso,
int size, bool is_last)
{
struct iphdr *iph;
struct tcphdr *tcph;
int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
int mac_hdr_len = skb_network_offset(skb);
memcpy(hdr, skb->data, hdr_len);
iph = (struct iphdr *)(hdr + mac_hdr_len);
iph->id = htons(tso->ip_id);
iph->tot_len = htons(size + hdr_len - mac_hdr_len);
tcph = (struct tcphdr *)(hdr + skb_transport_offset(skb));
tcph->seq = htonl(tso->tcp_seq);
tso->ip_id++;
if (!is_last) {
/* Clear all special flags for not last packet */
tcph->psh = 0;
tcph->fin = 0;
tcph->rst = 0;
}
}
void tso_build_data(struct sk_buff *skb, struct tso_t *tso, int size)
{
tso->tcp_seq += size;
tso->size -= size;
tso->data += size;
if ((tso->size == 0) &&
(tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
/* Move to next segment */
tso->size = frag->size;
tso->data = page_address(frag->page.p) + frag->page_offset;
tso->next_frag_idx++;
}
}
void tso_start(struct sk_buff *skb, struct tso_t *tso)
{
int hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
tso->ip_id = ntohs(ip_hdr(skb)->id);
tso->tcp_seq = ntohl(tcp_hdr(skb)->seq);
tso->next_frag_idx = 0;
/* Build first data */
tso->size = skb_headlen(skb) - hdr_len;
tso->data = skb->data + hdr_len;
if ((tso->size == 0) &&
(tso->next_frag_idx < skb_shinfo(skb)->nr_frags)) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[tso->next_frag_idx];
/* Move to next segment */
tso->size = frag->size;
tso->data = page_address(frag->page.p) + frag->page_offset;
tso->next_frag_idx++;
}
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment