Commit fc791b63 authored by Paolo Abeni's avatar Paolo Abeni Committed by David S. Miller

IB/ipoib: move back IB LL address into the hard header

After the commit 9207f9d4 ("net: preserve IP control block
during GSO segmentation"), the GSO CB and the IPoIB CB conflict.
That destroy the IPoIB address information cached there,
causing a severe performance regression, as better described here:

http://marc.info/?l=linux-kernel&m=146787279825501&w=2

This change moves the data cached by the IPoIB driver from the
skb control lock into the IPoIB hard header, as done before
the commit 936d7de3 ("IPoIB: Stop lying about hard_header_len
and use skb->cb to stash LL addresses").
In order to avoid GRO issue, on packet reception, the IPoIB driver
stash into the skb a dummy pseudo header, so that the received
packets have actually a hard header matching the declared length.
To avoid changing the connected mode maximum mtu, the allocated
head buffer size is increased by the pseudo header length.

After this commit, IPoIB performances are back to pre-regression
value.

v2 -> v3: rebased
v1 -> v2: avoid changing the max mtu, increasing the head buf size

Fixes: 9207f9d4 ("net: preserve IP control block during GSO segmentation")
Signed-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent f1f081ce
...@@ -63,6 +63,8 @@ enum ipoib_flush_level { ...@@ -63,6 +63,8 @@ enum ipoib_flush_level {
enum { enum {
IPOIB_ENCAP_LEN = 4, IPOIB_ENCAP_LEN = 4,
IPOIB_PSEUDO_LEN = 20,
IPOIB_HARD_LEN = IPOIB_ENCAP_LEN + IPOIB_PSEUDO_LEN,
IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN, IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */ IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */
...@@ -134,15 +136,21 @@ struct ipoib_header { ...@@ -134,15 +136,21 @@ struct ipoib_header {
u16 reserved; u16 reserved;
}; };
struct ipoib_cb { struct ipoib_pseudo_header {
struct qdisc_skb_cb qdisc_cb; u8 hwaddr[INFINIBAND_ALEN];
u8 hwaddr[INFINIBAND_ALEN];
}; };
static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb) static inline void skb_add_pseudo_hdr(struct sk_buff *skb)
{ {
BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb)); char *data = skb_push(skb, IPOIB_PSEUDO_LEN);
return (struct ipoib_cb *)skb->cb;
/*
* only the ipoib header is present now, make room for a dummy
* pseudo header and set skb field accordingly
*/
memset(data, 0, IPOIB_PSEUDO_LEN);
skb_reset_mac_header(skb);
skb_pull(skb, IPOIB_HARD_LEN);
} }
/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */ /* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
......
...@@ -63,6 +63,8 @@ MODULE_PARM_DESC(cm_data_debug_level, ...@@ -63,6 +63,8 @@ MODULE_PARM_DESC(cm_data_debug_level,
#define IPOIB_CM_RX_DELAY (3 * 256 * HZ) #define IPOIB_CM_RX_DELAY (3 * 256 * HZ)
#define IPOIB_CM_RX_UPDATE_MASK (0x3) #define IPOIB_CM_RX_UPDATE_MASK (0x3)
#define IPOIB_CM_RX_RESERVE (ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN)
static struct ib_qp_attr ipoib_cm_err_attr = { static struct ib_qp_attr ipoib_cm_err_attr = {
.qp_state = IB_QPS_ERR .qp_state = IB_QPS_ERR
}; };
...@@ -146,15 +148,15 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, ...@@ -146,15 +148,15 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,
struct sk_buff *skb; struct sk_buff *skb;
int i; int i;
skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12); skb = dev_alloc_skb(ALIGN(IPOIB_CM_HEAD_SIZE + IPOIB_PSEUDO_LEN, 16));
if (unlikely(!skb)) if (unlikely(!skb))
return NULL; return NULL;
/* /*
* IPoIB adds a 4 byte header. So we need 12 more bytes to align the * IPoIB adds a IPOIB_ENCAP_LEN byte header, this will align the
* IP header to a multiple of 16. * IP header to a multiple of 16.
*/ */
skb_reserve(skb, 12); skb_reserve(skb, IPOIB_CM_RX_RESERVE);
mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE, mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
DMA_FROM_DEVICE); DMA_FROM_DEVICE);
...@@ -624,9 +626,9 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) ...@@ -624,9 +626,9 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
if (wc->byte_len < IPOIB_CM_COPYBREAK) { if (wc->byte_len < IPOIB_CM_COPYBREAK) {
int dlen = wc->byte_len; int dlen = wc->byte_len;
small_skb = dev_alloc_skb(dlen + 12); small_skb = dev_alloc_skb(dlen + IPOIB_CM_RX_RESERVE);
if (small_skb) { if (small_skb) {
skb_reserve(small_skb, 12); skb_reserve(small_skb, IPOIB_CM_RX_RESERVE);
ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0], ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
dlen, DMA_FROM_DEVICE); dlen, DMA_FROM_DEVICE);
skb_copy_from_linear_data(skb, small_skb->data, dlen); skb_copy_from_linear_data(skb, small_skb->data, dlen);
...@@ -663,8 +665,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) ...@@ -663,8 +665,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
copied: copied:
skb->protocol = ((struct ipoib_header *) skb->data)->proto; skb->protocol = ((struct ipoib_header *) skb->data)->proto;
skb_reset_mac_header(skb); skb_add_pseudo_hdr(skb);
skb_pull(skb, IPOIB_ENCAP_LEN);
++dev->stats.rx_packets; ++dev->stats.rx_packets;
dev->stats.rx_bytes += skb->len; dev->stats.rx_bytes += skb->len;
......
...@@ -128,16 +128,15 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id) ...@@ -128,16 +128,15 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu); buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN); skb = dev_alloc_skb(buf_size + IPOIB_HARD_LEN);
if (unlikely(!skb)) if (unlikely(!skb))
return NULL; return NULL;
/* /*
* IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte * the IP header will be at IPOIP_HARD_LEN + IB_GRH_BYTES, that is
* header. So we need 4 more bytes to get to 48 and align the * 64 bytes aligned
* IP header to a multiple of 16.
*/ */
skb_reserve(skb, 4); skb_reserve(skb, sizeof(struct ipoib_pseudo_header));
mapping = priv->rx_ring[id].mapping; mapping = priv->rx_ring[id].mapping;
mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size, mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
...@@ -253,8 +252,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) ...@@ -253,8 +252,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
skb_pull(skb, IB_GRH_BYTES); skb_pull(skb, IB_GRH_BYTES);
skb->protocol = ((struct ipoib_header *) skb->data)->proto; skb->protocol = ((struct ipoib_header *) skb->data)->proto;
skb_reset_mac_header(skb); skb_add_pseudo_hdr(skb);
skb_pull(skb, IPOIB_ENCAP_LEN);
++dev->stats.rx_packets; ++dev->stats.rx_packets;
dev->stats.rx_bytes += skb->len; dev->stats.rx_bytes += skb->len;
......
...@@ -925,9 +925,12 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, ...@@ -925,9 +925,12 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
ipoib_neigh_free(neigh); ipoib_neigh_free(neigh);
goto err_drop; goto err_drop;
} }
if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) if (skb_queue_len(&neigh->queue) <
IPOIB_MAX_PATH_REC_QUEUE) {
/* put pseudoheader back on for next time */
skb_push(skb, IPOIB_PSEUDO_LEN);
__skb_queue_tail(&neigh->queue, skb); __skb_queue_tail(&neigh->queue, skb);
else { } else {
ipoib_warn(priv, "queue length limit %d. Packet drop.\n", ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
skb_queue_len(&neigh->queue)); skb_queue_len(&neigh->queue));
goto err_drop; goto err_drop;
...@@ -964,7 +967,7 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr, ...@@ -964,7 +967,7 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
} }
static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
struct ipoib_cb *cb) struct ipoib_pseudo_header *phdr)
{ {
struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_path *path; struct ipoib_path *path;
...@@ -972,16 +975,18 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, ...@@ -972,16 +975,18 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
spin_lock_irqsave(&priv->lock, flags); spin_lock_irqsave(&priv->lock, flags);
path = __path_find(dev, cb->hwaddr + 4); path = __path_find(dev, phdr->hwaddr + 4);
if (!path || !path->valid) { if (!path || !path->valid) {
int new_path = 0; int new_path = 0;
if (!path) { if (!path) {
path = path_rec_create(dev, cb->hwaddr + 4); path = path_rec_create(dev, phdr->hwaddr + 4);
new_path = 1; new_path = 1;
} }
if (path) { if (path) {
if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
/* put pseudoheader back on for next time */
skb_push(skb, IPOIB_PSEUDO_LEN);
__skb_queue_tail(&path->queue, skb); __skb_queue_tail(&path->queue, skb);
} else { } else {
++dev->stats.tx_dropped; ++dev->stats.tx_dropped;
...@@ -1009,10 +1014,12 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev, ...@@ -1009,10 +1014,12 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
be16_to_cpu(path->pathrec.dlid)); be16_to_cpu(path->pathrec.dlid));
spin_unlock_irqrestore(&priv->lock, flags); spin_unlock_irqrestore(&priv->lock, flags);
ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr)); ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
return; return;
} else if ((path->query || !path_rec_start(dev, path)) && } else if ((path->query || !path_rec_start(dev, path)) &&
skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) { skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
/* put pseudoheader back on for next time */
skb_push(skb, IPOIB_PSEUDO_LEN);
__skb_queue_tail(&path->queue, skb); __skb_queue_tail(&path->queue, skb);
} else { } else {
++dev->stats.tx_dropped; ++dev->stats.tx_dropped;
...@@ -1026,13 +1033,15 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -1026,13 +1033,15 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
{ {
struct ipoib_dev_priv *priv = netdev_priv(dev); struct ipoib_dev_priv *priv = netdev_priv(dev);
struct ipoib_neigh *neigh; struct ipoib_neigh *neigh;
struct ipoib_cb *cb = ipoib_skb_cb(skb); struct ipoib_pseudo_header *phdr;
struct ipoib_header *header; struct ipoib_header *header;
unsigned long flags; unsigned long flags;
phdr = (struct ipoib_pseudo_header *) skb->data;
skb_pull(skb, sizeof(*phdr));
header = (struct ipoib_header *) skb->data; header = (struct ipoib_header *) skb->data;
if (unlikely(cb->hwaddr[4] == 0xff)) { if (unlikely(phdr->hwaddr[4] == 0xff)) {
/* multicast, arrange "if" according to probability */ /* multicast, arrange "if" according to probability */
if ((header->proto != htons(ETH_P_IP)) && if ((header->proto != htons(ETH_P_IP)) &&
(header->proto != htons(ETH_P_IPV6)) && (header->proto != htons(ETH_P_IPV6)) &&
...@@ -1045,13 +1054,13 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -1045,13 +1054,13 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK; return NETDEV_TX_OK;
} }
/* Add in the P_Key for multicast*/ /* Add in the P_Key for multicast*/
cb->hwaddr[8] = (priv->pkey >> 8) & 0xff; phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
cb->hwaddr[9] = priv->pkey & 0xff; phdr->hwaddr[9] = priv->pkey & 0xff;
neigh = ipoib_neigh_get(dev, cb->hwaddr); neigh = ipoib_neigh_get(dev, phdr->hwaddr);
if (likely(neigh)) if (likely(neigh))
goto send_using_neigh; goto send_using_neigh;
ipoib_mcast_send(dev, cb->hwaddr, skb); ipoib_mcast_send(dev, phdr->hwaddr, skb);
return NETDEV_TX_OK; return NETDEV_TX_OK;
} }
...@@ -1060,16 +1069,16 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -1060,16 +1069,16 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
case htons(ETH_P_IP): case htons(ETH_P_IP):
case htons(ETH_P_IPV6): case htons(ETH_P_IPV6):
case htons(ETH_P_TIPC): case htons(ETH_P_TIPC):
neigh = ipoib_neigh_get(dev, cb->hwaddr); neigh = ipoib_neigh_get(dev, phdr->hwaddr);
if (unlikely(!neigh)) { if (unlikely(!neigh)) {
neigh_add_path(skb, cb->hwaddr, dev); neigh_add_path(skb, phdr->hwaddr, dev);
return NETDEV_TX_OK; return NETDEV_TX_OK;
} }
break; break;
case htons(ETH_P_ARP): case htons(ETH_P_ARP):
case htons(ETH_P_RARP): case htons(ETH_P_RARP):
/* for unicast ARP and RARP should always perform path find */ /* for unicast ARP and RARP should always perform path find */
unicast_arp_send(skb, dev, cb); unicast_arp_send(skb, dev, phdr);
return NETDEV_TX_OK; return NETDEV_TX_OK;
default: default:
/* ethertype not supported by IPoIB */ /* ethertype not supported by IPoIB */
...@@ -1086,11 +1095,13 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -1086,11 +1095,13 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
goto unref; goto unref;
} }
} else if (neigh->ah) { } else if (neigh->ah) {
ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr)); ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(phdr->hwaddr));
goto unref; goto unref;
} }
if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) { if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
/* put pseudoheader back on for next time */
skb_push(skb, sizeof(*phdr));
spin_lock_irqsave(&priv->lock, flags); spin_lock_irqsave(&priv->lock, flags);
__skb_queue_tail(&neigh->queue, skb); __skb_queue_tail(&neigh->queue, skb);
spin_unlock_irqrestore(&priv->lock, flags); spin_unlock_irqrestore(&priv->lock, flags);
...@@ -1122,8 +1133,8 @@ static int ipoib_hard_header(struct sk_buff *skb, ...@@ -1122,8 +1133,8 @@ static int ipoib_hard_header(struct sk_buff *skb,
unsigned short type, unsigned short type,
const void *daddr, const void *saddr, unsigned len) const void *daddr, const void *saddr, unsigned len)
{ {
struct ipoib_pseudo_header *phdr;
struct ipoib_header *header; struct ipoib_header *header;
struct ipoib_cb *cb = ipoib_skb_cb(skb);
header = (struct ipoib_header *) skb_push(skb, sizeof *header); header = (struct ipoib_header *) skb_push(skb, sizeof *header);
...@@ -1132,12 +1143,13 @@ static int ipoib_hard_header(struct sk_buff *skb, ...@@ -1132,12 +1143,13 @@ static int ipoib_hard_header(struct sk_buff *skb,
/* /*
* we don't rely on dst_entry structure, always stuff the * we don't rely on dst_entry structure, always stuff the
* destination address into skb->cb so we can figure out where * destination address into skb hard header so we can figure out where
* to send the packet later. * to send the packet later.
*/ */
memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN); phdr = (struct ipoib_pseudo_header *) skb_push(skb, sizeof(*phdr));
memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
return sizeof *header; return IPOIB_HARD_LEN;
} }
static void ipoib_set_mcast_list(struct net_device *dev) static void ipoib_set_mcast_list(struct net_device *dev)
...@@ -1759,7 +1771,7 @@ void ipoib_setup(struct net_device *dev) ...@@ -1759,7 +1771,7 @@ void ipoib_setup(struct net_device *dev)
dev->flags |= IFF_BROADCAST | IFF_MULTICAST; dev->flags |= IFF_BROADCAST | IFF_MULTICAST;
dev->hard_header_len = IPOIB_ENCAP_LEN; dev->hard_header_len = IPOIB_HARD_LEN;
dev->addr_len = INFINIBAND_ALEN; dev->addr_len = INFINIBAND_ALEN;
dev->type = ARPHRD_INFINIBAND; dev->type = ARPHRD_INFINIBAND;
dev->tx_queue_len = ipoib_sendq_size * 2; dev->tx_queue_len = ipoib_sendq_size * 2;
......
...@@ -796,9 +796,11 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) ...@@ -796,9 +796,11 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
__ipoib_mcast_add(dev, mcast); __ipoib_mcast_add(dev, mcast);
list_add_tail(&mcast->list, &priv->multicast_list); list_add_tail(&mcast->list, &priv->multicast_list);
} }
if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) {
/* put pseudoheader back on for next time */
skb_push(skb, sizeof(struct ipoib_pseudo_header));
skb_queue_tail(&mcast->pkt_queue, skb); skb_queue_tail(&mcast->pkt_queue, skb);
else { } else {
++dev->stats.tx_dropped; ++dev->stats.tx_dropped;
dev_kfree_skb_any(skb); dev_kfree_skb_any(skb);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment