Commit c1e60bd4 authored by David S. Miller

Merge branch 'csums-next'

Tom Herbert says:

====================
net: Checksum offload changes - Part V

I am working on overhauling RX checksum offload. Goals of this effort
are:

- Specify what exactly it means when driver returns CHECKSUM_UNNECESSARY
- Preserve CHECKSUM_COMPLETE through encapsulation layers
- Don't do skb_checksum more than once per packet
- Unify GRO and non-GRO csum verification as much as possible
- Unify the checksum functions (checksum_init)
- Simplify code

What is in this fifth patch set:

- Added GRO checksum validation functions
- Call the GRO validation functions from TCP and GRE gro_receive
- Perform checksum verification in the UDP gro_receive path using
  GRO functions and add support for gro_receive in UDP6

Changes in V2:

- In GRO checksum validation, set ip_summed to CHECKSUM_UNNECESSARY
  instead of converting it to CHECKSUM_COMPLETE. This avoids the
  performance penalty of checksumming the bytes that precede the
  header GRO is currently at.

Please review carefully and test if possible, mucking with basic
checksum functions is always a little precarious :-)

----

Test results with this patch set are below. I did not notice any
performance regression.

Tests run:
   TCP_STREAM: super_netperf with 200 streams
   TCP_RR: super_netperf with 200 streams and -r 1,1

Device bnx2x (10Gbps):
   No GRE RSS hash (RX interrupts occur on one core)
   UDP RSS port hashing enabled.

* GRE with checksum with IPv4 encapsulated packets
  With fix:
    TCP_STREAM
        9.91% CPU utilization
        5163.78 Mbps
    TCP_RR
        50.64% CPU utilization
        219/347/502 90/95/99% latencies
        834103 tps
  Without fix:
    TCP_STREAM
        10.05% CPU utilization
        5186.22 Mbps
    TCP_RR
        49.70% CPU utilization
        227/338/486 90/95/99% latencies
        813450 tps

* GRE without checksum with IPv4 encapsulated packets
  With fix:
    TCP_STREAM
        10.18% CPU utilization
        5159 Mbps
    TCP_RR
        51.86% CPU utilization
        214/325/471 90/95/99% latencies
        865943 tps
  Without fix:
    TCP_STREAM
        10.26% CPU utilization
        5307.87 Mbps
    TCP_RR
        50.59% CPU utilization
        224/325/476 90/95/99% latencies
        846429 tps

*** Simulate device returns CHECKSUM_COMPLETE

* VXLAN with checksum
  With fix:
    TCP_STREAM
        13.03% CPU utilization
        9093.9 Mbps
    TCP_RR
        95.96% CPU utilization
        161/259/474 90/95/99% latencies
        1.14806e+06 tps
  Without fix:
    TCP_STREAM
        13.59% CPU utilization
        9093.97 Mbps
    TCP_RR
        93.95% CPU utilization
        160/259/484 90/95/99% latencies
        1.10262e+06 tps

* VXLAN without checksum
  With fix:
    TCP_STREAM
        13.28% CPU utilization
        9093.87 Mbps
    TCP_RR
        95.04% CPU utilization
        155/246/439 90/95/99% latencies
        1.15e+06 tps
  Without fix:
    TCP_STREAM
        13.37% CPU utilization
        9178.45 Mbps
    TCP_RR
        93.74% CPU utilization
        161/257/469 90/95/99% latencies
        1.1068e+06 tps
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 8fc54f68 48a5fc77
......@@ -1883,7 +1883,13 @@ struct napi_gro_cb {
u16 proto;
/* Used in udp_gro_receive */
u16 udp_mark;
u8 udp_mark:1;
/* GRO checksum is valid */
u8 csum_valid:1;
/* Number of encapsulation layers crossed */
u8 encapsulation;
/* used to support CHECKSUM_COMPLETE for tunneling protocols */
__wsum csum;
......@@ -2154,11 +2160,77 @@ static inline void *skb_gro_network_header(struct sk_buff *skb)
static inline void skb_gro_postpull_rcsum(struct sk_buff *skb,
const void *start, unsigned int len)
{
if (skb->ip_summed == CHECKSUM_COMPLETE)
if (NAPI_GRO_CB(skb)->csum_valid)
NAPI_GRO_CB(skb)->csum = csum_sub(NAPI_GRO_CB(skb)->csum,
csum_partial(start, len, 0));
}
/* GRO checksum functions. These are logical equivalents of the normal
* checksum functions (in skbuff.h) except that they operate on the GRO
* offsets and fields in sk_buff.
*/
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb);
/* Report whether GRO must still validate the checksum at the current
 * GRO offset.
 *
 * @skb:       packet being processed by GRO
 * @zero_okay: true when a zero checksum field means "no checksum present"
 * @check:     checksum field taken from the protocol header
 *
 * Returns false (nothing to validate) when any of the following holds:
 *  - ip_summed is CHECKSUM_PARTIAL;
 *  - ip_summed is CHECKSUM_UNNECESSARY and GRO has not crossed more
 *    encapsulation layers than skb->encapsulation records;
 *  - zero_okay is set and the checksum field is zero.
 * Returns true otherwise.
 */
static inline bool __skb_gro_checksum_validate_needed(struct sk_buff *skb,
						      bool zero_okay,
						      __sum16 check)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		return false;

	if (skb->ip_summed == CHECKSUM_UNNECESSARY &&
	    NAPI_GRO_CB(skb)->encapsulation <= skb->encapsulation)
		return false;

	if (zero_okay && !check)
		return false;

	return true;
}
/* Validate the checksum at the GRO offset against pseudo checksum @psum.
 *
 * If the GRO control block already holds a valid running checksum, try it
 * first: when it folds to zero together with @psum the packet is good and
 * 0 is returned without touching the payload.  Otherwise @psum is stored
 * in NAPI_GRO_CB(skb)->csum and the result of
 * __skb_gro_checksum_complete() is returned.
 */
static inline __sum16 __skb_gro_checksum_validate_complete(struct sk_buff *skb,
							   __wsum psum)
{
	if (NAPI_GRO_CB(skb)->csum_valid) {
		__wsum total = csum_add(psum, NAPI_GRO_CB(skb)->csum);

		if (!csum_fold(total))
			return 0;
	}

	/* __skb_gro_checksum_complete() reads the pseudo checksum from here */
	NAPI_GRO_CB(skb)->csum = psum;

	return __skb_gro_checksum_complete(skb);
}
/* Record in the skb that GRO verified a checksum, either a top level one
 * or an encapsulated one.  Should the packet later fall back to the
 * normal receive path, that work does not have to be repeated.
 *
 *  - CHECKSUM_UNNECESSARY: if GRO has crossed at least one encapsulation
 *    layer, set skb->encapsulation to 1; ip_summed is left as is.
 *  - CHECKSUM_PARTIAL: the skb is left untouched.
 *  - anything else: ip_summed becomes CHECKSUM_UNNECESSARY and
 *    skb->encapsulation is cleared.
 */
static inline void skb_gro_incr_csum_unnecessary(struct sk_buff *skb)
{
	switch (skb->ip_summed) {
	case CHECKSUM_UNNECESSARY:
		if (NAPI_GRO_CB(skb)->encapsulation)
			skb->encapsulation = 1;
		break;
	case CHECKSUM_PARTIAL:
		break;
	default:
		skb->ip_summed = CHECKSUM_UNNECESSARY;
		skb->encapsulation = 0;
		break;
	}
}
/* Validate the checksum at the current GRO offset if that is needed.
 *
 * @skb:            packet being processed by GRO
 * @proto:          protocol number handed to @compute_pseudo
 * @zero_okay:      true when a zero @check means no validation is required
 * @check:          checksum field from the protocol header
 * @compute_pseudo: called as compute_pseudo(skb, proto) to obtain the
 *                  pseudo checksum; only invoked when validation is needed
 *
 * Evaluates to 0 when validation was not needed or succeeded; in that case
 * skb_gro_incr_csum_unnecessary() is applied to the skb.  Otherwise it
 * evaluates to the non-zero value returned by
 * __skb_gro_checksum_validate_complete().
 *
 * Note: @skb is expanded more than once, so pass an expression without
 * side effects.
 */
#define __skb_gro_checksum_validate(skb, proto, zero_okay, check, \
compute_pseudo) \
({ \
__sum16 __ret = 0; \
if (__skb_gro_checksum_validate_needed(skb, zero_okay, check)) \
__ret = __skb_gro_checksum_validate_complete(skb, \
compute_pseudo(skb, proto)); \
if (!__ret) \
skb_gro_incr_csum_unnecessary(skb); \
__ret; \
})
/* Validate a checksum that is always present: zero_okay is false and the
 * check argument is unused (passed as 0).
 */
#define skb_gro_checksum_validate(skb, proto, compute_pseudo) \
__skb_gro_checksum_validate(skb, proto, false, 0, compute_pseudo)
/* Validate a checksum whose header field may legitimately be zero: with
 * zero_okay set, a zero @check skips validation and the macro evaluates
 * to 0.
 */
#define skb_gro_checksum_validate_zero_check(skb, proto, check, \
compute_pseudo) \
__skb_gro_checksum_validate(skb, proto, true, check, compute_pseudo)
/* Validate a checksum using proto 0 and null_compute_pseudo.
 * NOTE(review): null_compute_pseudo is defined outside this view;
 * presumably it contributes no pseudo header - confirm.
 */
#define skb_gro_checksum_simple_validate(skb) \
__skb_gro_checksum_validate(skb, 0, false, 0, null_compute_pseudo)
static inline int dev_hard_header(struct sk_buff *skb, struct net_device *dev,
unsigned short type,
const void *daddr, const void *saddr,
......
......@@ -364,6 +364,14 @@ static inline void inet_set_txhash(struct sock *sk)
sk->sk_txhash = flow_hash_from_keys(&keys);
}
/* Build the unfolded IPv4 pseudo-header checksum for the data GRO is
 * currently looking at.
 *
 * The addresses come from the header returned by
 * skb_gro_network_header(), the length is skb_gro_len(skb) and the
 * initial sum is 0.
 */
static inline __wsum inet_gro_compute_pseudo(struct sk_buff *skb, int proto)
{
	const struct iphdr *ip4h;

	ip4h = skb_gro_network_header(skb);

	return csum_tcpudp_nofold(ip4h->saddr, ip4h->daddr,
				  skb_gro_len(skb), proto, 0);
}
/*
* Map a multicast IP onto multicast MAC for type ethernet.
*/
......
......@@ -48,6 +48,14 @@ static inline __wsum ip6_compute_pseudo(struct sk_buff *skb, int proto)
skb->len, proto, 0));
}
static inline __wsum ip6_gro_compute_pseudo(struct sk_buff *skb, int proto)
{
const struct ipv6hdr *iph = skb_gro_network_header(skb);
return ~csum_unfold(csum_ipv6_magic(&iph->saddr, &iph->daddr,
skb_gro_len(skb), proto, 0));
}
static __inline__ __sum16 tcp_v6_check(int len,
const struct in6_addr *saddr,
const struct in6_addr *daddr,
......
......@@ -158,6 +158,24 @@ static inline __sum16 udp_v4_check(int len, __be32 saddr,
void udp_set_csum(bool nocheck, struct sk_buff *skb,
__be32 saddr, __be32 daddr, int len);
struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
struct udphdr *uh);
int udp_gro_complete(struct sk_buff *skb, int nhoff);
/* Locate the UDP header at the current GRO offset.
 *
 * The fast accessor is tried first; if skb_gro_header_hard() reports that
 * the header cannot be taken that way, the slow accessor is used and its
 * result returned instead.  Callers must check the result for NULL.
 */
static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
{
	unsigned int start = skb_gro_offset(skb);
	unsigned int end = start + sizeof(struct udphdr);
	struct udphdr *hdr = skb_gro_header_fast(skb, start);

	if (!skb_gro_header_hard(skb, end))
		return hdr;

	return skb_gro_header_slow(skb, end, start);
}
/* hash routines shared between UDPv4/6 and UDP-Litev4/6 */
static inline void udp_lib_hash(struct sock *sk)
{
......
......@@ -3962,7 +3962,13 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
goto normal;
gro_list_prepare(napi, skb);
NAPI_GRO_CB(skb)->csum = skb->csum; /* Needed for CHECKSUM_COMPLETE */
if (skb->ip_summed == CHECKSUM_COMPLETE) {
NAPI_GRO_CB(skb)->csum = skb->csum;
NAPI_GRO_CB(skb)->csum_valid = 1;
} else {
NAPI_GRO_CB(skb)->csum_valid = 0;
}
rcu_read_lock();
list_for_each_entry_rcu(ptype, head, list) {
......@@ -3975,6 +3981,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
NAPI_GRO_CB(skb)->flush = 0;
NAPI_GRO_CB(skb)->free = 0;
NAPI_GRO_CB(skb)->udp_mark = 0;
NAPI_GRO_CB(skb)->encapsulation = 0;
pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
break;
......@@ -4205,6 +4212,31 @@ gro_result_t napi_gro_frags(struct napi_struct *napi)
}
EXPORT_SYMBOL(napi_gro_frags);
/* Checksum the packet in software from the GRO offset onwards, add the
 * pseudo checksum found in NAPI_GRO_CB(skb)->csum and return the folded
 * result (0 means the checksum is good).
 *
 * On return NAPI_GRO_CB(skb)->csum holds the freshly computed sum of the
 * data (without the pseudo checksum) and csum_valid is set.
 */
__sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
{
	__wsum data_sum;
	__sum16 folded;

	data_sum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);

	/* At this point NAPI_GRO_CB(skb)->csum is the pseudo checksum */
	folded = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, data_sum));

	/* The software sum is good, yet we were only called because the
	 * CHECKSUM_COMPLETE value did not verify.  Unless that value was
	 * itself produced in software, report a checksum fault on the device.
	 */
	if (likely(!folded) &&
	    unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
	    !skb->csum_complete_sw)
		netdev_rx_csum_fault(skb->dev);

	NAPI_GRO_CB(skb)->csum = data_sum;
	NAPI_GRO_CB(skb)->csum_valid = 1;

	return folded;
}
EXPORT_SYMBOL(__skb_gro_checksum_complete);
/*
* net_rps_action_and_irq_enable sends any pending IPI's for rps.
* Note: called with local irq disabled, but exits with local irq enabled.
......
......@@ -125,6 +125,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
*csum_err = true;
return -EINVAL;
}
skb_pop_rcv_encapsulation(skb);
options++;
}
......
......@@ -119,28 +119,6 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
return segs;
}
/* Compute the whole skb csum in s/w and store it, then verify GRO csum
* starting from gro_offset.
*/
static __sum16 gro_skb_checksum(struct sk_buff *skb)
{
__sum16 sum;
skb->csum = skb_checksum(skb, 0, skb->len, 0);
NAPI_GRO_CB(skb)->csum = csum_sub(skb->csum,
csum_partial(skb->data, skb_gro_offset(skb), 0));
sum = csum_fold(NAPI_GRO_CB(skb)->csum);
if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) {
if (unlikely(!sum) && !skb->csum_complete_sw)
netdev_rx_csum_fault(skb->dev);
} else {
skb->ip_summed = CHECKSUM_COMPLETE;
skb->csum_complete_sw = 1;
}
return sum;
}
static struct sk_buff **gre_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
{
......@@ -192,22 +170,15 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
if (unlikely(!greh))
goto out_unlock;
}
if (greh->flags & GRE_CSUM) { /* Need to verify GRE csum first */
__sum16 csum = 0;
if (skb->ip_summed == CHECKSUM_COMPLETE)
csum = csum_fold(NAPI_GRO_CB(skb)->csum);
/* Don't trust csum error calculated/reported by h/w */
if (skb->ip_summed == CHECKSUM_NONE || csum != 0)
csum = gro_skb_checksum(skb);
/* GRE CSUM is the 1's complement of the 1's complement sum
* of the GRE hdr plus payload so it should add up to 0xffff
* (and 0 after csum_fold()) just like the IPv4 hdr csum.
*/
if (csum)
/* Don't bother verifying checksum if we're going to flush anyway. */
if (greh->flags & GRE_CSUM) {
if (!NAPI_GRO_CB(skb)->flush &&
skb_gro_checksum_simple_validate(skb))
goto out_unlock;
NAPI_GRO_CB(skb)->encapsulation++;
}
flush = 0;
for (p = *head; p; p = p->next) {
......
......@@ -288,35 +288,14 @@ static int tcp_v4_gso_send_check(struct sk_buff *skb)
static struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
{
/* Use the IP hdr immediately preceding this transport */
const struct iphdr *iph = skb_gro_network_header(skb);
__wsum wsum;
/* Don't bother verifying checksum if we're going to flush anyway. */
if (NAPI_GRO_CB(skb)->flush)
goto skip_csum;
wsum = NAPI_GRO_CB(skb)->csum;
switch (skb->ip_summed) {
case CHECKSUM_NONE:
wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb),
0);
/* fall through */
case CHECKSUM_COMPLETE:
if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
wsum)) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
break;
}
if (!NAPI_GRO_CB(skb)->flush &&
skb_gro_checksum_validate(skb, IPPROTO_TCP,
inet_gro_compute_pseudo)) {
NAPI_GRO_CB(skb)->flush = 1;
return NULL;
}
skip_csum:
return tcp_gro_receive(head, skb);
}
......
......@@ -99,6 +99,7 @@
#include <linux/slab.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/net_namespace.h>
......
......@@ -228,29 +228,22 @@ void udp_del_offload(struct udp_offload *uo)
}
EXPORT_SYMBOL(udp_del_offload);
static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb)
struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
struct udphdr *uh)
{
struct udp_offload_priv *uo_priv;
struct sk_buff *p, **pp = NULL;
struct udphdr *uh, *uh2;
unsigned int hlen, off;
struct udphdr *uh2;
unsigned int off = skb_gro_offset(skb);
int flush = 1;
if (NAPI_GRO_CB(skb)->udp_mark ||
(!skb->encapsulation && skb->ip_summed != CHECKSUM_COMPLETE))
(!skb->encapsulation && !NAPI_GRO_CB(skb)->csum_valid))
goto out;
/* mark that this skb passed once through the udp gro layer */
NAPI_GRO_CB(skb)->udp_mark = 1;
off = skb_gro_offset(skb);
hlen = off + sizeof(*uh);
uh = skb_gro_header_fast(skb, off);
if (skb_gro_header_hard(skb, hlen)) {
uh = skb_gro_header_slow(skb, hlen, off);
if (unlikely(!uh))
goto out;
}
NAPI_GRO_CB(skb)->encapsulation++;
rcu_read_lock();
uo_priv = rcu_dereference(udp_offload_base);
......@@ -269,7 +262,12 @@ static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *s
continue;
uh2 = (struct udphdr *)(p->data + off);
if ((*(u32 *)&uh->source != *(u32 *)&uh2->source)) {
/* Match ports, and require the checksums to be either both zero
* or both nonzero.
*/
if ((*(u32 *)&uh->source != *(u32 *)&uh2->source) ||
(!uh->check ^ !uh2->check)) {
NAPI_GRO_CB(p)->same_flow = 0;
continue;
}
......@@ -286,7 +284,24 @@ static struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *s
return pp;
}
static int udp_gro_complete(struct sk_buff *skb, int nhoff)
/* GRO receive callback for UDP over IPv4.
 *
 * Fetches the UDP header and, unless the packet is already marked for
 * flushing, validates its checksum against the IPv4 pseudo header (a zero
 * checksum field is accepted without validation).  If the header cannot
 * be obtained or validation fails, the packet is marked for flushing and
 * NULL is returned; otherwise processing continues in udp_gro_receive().
 */
static struct sk_buff **udp4_gro_receive(struct sk_buff **head,
					 struct sk_buff *skb)
{
	struct udphdr *uh = udp_gro_udphdr(skb);

	if (unlikely(!uh))
		goto flush;

	/* Don't bother verifying checksum if we're going to flush anyway. */
	if (!NAPI_GRO_CB(skb)->flush &&
	    skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check,
						 inet_gro_compute_pseudo))
		goto flush;

	return udp_gro_receive(head, skb, uh);

flush:
	NAPI_GRO_CB(skb)->flush = 1;
	return NULL;
}
int udp_gro_complete(struct sk_buff *skb, int nhoff)
{
struct udp_offload_priv *uo_priv;
__be16 newlen = htons(skb->len - nhoff);
......@@ -311,12 +326,24 @@ static int udp_gro_complete(struct sk_buff *skb, int nhoff)
return err;
}
/* GRO complete callback for UDP over IPv4.
 *
 * @skb:   merged packet
 * @nhoff: offset of the UDP header from skb->data
 *
 * When the UDP header carries a checksum (non-zero check field), the
 * field is rewritten with the complement of udp_v4_check() computed over
 * the length from @nhoff to the end of the skb, the IPv4 addresses and a
 * zero base sum.  The common udp_gro_complete() then finishes the job and
 * its return value is passed through.
 */
int udp4_gro_complete(struct sk_buff *skb, int nhoff)
{
	const struct iphdr *ip4h = ip_hdr(skb);
	struct udphdr *udph = (struct udphdr *)(skb->data + nhoff);

	if (udph->check) {
		int udp_len = skb->len - nhoff;

		udph->check = ~udp_v4_check(udp_len, ip4h->saddr,
					    ip4h->daddr, 0);
	}

	return udp_gro_complete(skb, nhoff);
}
static const struct net_offload udpv4_offload = {
.callbacks = {
.gso_send_check = udp4_ufo_send_check,
.gso_segment = udp4_ufo_fragment,
.gro_receive = udp_gro_receive,
.gro_complete = udp_gro_complete,
.gro_receive = udp4_gro_receive,
.gro_complete = udp4_gro_complete,
},
};
......
......@@ -35,34 +35,14 @@ static int tcp_v6_gso_send_check(struct sk_buff *skb)
static struct sk_buff **tcp6_gro_receive(struct sk_buff **head,
struct sk_buff *skb)
{
const struct ipv6hdr *iph = skb_gro_network_header(skb);
__wsum wsum;
/* Don't bother verifying checksum if we're going to flush anyway. */
if (NAPI_GRO_CB(skb)->flush)
goto skip_csum;
wsum = NAPI_GRO_CB(skb)->csum;
switch (skb->ip_summed) {
case CHECKSUM_NONE:
wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb),
wsum);
/* fall through */
case CHECKSUM_COMPLETE:
if (!tcp_v6_check(skb_gro_len(skb), &iph->saddr, &iph->daddr,
wsum)) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
break;
}
if (!NAPI_GRO_CB(skb)->flush &&
skb_gro_checksum_validate(skb, IPPROTO_TCP,
ip6_gro_compute_pseudo)) {
NAPI_GRO_CB(skb)->flush = 1;
return NULL;
}
skip_csum:
return tcp_gro_receive(head, skb);
}
......
......@@ -10,6 +10,7 @@
* UDPv6 GSO support
*/
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <net/protocol.h>
#include <net/ipv6.h>
#include <net/udp.h>
......@@ -127,10 +128,42 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
out:
return segs;
}
/* GRO receive callback for UDP over IPv6.
 *
 * Fetches the UDP header and, unless the packet is already marked for
 * flushing, validates its checksum against the IPv6 pseudo header (a zero
 * checksum field is accepted without validation).  If the header cannot
 * be obtained or validation fails, the packet is marked for flushing and
 * NULL is returned; otherwise processing continues in udp_gro_receive().
 */
static struct sk_buff **udp6_gro_receive(struct sk_buff **head,
					 struct sk_buff *skb)
{
	struct udphdr *uh = udp_gro_udphdr(skb);

	if (unlikely(!uh))
		goto flush;

	/* Don't bother verifying checksum if we're going to flush anyway. */
	if (!NAPI_GRO_CB(skb)->flush &&
	    skb_gro_checksum_validate_zero_check(skb, IPPROTO_UDP, uh->check,
						 ip6_gro_compute_pseudo))
		goto flush;

	return udp_gro_receive(head, skb, uh);

flush:
	NAPI_GRO_CB(skb)->flush = 1;
	return NULL;
}
/* GRO complete callback for UDP over IPv6.
 *
 * @skb:   merged packet
 * @nhoff: offset of the UDP header from skb->data
 *
 * When the UDP header carries a checksum (non-zero check field), the
 * field is rewritten with the complement of udp_v6_check() computed over
 * the length from @nhoff to the end of the skb, the IPv6 addresses and a
 * zero base sum.  The common udp_gro_complete() then finishes the job and
 * its return value is passed through.
 */
int udp6_gro_complete(struct sk_buff *skb, int nhoff)
{
	const struct ipv6hdr *ip6h = ipv6_hdr(skb);
	struct udphdr *udph = (struct udphdr *)(skb->data + nhoff);

	if (udph->check) {
		int udp_len = skb->len - nhoff;

		udph->check = ~udp_v6_check(udp_len, &ip6h->saddr,
					    &ip6h->daddr, 0);
	}

	return udp_gro_complete(skb, nhoff);
}
/* Offload callback table for UDP over IPv6: the UFO send-check and
 * segmentation handlers on the GSO side, plus the IPv6-specific GRO
 * receive/complete handlers defined above.
 */
static const struct net_offload udpv6_offload = {
.callbacks = {
.gso_send_check = udp6_ufo_send_check,
.gso_segment = udp6_ufo_fragment,
.gro_receive = udp6_gro_receive,
.gro_complete = udp6_gro_complete,
},
};
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment