Commit 7ba9d103 authored by David S. Miller

Merge branch 'mpls-fragmentation-and-gso-fixes'

David Ahern says:

====================
net: mpls: fragmentation and gso fixes for locally originated traffic

This series fixes mtu and fragmentation for tunnels using lwtunnel
output redirect, and fixes GSO for MPLS for locally originated traffic
reported by Lennert Buytenhek.

A follow-on series will address fragmentation and GSO for forwarded
MPLS traffic. Hardware offload of GSO with MPLS also needs to be
addressed.

Simon: Can you verify this works with OVS for single and multiple
       labels?

v4
- more updates to mpls_gso_segment per Alex's comments (thanks, Alex)
- updates to teaching OVS about marking MPLS labels as the network header

v3
- updates to mpls_gso_segment per Alex's comments
- dropped skb->encapsulation = 1 from mpls_xmit per Alex's comment

v2
- consistent use of network_header in skb to fix GSO for MPLS
- update MPLS code in OVS to network_header and inner_network_header
====================
Tested-by: Simon Horman <simon.horman@netronome.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 41852497 607fca9a
...@@ -340,6 +340,7 @@ static void veth_setup(struct net_device *dev) ...@@ -340,6 +340,7 @@ static void veth_setup(struct net_device *dev)
dev->hw_features = VETH_FEATURES; dev->hw_features = VETH_FEATURES;
dev->hw_enc_features = VETH_FEATURES; dev->hw_enc_features = VETH_FEATURES;
dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE;
} }
/* /*
......
...@@ -13,6 +13,13 @@ ...@@ -13,6 +13,13 @@
/* lw tunnel state flags */ /* lw tunnel state flags */
#define LWTUNNEL_STATE_OUTPUT_REDIRECT BIT(0) #define LWTUNNEL_STATE_OUTPUT_REDIRECT BIT(0)
#define LWTUNNEL_STATE_INPUT_REDIRECT BIT(1) #define LWTUNNEL_STATE_INPUT_REDIRECT BIT(1)
#define LWTUNNEL_STATE_XMIT_REDIRECT BIT(2)
/* Result codes handed back by lwtunnel_xmit() to the IP output path. */
enum {
	/* The caller stops processing and returns this value. */
	LWTUNNEL_XMIT_DONE = 0,
	/* The caller carries on with its regular output path. */
	LWTUNNEL_XMIT_CONTINUE = 1,
};
struct lwtunnel_state { struct lwtunnel_state {
__u16 type; __u16 type;
...@@ -21,6 +28,7 @@ struct lwtunnel_state { ...@@ -21,6 +28,7 @@ struct lwtunnel_state {
int (*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb); int (*orig_output)(struct net *net, struct sock *sk, struct sk_buff *skb);
int (*orig_input)(struct sk_buff *); int (*orig_input)(struct sk_buff *);
int len; int len;
__u16 headroom;
__u8 data[0]; __u8 data[0];
}; };
...@@ -34,6 +42,7 @@ struct lwtunnel_encap_ops { ...@@ -34,6 +42,7 @@ struct lwtunnel_encap_ops {
struct lwtunnel_state *lwtstate); struct lwtunnel_state *lwtstate);
int (*get_encap_size)(struct lwtunnel_state *lwtstate); int (*get_encap_size)(struct lwtunnel_state *lwtstate);
int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b); int (*cmp_encap)(struct lwtunnel_state *a, struct lwtunnel_state *b);
int (*xmit)(struct sk_buff *skb);
}; };
#ifdef CONFIG_LWTUNNEL #ifdef CONFIG_LWTUNNEL
...@@ -75,6 +84,24 @@ static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) ...@@ -75,6 +84,24 @@ static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate)
return false; return false;
} }
/*
 * Tell whether @lwtstate asks for packets to be redirected at xmit time.
 *
 * Returns true only when @lwtstate is non-NULL and has
 * LWTUNNEL_STATE_XMIT_REDIRECT set in its flags; false otherwise.
 */
static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate)
{
	return lwtstate && (lwtstate->flags & LWTUNNEL_STATE_XMIT_REDIRECT);
}
/*
 * Number of bytes to subtract from @mtu to leave room for the tunnel
 * encapsulation described by @lwtstate.
 *
 * Returns lwtstate->headroom when the state requests xmit redirection and
 * the headroom is strictly smaller than @mtu; returns 0 in every other
 * case (including a NULL @lwtstate).
 */
static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate,
					     unsigned int mtu)
{
	if (!lwtunnel_xmit_redirect(lwtstate))
		return 0;

	/* Never report a headroom that would consume the whole mtu. */
	return lwtstate->headroom < mtu ? lwtstate->headroom : 0;
}
int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
unsigned int num); unsigned int num);
int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op,
...@@ -90,6 +117,7 @@ struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len); ...@@ -90,6 +117,7 @@ struct lwtunnel_state *lwtunnel_state_alloc(int hdr_len);
int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b); int lwtunnel_cmp_encap(struct lwtunnel_state *a, struct lwtunnel_state *b);
int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb); int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb);
int lwtunnel_input(struct sk_buff *skb); int lwtunnel_input(struct sk_buff *skb);
int lwtunnel_xmit(struct sk_buff *skb);
#else #else
...@@ -117,6 +145,17 @@ static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate) ...@@ -117,6 +145,17 @@ static inline bool lwtunnel_input_redirect(struct lwtunnel_state *lwtstate)
return false; return false;
} }
/*
 * Fallback variant that ignores @lwtstate and always reports that no
 * xmit redirection is requested.
 * NOTE(review): sits after the "#else" of the CONFIG_LWTUNNEL block, so this
 * is presumably the stub used when CONFIG_LWTUNNEL is disabled - confirm.
 */
static inline bool lwtunnel_xmit_redirect(struct lwtunnel_state *lwtstate)
{
return false;
}
/*
 * Fallback variant that ignores both arguments and always reports zero
 * bytes of tunnel headroom, so callers subtracting it from an mtu get the
 * mtu back unchanged.
 * NOTE(review): sits after the "#else" of the CONFIG_LWTUNNEL block, so this
 * is presumably the stub used when CONFIG_LWTUNNEL is disabled - confirm.
 */
static inline unsigned int lwtunnel_headroom(struct lwtunnel_state *lwtstate,
unsigned int mtu)
{
return 0;
}
static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, static inline int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op,
unsigned int num) unsigned int num)
{ {
...@@ -170,6 +209,11 @@ static inline int lwtunnel_input(struct sk_buff *skb) ...@@ -170,6 +209,11 @@ static inline int lwtunnel_input(struct sk_buff *skb)
return -EOPNOTSUPP; return -EOPNOTSUPP;
} }
/*
 * Fallback variant: always fails with -EOPNOTSUPP and does not touch @skb.
 * NOTE(review): placed just before "#endif CONFIG_LWTUNNEL", so this is
 * presumably the stub used when CONFIG_LWTUNNEL is disabled - confirm.
 */
static inline int lwtunnel_xmit(struct sk_buff *skb)
{
return -EOPNOTSUPP;
}
#endif /* CONFIG_LWTUNNEL */ #endif /* CONFIG_LWTUNNEL */
#define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" __stringify(encap_type)) #define MODULE_ALIAS_RTNL_LWT(encap_type) MODULE_ALIAS("rtnl-lwt-" __stringify(encap_type))
......
...@@ -251,6 +251,41 @@ int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) ...@@ -251,6 +251,41 @@ int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb)
} }
EXPORT_SYMBOL(lwtunnel_output); EXPORT_SYMBOL(lwtunnel_output);
/*
 * Hand @skb to the xmit handler of the lightweight tunnel attached to its
 * dst entry.
 *
 * Returns:
 *   -EINVAL      if @skb has no dst; @skb is freed.
 *   0            if the tunnel type is LWTUNNEL_ENCAP_NONE or greater than
 *                LWTUNNEL_ENCAP_MAX; @skb is left untouched.
 *   -EOPNOTSUPP  if no encap ops or no ->xmit handler is registered for the
 *                type (or the handler itself returned that value); @skb is
 *                freed here.
 *   otherwise    whatever the registered ->xmit handler returned.
 */
int lwtunnel_xmit(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	const struct lwtunnel_encap_ops *encap_ops;
	struct lwtunnel_state *state;
	int err;

	if (!dst) {
		kfree_skb(skb);
		return -EINVAL;
	}

	state = dst->lwtstate;
	if (state->type == LWTUNNEL_ENCAP_NONE ||
	    state->type > LWTUNNEL_ENCAP_MAX)
		return 0;

	/* Stays -EOPNOTSUPP unless a registered handler overrides it. */
	err = -EOPNOTSUPP;

	/* The ops table entry is looked up and used under the RCU read lock. */
	rcu_read_lock();
	encap_ops = rcu_dereference(lwtun_encaps[state->type]);
	if (likely(encap_ops && encap_ops->xmit))
		err = encap_ops->xmit(skb);
	rcu_read_unlock();

	if (err == -EOPNOTSUPP)
		kfree_skb(skb);

	return err;
}
EXPORT_SYMBOL(lwtunnel_xmit);
int lwtunnel_input(struct sk_buff *skb) int lwtunnel_input(struct sk_buff *skb)
{ {
struct dst_entry *dst = skb_dst(skb); struct dst_entry *dst = skb_dst(skb);
......
...@@ -73,6 +73,7 @@ ...@@ -73,6 +73,7 @@
#include <net/icmp.h> #include <net/icmp.h>
#include <net/checksum.h> #include <net/checksum.h>
#include <net/inetpeer.h> #include <net/inetpeer.h>
#include <net/lwtunnel.h>
#include <linux/igmp.h> #include <linux/igmp.h>
#include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h> #include <linux/netfilter_bridge.h>
...@@ -197,6 +198,13 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s ...@@ -197,6 +198,13 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *s
skb = skb2; skb = skb2;
} }
if (lwtunnel_xmit_redirect(dst->lwtstate)) {
int res = lwtunnel_xmit(skb);
if (res < 0 || res == LWTUNNEL_XMIT_DONE)
return res;
}
rcu_read_lock_bh(); rcu_read_lock_bh();
nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr); nexthop = (__force u32) rt_nexthop(rt, ip_hdr(skb)->daddr);
neigh = __ipv4_neigh_lookup_noref(dev, nexthop); neigh = __ipv4_neigh_lookup_noref(dev, nexthop);
......
...@@ -1246,7 +1246,9 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) ...@@ -1246,7 +1246,9 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
mtu = 576; mtu = 576;
} }
return min_t(unsigned int, mtu, IP_MAX_MTU); mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
} }
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
......
...@@ -56,6 +56,7 @@ ...@@ -56,6 +56,7 @@
#include <net/checksum.h> #include <net/checksum.h>
#include <linux/mroute6.h> #include <linux/mroute6.h>
#include <net/l3mdev.h> #include <net/l3mdev.h>
#include <net/lwtunnel.h>
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{ {
...@@ -104,6 +105,13 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * ...@@ -104,6 +105,13 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
} }
} }
if (lwtunnel_xmit_redirect(dst->lwtstate)) {
int res = lwtunnel_xmit(skb);
if (res < 0 || res == LWTUNNEL_XMIT_DONE)
return res;
}
rcu_read_lock_bh(); rcu_read_lock_bh();
nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
......
...@@ -1604,7 +1604,9 @@ static unsigned int ip6_mtu(const struct dst_entry *dst) ...@@ -1604,7 +1604,9 @@ static unsigned int ip6_mtu(const struct dst_entry *dst)
rcu_read_unlock(); rcu_read_unlock();
out: out:
return min_t(unsigned int, mtu, IP6_MAX_MTU); mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
} }
static struct dst_entry *icmp6_dst_gc_list; static struct dst_entry *icmp6_dst_gc_list;
......
...@@ -23,32 +23,50 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb, ...@@ -23,32 +23,50 @@ static struct sk_buff *mpls_gso_segment(struct sk_buff *skb,
netdev_features_t features) netdev_features_t features)
{ {
struct sk_buff *segs = ERR_PTR(-EINVAL); struct sk_buff *segs = ERR_PTR(-EINVAL);
u16 mac_offset = skb->mac_header;
netdev_features_t mpls_features; netdev_features_t mpls_features;
u16 mac_len = skb->mac_len;
__be16 mpls_protocol; __be16 mpls_protocol;
unsigned int mpls_hlen;
skb_reset_network_header(skb);
mpls_hlen = skb_inner_network_header(skb) - skb_network_header(skb);
if (unlikely(!pskb_may_pull(skb, mpls_hlen)))
goto out;
/* Setup inner SKB. */ /* Setup inner SKB. */
mpls_protocol = skb->protocol; mpls_protocol = skb->protocol;
skb->protocol = skb->inner_protocol; skb->protocol = skb->inner_protocol;
/* Push back the mac header that skb_mac_gso_segment() has pulled. __skb_pull(skb, mpls_hlen);
* It will be re-pulled by the call to skb_mac_gso_segment() below
*/ skb->mac_len = 0;
__skb_push(skb, skb->mac_len); skb_reset_mac_header(skb);
/* Segment inner packet. */ /* Segment inner packet. */
mpls_features = skb->dev->mpls_features & features; mpls_features = skb->dev->mpls_features & features;
segs = skb_mac_gso_segment(skb, mpls_features); segs = skb_mac_gso_segment(skb, mpls_features);
if (IS_ERR_OR_NULL(segs)) {
skb_gso_error_unwind(skb, mpls_protocol, mpls_hlen, mac_offset,
mac_len);
goto out;
}
skb = segs;
mpls_hlen += mac_len;
do {
skb->mac_len = mac_len;
skb->protocol = mpls_protocol;
skb_reset_inner_network_header(skb);
/* Restore outer protocol. */ __skb_push(skb, mpls_hlen);
skb->protocol = mpls_protocol;
/* Re-pull the mac header that the call to skb_mac_gso_segment() skb_reset_mac_header(skb);
* above pulled. It will be re-pushed after returning skb_set_network_header(skb, mac_len);
* skb_mac_gso_segment(), an indirect caller of this function. } while ((skb = skb->next));
*/
__skb_pull(skb, skb->data - skb_mac_header(skb));
out:
return segs; return segs;
} }
......
...@@ -37,7 +37,7 @@ static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) ...@@ -37,7 +37,7 @@ static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en)
return en->labels * sizeof(struct mpls_shim_hdr); return en->labels * sizeof(struct mpls_shim_hdr);
} }
static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb) static int mpls_xmit(struct sk_buff *skb)
{ {
struct mpls_iptunnel_encap *tun_encap_info; struct mpls_iptunnel_encap *tun_encap_info;
struct mpls_shim_hdr *hdr; struct mpls_shim_hdr *hdr;
...@@ -90,7 +90,11 @@ static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb) ...@@ -90,7 +90,11 @@ static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb)
if (skb_cow(skb, hh_len + new_header_size)) if (skb_cow(skb, hh_len + new_header_size))
goto drop; goto drop;
skb_set_inner_protocol(skb, skb->protocol);
skb_reset_inner_network_header(skb);
skb_push(skb, new_header_size); skb_push(skb, new_header_size);
skb_reset_network_header(skb); skb_reset_network_header(skb);
skb->dev = out_dev; skb->dev = out_dev;
...@@ -115,7 +119,7 @@ static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb) ...@@ -115,7 +119,7 @@ static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb)
net_dbg_ratelimited("%s: packet transmission failed: %d\n", net_dbg_ratelimited("%s: packet transmission failed: %d\n",
__func__, err); __func__, err);
return 0; return LWTUNNEL_XMIT_DONE;
drop: drop:
kfree_skb(skb); kfree_skb(skb);
...@@ -153,7 +157,8 @@ static int mpls_build_state(struct net_device *dev, struct nlattr *nla, ...@@ -153,7 +157,8 @@ static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
if (ret) if (ret)
goto errout; goto errout;
newts->type = LWTUNNEL_ENCAP_MPLS; newts->type = LWTUNNEL_ENCAP_MPLS;
newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
newts->headroom = mpls_encap_size(tun_encap_info);
*ts = newts; *ts = newts;
...@@ -209,7 +214,7 @@ static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) ...@@ -209,7 +214,7 @@ static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
static const struct lwtunnel_encap_ops mpls_iptun_ops = { static const struct lwtunnel_encap_ops mpls_iptun_ops = {
.build_state = mpls_build_state, .build_state = mpls_build_state,
.output = mpls_output, .xmit = mpls_xmit,
.fill_encap = mpls_fill_encap_info, .fill_encap = mpls_fill_encap_info,
.get_encap_size = mpls_encap_nlsize, .get_encap_size = mpls_encap_nlsize,
.cmp_encap = mpls_encap_cmp, .cmp_encap = mpls_encap_cmp,
......
...@@ -162,10 +162,16 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, ...@@ -162,10 +162,16 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
if (skb_cow_head(skb, MPLS_HLEN) < 0) if (skb_cow_head(skb, MPLS_HLEN) < 0)
return -ENOMEM; return -ENOMEM;
if (!skb->inner_protocol) {
skb_set_inner_network_header(skb, skb->mac_len);
skb_set_inner_protocol(skb, skb->protocol);
}
skb_push(skb, MPLS_HLEN); skb_push(skb, MPLS_HLEN);
memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb), memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
skb->mac_len); skb->mac_len);
skb_reset_mac_header(skb); skb_reset_mac_header(skb);
skb_set_network_header(skb, skb->mac_len);
new_mpls_lse = (__be32 *)skb_mpls_header(skb); new_mpls_lse = (__be32 *)skb_mpls_header(skb);
*new_mpls_lse = mpls->mpls_lse; *new_mpls_lse = mpls->mpls_lse;
...@@ -173,8 +179,6 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key, ...@@ -173,8 +179,6 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN); skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype); update_ethertype(skb, eth_hdr(skb), mpls->mpls_ethertype);
if (!skb->inner_protocol)
skb_set_inner_protocol(skb, skb->protocol);
skb->protocol = mpls->mpls_ethertype; skb->protocol = mpls->mpls_ethertype;
invalidate_flow_key(key); invalidate_flow_key(key);
...@@ -198,6 +202,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key, ...@@ -198,6 +202,7 @@ static int pop_mpls(struct sk_buff *skb, struct sw_flow_key *key,
__skb_pull(skb, MPLS_HLEN); __skb_pull(skb, MPLS_HLEN);
skb_reset_mac_header(skb); skb_reset_mac_header(skb);
skb_set_network_header(skb, skb->mac_len);
/* skb_mpls_header() is used to locate the ethertype /* skb_mpls_header() is used to locate the ethertype
* field correctly in the presence of VLAN tags. * field correctly in the presence of VLAN tags.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment