Commit 81fea579 authored by David S. Miller

Merge branch 'vxlan-xmit-improvements'

Pravin B Shelar says:

====================
vxlan: xmit improvements.

The following patch series improves the vxlan fast path, removes
duplicate code, and simplifies the vxlan xmit code path.

v2-v3:
Removed unrelated warning fix from patch 2.
Rearranged error handling in patch 3.
Fixed stats updates in the vxlan route lookup in patch 4.

v1-v2:
Fixed a compilation error when IPv6 support is not enabled.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents c915fe13 9efdb92d
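For orientation, the sketch below models the consolidated error accounting this series introduces at the single tx_error label in vxlan_xmit_one(): the route lookup and header build now propagate a plain errno, and the caller maps -ELOOP to collisions and -ENETUNREACH to tx_carrier_errors before bumping the generic tx_errors counter. This is only an illustrative, user-space sketch; struct tx_stats and account_tx_error() are made-up stand-ins, not code from the patches.

#include <errno.h>
#include <stdio.h>

/* Illustrative stand-in for the tx-related fields of struct net_device_stats. */
struct tx_stats {
	unsigned long collisions;
	unsigned long tx_carrier_errors;
	unsigned long tx_errors;
};

/* Mirrors the unified tx_error path: the errno chosen by the route lookup
 * (or by vxlan_build_skb()) decides which specific counter is bumped, and
 * every failure is also counted as a generic tx_error.
 */
static void account_tx_error(struct tx_stats *stats, int err)
{
	if (err == -ELOOP)		/* circular route back to the vxlan device */
		stats->collisions++;
	else if (err == -ENETUNREACH)	/* no route to the remote endpoint */
		stats->tx_carrier_errors++;
	stats->tx_errors++;
}

int main(void)
{
	struct tx_stats stats = { 0, 0, 0 };

	account_tx_error(&stats, -ELOOP);
	account_tx_error(&stats, -ENETUNREACH);
	account_tx_error(&stats, -EIO);	/* e.g. missing socket: only tx_errors */

	printf("collisions=%lu carrier=%lu errors=%lu\n",
	       stats.collisions, stats.tx_carrier_errors, stats.tx_errors);
	return 0;
}

In the driver itself these counters live in dev->stats and the skb is freed with kfree_skb() after the accounting, as the hunk around the tx_error label below shows.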
@@ -1750,21 +1750,16 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
}
min_headroom = LL_RESERVED_SPACE(dst->dev) + dst->header_len
+ VXLAN_HLEN + iphdr_len
+ (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
+ VXLAN_HLEN + iphdr_len;
/* Need space for new headers (invalidates iph ptr) */
err = skb_cow_head(skb, min_headroom);
if (unlikely(err))
goto out_free;
skb = vlan_hwaccel_push_inside(skb);
if (WARN_ON(!skb))
return -ENOMEM;
return err;
err = iptunnel_handle_offloads(skb, type);
if (err)
goto out_free;
return err;
vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
vxh->vx_flags = VXLAN_HF_VNI;
@@ -1788,19 +1783,16 @@ static int vxlan_build_skb(struct sk_buff *skb, struct dst_entry *dst,
if (vxflags & VXLAN_F_GPE) {
err = vxlan_build_gpe_hdr(vxh, vxflags, skb->protocol);
if (err < 0)
goto out_free;
return err;
inner_protocol = skb->protocol;
}
skb_set_inner_protocol(skb, inner_protocol);
return 0;
out_free:
kfree_skb(skb);
return err;
}
static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan, struct net_device *dev,
struct vxlan_sock *sock4,
struct sk_buff *skb, int oif, u8 tos,
__be32 daddr, __be32 *saddr,
struct dst_cache *dst_cache,
@@ -1810,6 +1802,9 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
struct rtable *rt = NULL;
struct flowi4 fl4;
if (!sock4)
return ERR_PTR(-EIO);
if (tos && !info)
use_cache = false;
if (use_cache) {
@@ -1827,16 +1822,27 @@ static struct rtable *vxlan_get_route(struct vxlan_dev *vxlan,
fl4.saddr = *saddr;
rt = ip_route_output_key(vxlan->net, &fl4);
if (!IS_ERR(rt)) {
if (likely(!IS_ERR(rt))) {
if (rt->dst.dev == dev) {
netdev_dbg(dev, "circular route to %pI4\n", &daddr);
ip_rt_put(rt);
return ERR_PTR(-ELOOP);
}
*saddr = fl4.saddr;
if (use_cache)
dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
} else {
netdev_dbg(dev, "no route to %pI4\n", &daddr);
return ERR_PTR(-ENETUNREACH);
}
return rt;
}
#if IS_ENABLED(CONFIG_IPV6)
static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
struct net_device *dev,
struct vxlan_sock *sock6,
struct sk_buff *skb, int oif, u8 tos,
__be32 label,
const struct in6_addr *daddr,
@@ -1844,7 +1850,6 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
struct dst_cache *dst_cache,
const struct ip_tunnel_info *info)
{
struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
bool use_cache = ip_tunnel_dst_cache_usable(skb, info);
struct dst_entry *ndst;
struct flowi6 fl6;
@@ -1872,8 +1877,16 @@ static struct dst_entry *vxlan6_get_route(struct vxlan_dev *vxlan,
err = ipv6_stub->ipv6_dst_lookup(vxlan->net,
sock6->sock->sk,
&ndst, &fl6);
if (err < 0)
return ERR_PTR(err);
if (unlikely(err < 0)) {
netdev_dbg(dev, "no route to %pI6\n", daddr);
return ERR_PTR(-ENETUNREACH);
}
if (unlikely(ndst->dev == dev)) {
netdev_dbg(dev, "circular route to %pI6\n", daddr);
dst_release(ndst);
return ERR_PTR(-ELOOP);
}
*saddr = fl6.saddr;
if (use_cache)
@@ -1927,23 +1940,55 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
}
}
static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
struct vxlan_dev *vxlan, union vxlan_addr *daddr,
__be32 dst_port, __be32 vni, struct dst_entry *dst,
u32 rt_flags)
{
#if IS_ENABLED(CONFIG_IPV6)
/* IPv6 rt-flags are checked against RTF_LOCAL, but the value of
* RTF_LOCAL is equal to RTCF_LOCAL. So to keep code simple
* we can use RTCF_LOCAL, which works for both IPv4 and IPv6 route entries.
*/
BUILD_BUG_ON(RTCF_LOCAL != RTF_LOCAL);
#endif
/* Bypass encapsulation if the destination is local */
if (rt_flags & RTCF_LOCAL &&
!(rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
struct vxlan_dev *dst_vxlan;
dst_release(dst);
dst_vxlan = vxlan_find_vni(vxlan->net, vni,
daddr->sa.sa_family, dst_port,
vxlan->flags);
if (!dst_vxlan) {
dev->stats.tx_errors++;
kfree_skb(skb);
return -ENOENT;
}
vxlan_encap_bypass(skb, vxlan, dst_vxlan);
return 1;
}
return 0;
}
static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
struct vxlan_rdst *rdst, bool did_rsc)
{
struct dst_cache *dst_cache;
struct ip_tunnel_info *info;
struct vxlan_dev *vxlan = netdev_priv(dev);
struct sock *sk;
struct rtable *rt = NULL;
const struct iphdr *old_iph;
const struct iphdr *old_iph = ip_hdr(skb);
union vxlan_addr *dst;
union vxlan_addr remote_ip, local_ip;
union vxlan_addr *src;
struct vxlan_metadata _md;
struct vxlan_metadata *md = &_md;
__be16 src_port = 0, dst_port;
struct dst_entry *ndst = NULL;
__be32 vni, label;
__be16 df = 0;
__u8 tos, ttl;
int err;
u32 flags = vxlan->flags;
@@ -1953,19 +1998,40 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
info = skb_tunnel_info(skb);
if (rdst) {
dst = &rdst->remote_ip;
if (vxlan_addr_any(dst)) {
if (did_rsc) {
/* short-circuited back to local bridge */
vxlan_encap_bypass(skb, vxlan, vxlan);
return;
}
goto drop;
}
dst_port = rdst->remote_port ? rdst->remote_port : vxlan->cfg.dst_port;
vni = rdst->remote_vni;
dst = &rdst->remote_ip;
src = &vxlan->cfg.saddr;
dst_cache = &rdst->dst_cache;
md->gbp = skb->mark;
ttl = vxlan->cfg.ttl;
if (!ttl && vxlan_addr_multicast(dst))
ttl = 1;
tos = vxlan->cfg.tos;
if (tos == 1)
tos = ip_tunnel_get_dsfield(old_iph, skb);
if (dst->sa.sa_family == AF_INET)
udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
else
udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
label = vxlan->cfg.label;
} else {
if (!info) {
WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
dev->name);
goto drop;
}
dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
vni = tunnel_id_to_key32(info->key.tun_id);
remote_ip.sa.sa_family = ip_tunnel_info_af(info);
if (remote_ip.sa.sa_family == AF_INET) {
remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
@@ -1975,182 +2041,108 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
local_ip.sin6.sin6_addr = info->key.u.ipv6.src;
}
dst = &remote_ip;
dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
vni = tunnel_id_to_key32(info->key.tun_id);
src = &local_ip;
dst_cache = &info->dst_cache;
}
if (vxlan_addr_any(dst)) {
if (did_rsc) {
/* short-circuited back to local bridge */
vxlan_encap_bypass(skb, vxlan, vxlan);
return;
}
goto drop;
}
old_iph = ip_hdr(skb);
ttl = vxlan->cfg.ttl;
if (!ttl && vxlan_addr_multicast(dst))
ttl = 1;
tos = vxlan->cfg.tos;
if (tos == 1)
tos = ip_tunnel_get_dsfield(old_iph, skb);
label = vxlan->cfg.label;
src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
vxlan->cfg.port_max, true);
if (info) {
if (info->options_len)
md = ip_tunnel_info_opts(info);
ttl = info->key.ttl;
tos = info->key.tos;
label = info->key.label;
udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
if (info->options_len)
md = ip_tunnel_info_opts(info);
} else {
md->gbp = skb->mark;
}
src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
vxlan->cfg.port_max, true);
if (dst->sa.sa_family == AF_INET) {
struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
struct rtable *rt;
__be16 df = 0;
if (!sock4)
goto drop;
sk = sock4->sock->sk;
rt = vxlan_get_route(vxlan, skb,
rt = vxlan_get_route(vxlan, dev, sock4, skb,
rdst ? rdst->remote_ifindex : 0, tos,
dst->sin.sin_addr.s_addr,
&src->sin.sin_addr.s_addr,
dst_cache, info);
if (IS_ERR(rt)) {
netdev_dbg(dev, "no route to %pI4\n",
&dst->sin.sin_addr.s_addr);
dev->stats.tx_carrier_errors++;
if (IS_ERR(rt))
goto tx_error;
}
if (rt->dst.dev == dev) {
netdev_dbg(dev, "circular route to %pI4\n",
&dst->sin.sin_addr.s_addr);
dev->stats.collisions++;
goto rt_tx_error;
}
/* Bypass encapsulation if the destination is local */
if (!info && rt->rt_flags & RTCF_LOCAL &&
!(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
struct vxlan_dev *dst_vxlan;
ip_rt_put(rt);
dst_vxlan = vxlan_find_vni(vxlan->net, vni,
dst->sa.sa_family, dst_port,
vxlan->flags);
if (!dst_vxlan)
goto tx_error;
vxlan_encap_bypass(skb, vxlan, dst_vxlan);
return;
}
if (!info)
udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT)
if (!info) {
err = encap_bypass_if_local(skb, dev, vxlan, dst,
dst_port, vni, &rt->dst,
rt->rt_flags);
if (err)
return;
} else if (info->key.tun_flags & TUNNEL_DONT_FRAGMENT) {
df = htons(IP_DF);
}
ndst = &rt->dst;
tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
err = vxlan_build_skb(skb, &rt->dst, sizeof(struct iphdr),
err = vxlan_build_skb(skb, ndst, sizeof(struct iphdr),
vni, md, flags, udp_sum);
if (err < 0)
goto xmit_tx_error;
goto tx_error;
udp_tunnel_xmit_skb(rt, sk, skb, src->sin.sin_addr.s_addr,
udp_tunnel_xmit_skb(rt, sock4->sock->sk, skb, src->sin.sin_addr.s_addr,
dst->sin.sin_addr.s_addr, tos, ttl, df,
src_port, dst_port, xnet, !udp_sum);
#if IS_ENABLED(CONFIG_IPV6)
} else {
struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
struct dst_entry *ndst;
u32 rt6i_flags;
if (!sock6)
goto drop;
sk = sock6->sock->sk;
ndst = vxlan6_get_route(vxlan, skb,
ndst = vxlan6_get_route(vxlan, dev, sock6, skb,
rdst ? rdst->remote_ifindex : 0, tos,
label, &dst->sin6.sin6_addr,
&src->sin6.sin6_addr,
dst_cache, info);
if (IS_ERR(ndst)) {
netdev_dbg(dev, "no route to %pI6\n",
&dst->sin6.sin6_addr);
dev->stats.tx_carrier_errors++;
ndst = NULL;
goto tx_error;
}
if (ndst->dev == dev) {
netdev_dbg(dev, "circular route to %pI6\n",
&dst->sin6.sin6_addr);
dst_release(ndst);
dev->stats.collisions++;
goto tx_error;
}
if (!info) {
u32 rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
/* Bypass encapsulation if the destination is local */
rt6i_flags = ((struct rt6_info *)ndst)->rt6i_flags;
if (!info && rt6i_flags & RTF_LOCAL &&
!(rt6i_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
struct vxlan_dev *dst_vxlan;
dst_release(ndst);
dst_vxlan = vxlan_find_vni(vxlan->net, vni,
dst->sa.sa_family, dst_port,
vxlan->flags);
if (!dst_vxlan)
goto tx_error;
vxlan_encap_bypass(skb, vxlan, dst_vxlan);
return;
err = encap_bypass_if_local(skb, dev, vxlan, dst,
dst_port, vni, ndst,
rt6i_flags);
if (err)
return;
}
if (!info)
udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
ttl = ttl ? : ip6_dst_hoplimit(ndst);
skb_scrub_packet(skb, xnet);
err = vxlan_build_skb(skb, ndst, sizeof(struct ipv6hdr),
vni, md, flags, udp_sum);
if (err < 0) {
dst_release(ndst);
dev->stats.tx_errors++;
return;
}
udp_tunnel6_xmit_skb(ndst, sk, skb, dev,
if (err < 0)
goto tx_error;
udp_tunnel6_xmit_skb(ndst, sock6->sock->sk, skb, dev,
&src->sin6.sin6_addr,
&dst->sin6.sin6_addr, tos, ttl,
label, src_port, dst_port, !udp_sum);
#endif
}
return;
drop:
dev->stats.tx_dropped++;
goto tx_free;
dev_kfree_skb(skb);
return;
xmit_tx_error:
/* skb is already freed. */
skb = NULL;
rt_tx_error:
ip_rt_put(rt);
tx_error:
if (err == -ELOOP)
dev->stats.collisions++;
else if (err == -ENETUNREACH)
dev->stats.tx_carrier_errors++;
dst_release(ndst);
dev->stats.tx_errors++;
tx_free:
dev_kfree_skb(skb);
kfree_skb(skb);
}
/* Transmit local packets over Vxlan
@@ -2429,9 +2421,7 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
struct vxlan_sock *sock4 = rcu_dereference(vxlan->vn4_sock);
struct rtable *rt;
if (!sock4)
return -EINVAL;
rt = vxlan_get_route(vxlan, skb, 0, info->key.tos,
rt = vxlan_get_route(vxlan, dev, sock4, skb, 0, info->key.tos,
info->key.u.ipv4.dst,
&info->key.u.ipv4.src, NULL, info);
if (IS_ERR(rt))
@@ -2439,9 +2429,10 @@ static int vxlan_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
ip_rt_put(rt);
} else {
#if IS_ENABLED(CONFIG_IPV6)
struct vxlan_sock *sock6 = rcu_dereference(vxlan->vn6_sock);
struct dst_entry *ndst;
ndst = vxlan6_get_route(vxlan, skb, 0, info->key.tos,
ndst = vxlan6_get_route(vxlan, dev, sock6, skb, 0, info->key.tos,
info->key.label, &info->key.u.ipv6.dst,
&info->key.u.ipv6.src, NULL, info);
if (IS_ERR(ndst))
@@ -2529,10 +2520,8 @@ static void vxlan_setup(struct net_device *dev)
dev->features |= NETIF_F_GSO_SOFTWARE;
dev->vlan_features = dev->features;
dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
dev->hw_features |= NETIF_F_GSO_SOFTWARE;
dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
netif_keep_dst(dev);
dev->priv_flags |= IFF_NO_QUEUE;
@@ -399,22 +399,6 @@ static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb)
skb->vlan_tci = 0;
return skb;
}
/*
* vlan_hwaccel_push_inside - pushes vlan tag to the payload
* @skb: skbuff to tag
*
* Checks if a tag is present in @skb->vlan_tci and, if it is, pushes the
* VLAN tag from @skb->vlan_tci into the payload.
*
* Following the skb_unshare() example, in case of error, the calling function
* doesn't have to worry about freeing the original skb.
*/
static inline struct sk_buff *vlan_hwaccel_push_inside(struct sk_buff *skb)
{
if (skb_vlan_tag_present(skb))
skb = __vlan_hwaccel_push_inside(skb);
return skb;
}
/**
* __vlan_hwaccel_put_tag - hardware accelerated VLAN inserting
@@ -281,16 +281,6 @@ struct vxlan_dev {
struct net_device *vxlan_dev_create(struct net *net, const char *name,
u8 name_assign_type, struct vxlan_config *conf);
static inline __be16 vxlan_dev_dst_port(struct vxlan_dev *vxlan,
unsigned short family)
{
#if IS_ENABLED(CONFIG_IPV6)
if (family == AF_INET6)
return inet_sk(vxlan->vn6_sock->sock->sk)->inet_sport;
#endif
return inet_sk(vxlan->vn4_sock->sock->sk)->inet_sport;
}
static inline netdev_features_t vxlan_features_check(struct sk_buff *skb,
netdev_features_t features)
{