Commit cdf3464e authored by Martin KaFai Lau's avatar Martin KaFai Lau Committed by David S. Miller

ipv6: Fix dst_entry refcnt bugs in ip6_tunnel

Problems in the current dst_entry cache in the ip6_tunnel:

1. ip6_tnl_dst_set is racy.  There is no lock to protect it:
   - One major problem is that the dst refcnt gets messed up. F.e.
     the same dst_cache can be released multiple times and then
     triggering the infamous dst refcnt < 0 warning message.
   - Another issue is the inconsistency between dst_cache and
     dst_cookie.

   It can be reproduced by adding and removing the ip6gre tunnel
   while running a super_netperf TCP_CRR test.

2. ip6_tnl_dst_get does not take the dst refcnt before returning
   the dst.

This patch:
1. Create a percpu dst_entry cache in ip6_tnl
2. Use a spinlock to protect the dst_cache operations
3. ip6_tnl_dst_get always takes the dst refcnt before returning
Signed-off-by: default avatarMartin KaFai Lau <kafai@fb.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent f230d1e8
...@@ -32,6 +32,12 @@ struct __ip6_tnl_parm { ...@@ -32,6 +32,12 @@ struct __ip6_tnl_parm {
__be32 o_key; __be32 o_key;
}; };
struct ip6_tnl_dst {
spinlock_t lock;
struct dst_entry *dst;
u32 cookie;
};
/* IPv6 tunnel */ /* IPv6 tunnel */
struct ip6_tnl { struct ip6_tnl {
struct ip6_tnl __rcu *next; /* next tunnel in list */ struct ip6_tnl __rcu *next; /* next tunnel in list */
...@@ -39,8 +45,7 @@ struct ip6_tnl { ...@@ -39,8 +45,7 @@ struct ip6_tnl {
struct net *net; /* netns for packet i/o */ struct net *net; /* netns for packet i/o */
struct __ip6_tnl_parm parms; /* tunnel configuration parameters */ struct __ip6_tnl_parm parms; /* tunnel configuration parameters */
struct flowi fl; /* flowi template for xmit */ struct flowi fl; /* flowi template for xmit */
struct dst_entry *dst_cache; /* cached dst */ struct ip6_tnl_dst __percpu *dst_cache; /* cached dst */
u32 dst_cookie;
int err_count; int err_count;
unsigned long err_time; unsigned long err_time;
...@@ -61,6 +66,8 @@ struct ipv6_tlv_tnl_enc_lim { ...@@ -61,6 +66,8 @@ struct ipv6_tlv_tnl_enc_lim {
} __packed; } __packed;
struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t); struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t);
int ip6_tnl_dst_init(struct ip6_tnl *t);
void ip6_tnl_dst_destroy(struct ip6_tnl *t);
void ip6_tnl_dst_reset(struct ip6_tnl *t); void ip6_tnl_dst_reset(struct ip6_tnl *t);
void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst); void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst);
int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr, int ip6_tnl_rcv_ctl(struct ip6_tnl *t, const struct in6_addr *laddr,
......
...@@ -637,17 +637,17 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, ...@@ -637,17 +637,17 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
dst = ip6_tnl_dst_get(tunnel); dst = ip6_tnl_dst_get(tunnel);
if (!dst) { if (!dst) {
ndst = ip6_route_output(net, NULL, fl6); dst = ip6_route_output(net, NULL, fl6);
if (ndst->error) if (dst->error)
goto tx_err_link_failure; goto tx_err_link_failure;
ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0);
if (IS_ERR(ndst)) { if (IS_ERR(dst)) {
err = PTR_ERR(ndst); err = PTR_ERR(dst);
ndst = NULL; dst = NULL;
goto tx_err_link_failure; goto tx_err_link_failure;
} }
dst = ndst; ndst = dst;
} }
tdev = dst->dev; tdev = dst->dev;
...@@ -702,12 +702,9 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, ...@@ -702,12 +702,9 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
skb = new_skb; skb = new_skb;
} }
if (fl6->flowi6_mark) { if (!fl6->flowi6_mark && ndst)
ip6_tnl_dst_set(tunnel, ndst);
skb_dst_set(skb, dst); skb_dst_set(skb, dst);
ndst = NULL;
} else {
skb_dst_set_noref(skb, dst);
}
proto = NEXTHDR_GRE; proto = NEXTHDR_GRE;
if (encap_limit >= 0) { if (encap_limit >= 0) {
...@@ -762,14 +759,12 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb, ...@@ -762,14 +759,12 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
skb_set_inner_protocol(skb, protocol); skb_set_inner_protocol(skb, protocol);
ip6tunnel_xmit(NULL, skb, dev); ip6tunnel_xmit(NULL, skb, dev);
if (ndst)
ip6_tnl_dst_set(tunnel, ndst);
return 0; return 0;
tx_err_link_failure: tx_err_link_failure:
stats->tx_carrier_errors++; stats->tx_carrier_errors++;
dst_link_failure(skb); dst_link_failure(skb);
tx_err_dst_release: tx_err_dst_release:
dst_release(ndst); dst_release(dst);
return err; return err;
} }
...@@ -1223,6 +1218,9 @@ static const struct net_device_ops ip6gre_netdev_ops = { ...@@ -1223,6 +1218,9 @@ static const struct net_device_ops ip6gre_netdev_ops = {
static void ip6gre_dev_free(struct net_device *dev) static void ip6gre_dev_free(struct net_device *dev)
{ {
struct ip6_tnl *t = netdev_priv(dev);
ip6_tnl_dst_destroy(t);
free_percpu(dev->tstats); free_percpu(dev->tstats);
free_netdev(dev); free_netdev(dev);
} }
...@@ -1248,6 +1246,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev) ...@@ -1248,6 +1246,7 @@ static void ip6gre_tunnel_setup(struct net_device *dev)
static int ip6gre_tunnel_init_common(struct net_device *dev) static int ip6gre_tunnel_init_common(struct net_device *dev)
{ {
struct ip6_tnl *tunnel; struct ip6_tnl *tunnel;
int ret;
tunnel = netdev_priv(dev); tunnel = netdev_priv(dev);
...@@ -1259,6 +1258,13 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) ...@@ -1259,6 +1258,13 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
if (!dev->tstats) if (!dev->tstats)
return -ENOMEM; return -ENOMEM;
ret = ip6_tnl_dst_init(tunnel);
if (ret) {
free_percpu(dev->tstats);
dev->tstats = NULL;
return ret;
}
return 0; return 0;
} }
......
...@@ -126,37 +126,90 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev) ...@@ -126,37 +126,90 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev)
* Locking : hash tables are protected by RCU and RTNL * Locking : hash tables are protected by RCU and RTNL
*/ */
struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t) static void __ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst,
struct dst_entry *dst)
{ {
struct dst_entry *dst = t->dst_cache; dst_release(idst->dst);
if (dst) {
if (dst && dst->obsolete && dst_hold(dst);
!dst->ops->check(dst, t->dst_cookie)) { idst->cookie = rt6_get_cookie((struct rt6_info *)dst);
t->dst_cache = NULL; } else {
dst_release(dst); idst->cookie = 0;
return NULL;
} }
idst->dst = dst;
}
static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst,
struct dst_entry *dst)
{
spin_lock_bh(&idst->lock);
__ip6_tnl_per_cpu_dst_set(idst, dst);
spin_unlock_bh(&idst->lock);
}
struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t)
{
struct ip6_tnl_dst *idst;
struct dst_entry *dst;
idst = raw_cpu_ptr(t->dst_cache);
spin_lock_bh(&idst->lock);
dst = idst->dst;
if (dst) {
if (!dst->obsolete || dst->ops->check(dst, idst->cookie)) {
dst_hold(idst->dst);
} else {
__ip6_tnl_per_cpu_dst_set(idst, NULL);
dst = NULL;
}
}
spin_unlock_bh(&idst->lock);
return dst; return dst;
} }
EXPORT_SYMBOL_GPL(ip6_tnl_dst_get); EXPORT_SYMBOL_GPL(ip6_tnl_dst_get);
void ip6_tnl_dst_reset(struct ip6_tnl *t) void ip6_tnl_dst_reset(struct ip6_tnl *t)
{ {
dst_release(t->dst_cache); int i;
t->dst_cache = NULL;
for_each_possible_cpu(i)
ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), NULL);
} }
EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset); EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset);
void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst) void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst)
{ {
struct rt6_info *rt = (struct rt6_info *) dst; ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst);
t->dst_cookie = rt6_get_cookie(rt);
dst_release(t->dst_cache);
t->dst_cache = dst;
} }
EXPORT_SYMBOL_GPL(ip6_tnl_dst_set); EXPORT_SYMBOL_GPL(ip6_tnl_dst_set);
void ip6_tnl_dst_destroy(struct ip6_tnl *t)
{
if (!t->dst_cache)
return;
ip6_tnl_dst_reset(t);
free_percpu(t->dst_cache);
}
EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy);
int ip6_tnl_dst_init(struct ip6_tnl *t)
{
int i;
t->dst_cache = alloc_percpu(struct ip6_tnl_dst);
if (!t->dst_cache)
return -ENOMEM;
for_each_possible_cpu(i)
spin_lock_init(&per_cpu_ptr(t->dst_cache, i)->lock);
return 0;
}
EXPORT_SYMBOL_GPL(ip6_tnl_dst_init);
/** /**
* ip6_tnl_lookup - fetch tunnel matching the end-point addresses * ip6_tnl_lookup - fetch tunnel matching the end-point addresses
* @remote: the address of the tunnel exit-point * @remote: the address of the tunnel exit-point
...@@ -271,6 +324,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) ...@@ -271,6 +324,9 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t)
static void ip6_dev_free(struct net_device *dev) static void ip6_dev_free(struct net_device *dev)
{ {
struct ip6_tnl *t = netdev_priv(dev);
ip6_tnl_dst_destroy(t);
free_percpu(dev->tstats); free_percpu(dev->tstats);
free_netdev(dev); free_netdev(dev);
} }
...@@ -1016,17 +1072,17 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, ...@@ -1016,17 +1072,17 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
goto tx_err_link_failure; goto tx_err_link_failure;
if (!dst) { if (!dst) {
ndst = ip6_route_output(net, NULL, fl6); dst = ip6_route_output(net, NULL, fl6);
if (ndst->error) if (dst->error)
goto tx_err_link_failure; goto tx_err_link_failure;
ndst = xfrm_lookup(net, ndst, flowi6_to_flowi(fl6), NULL, 0); dst = xfrm_lookup(net, dst, flowi6_to_flowi(fl6), NULL, 0);
if (IS_ERR(ndst)) { if (IS_ERR(dst)) {
err = PTR_ERR(ndst); err = PTR_ERR(dst);
ndst = NULL; dst = NULL;
goto tx_err_link_failure; goto tx_err_link_failure;
} }
dst = ndst; ndst = dst;
} }
tdev = dst->dev; tdev = dst->dev;
...@@ -1072,12 +1128,11 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, ...@@ -1072,12 +1128,11 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
consume_skb(skb); consume_skb(skb);
skb = new_skb; skb = new_skb;
} }
if (fl6->flowi6_mark) {
if (!fl6->flowi6_mark && ndst)
ip6_tnl_dst_set(t, ndst);
skb_dst_set(skb, dst); skb_dst_set(skb, dst);
ndst = NULL;
} else {
skb_dst_set_noref(skb, dst);
}
skb->transport_header = skb->network_header; skb->transport_header = skb->network_header;
proto = fl6->flowi6_proto; proto = fl6->flowi6_proto;
...@@ -1101,14 +1156,12 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, ...@@ -1101,14 +1156,12 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
ipv6h->saddr = fl6->saddr; ipv6h->saddr = fl6->saddr;
ipv6h->daddr = fl6->daddr; ipv6h->daddr = fl6->daddr;
ip6tunnel_xmit(NULL, skb, dev); ip6tunnel_xmit(NULL, skb, dev);
if (ndst)
ip6_tnl_dst_set(t, ndst);
return 0; return 0;
tx_err_link_failure: tx_err_link_failure:
stats->tx_carrier_errors++; stats->tx_carrier_errors++;
dst_link_failure(skb); dst_link_failure(skb);
tx_err_dst_release: tx_err_dst_release:
dst_release(ndst); dst_release(dst);
return err; return err;
} }
...@@ -1573,12 +1626,21 @@ static inline int ...@@ -1573,12 +1626,21 @@ static inline int
ip6_tnl_dev_init_gen(struct net_device *dev) ip6_tnl_dev_init_gen(struct net_device *dev)
{ {
struct ip6_tnl *t = netdev_priv(dev); struct ip6_tnl *t = netdev_priv(dev);
int ret;
t->dev = dev; t->dev = dev;
t->net = dev_net(dev); t->net = dev_net(dev);
dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!dev->tstats) if (!dev->tstats)
return -ENOMEM; return -ENOMEM;
ret = ip6_tnl_dst_init(t);
if (ret) {
free_percpu(dev->tstats);
dev->tstats = NULL;
return ret;
}
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment