Commit 94720e3a authored by Julian Anastasov, committed by David S. Miller

ipv4: fix fnhe usage by non-cached routes

Allow some non-cached routes to use non-expired fnhe:

1. ip_del_fnhe: moved above and now called by find_exception.
The 4.5+ commit deed49df expires fnhe only when caching
routes. Change that to:

1.1. use fnhe for non-cached local output routes, with the help
from (2)

1.2. allow __mkroute_input to detect expired fnhe (outdated
fnhe_gw, for example) when do_cache is false, eg. when itag!=0
for unicast destinations.

2. __mkroute_output: keep fi to allow local routes with orig_oif != 0
to use fnhe info even when the new route will not be cached into fnhe.
After commit 839da4d9 ("net: ipv4: set orig_oif based on fib
result for local traffic") it means all local routes will be affected
because they are not cached. This change is used to solve a PMTU
problem with IPVS (and probably Netfilter DNAT) setups that redirect
local clients from target local IP (local route to Virtual IP)
to new remote IP target, eg. IPVS TUN real server. Loopback has
64K MTU and we need to create fnhe on the local route that will
keep the reduced PMTU for the Virtual IP. Without this change
fnhe_pmtu is updated from ICMP but never exposed to non-cached
local routes. (This includes routes with flowi4_oif!=0 for 4.6+ and
with flowi4_oif=any for 4.14+.)

3. update_or_create_fnhe: make sure fnhe_expires is not 0 for
new entries

Fixes: 839da4d9 ("net: ipv4: set orig_oif based on fib result for local traffic")
Fixes: d6d5e999 ("route: do not cache fib route info on local routes with oif")
Fixes: deed49df ("route: check and remove route cache when we get route")
Cc: David Ahern <dsahern@gmail.com>
Cc: Xin Long <lucien.xin@gmail.com>
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Acked-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent e002434e
...@@ -709,7 +709,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw, ...@@ -709,7 +709,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
fnhe->fnhe_gw = gw; fnhe->fnhe_gw = gw;
fnhe->fnhe_pmtu = pmtu; fnhe->fnhe_pmtu = pmtu;
fnhe->fnhe_mtu_locked = lock; fnhe->fnhe_mtu_locked = lock;
fnhe->fnhe_expires = expires; fnhe->fnhe_expires = max(1UL, expires);
/* Exception created; mark the cached routes for the nexthop /* Exception created; mark the cached routes for the nexthop
* stale, so anyone caching it rechecks if this exception * stale, so anyone caching it rechecks if this exception
...@@ -1297,6 +1297,36 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) ...@@ -1297,6 +1297,36 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
return mtu - lwtunnel_headroom(dst->lwtstate, mtu); return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
} }
/*
 * ip_del_fnhe - remove the nexthop exception for @daddr from @nh.
 * @nh:    nexthop whose exception hash table (nh->nh_exceptions) is searched
 * @daddr: destination address identifying the exception to delete
 *
 * Takes fnhe_lock (BH-disabling variant) for the whole operation, walks
 * the hash chain selected by fnhe_hashfun(daddr) and unlinks the first
 * entry whose fnhe_daddr equals @daddr. Does nothing if no entry matches.
 *
 * NOTE(review): nh->nh_exceptions is not checked for NULL before being
 * indexed; presumably callers only get here after finding a non-NULL
 * table - confirm against the call sites.
 */
static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
struct fnhe_hash_bucket *hash;
/* fnhe_p always points at the link that references fnhe, so the entry
 * can be unlinked without tracking a separate "previous" node.
 */
struct fib_nh_exception *fnhe, __rcu **fnhe_p;
u32 hval = fnhe_hashfun(daddr);
spin_lock_bh(&fnhe_lock);
/* The table pointer is read under fnhe_lock, hence the _protected
 * accessor with the lockdep condition instead of rcu_dereference().
 */
hash = rcu_dereference_protected(nh->nh_exceptions,
lockdep_is_held(&fnhe_lock));
/* Advance to the bucket for this destination. */
hash += hval;
fnhe_p = &hash->chain;
fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
while (fnhe) {
if (fnhe->fnhe_daddr == daddr) {
/* Unlink: make the referencing link skip over this entry. */
rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
/* fnhe_flush_routes() is defined elsewhere; presumably it drops
 * the routes cached on this exception - confirm.
 */
fnhe_flush_routes(fnhe);
/* Freed through kfree_rcu() rather than directly; lockless
 * walkers of this chain use rcu_dereference() and may still
 * be looking at the entry.
 */
kfree_rcu(fnhe, rcu);
break;
}
/* No match: step both the link pointer and the cursor. */
fnhe_p = &fnhe->fnhe_next;
fnhe = rcu_dereference_protected(fnhe->fnhe_next,
lockdep_is_held(&fnhe_lock));
}
spin_unlock_bh(&fnhe_lock);
}
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{ {
struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions); struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
...@@ -1310,8 +1340,14 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) ...@@ -1310,8 +1340,14 @@ static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
for (fnhe = rcu_dereference(hash[hval].chain); fnhe; for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
fnhe = rcu_dereference(fnhe->fnhe_next)) { fnhe = rcu_dereference(fnhe->fnhe_next)) {
if (fnhe->fnhe_daddr == daddr) if (fnhe->fnhe_daddr == daddr) {
if (fnhe->fnhe_expires &&
time_after(jiffies, fnhe->fnhe_expires)) {
ip_del_fnhe(nh, daddr);
break;
}
return fnhe; return fnhe;
}
} }
return NULL; return NULL;
} }
...@@ -1636,36 +1672,6 @@ static void ip_handle_martian_source(struct net_device *dev, ...@@ -1636,36 +1672,6 @@ static void ip_handle_martian_source(struct net_device *dev,
#endif #endif
} }
/*
 * ip_del_fnhe - remove the nexthop exception for @daddr from @nh.
 * @nh:    nexthop whose exception hash table (nh->nh_exceptions) is searched
 * @daddr: destination address identifying the exception to delete
 *
 * Holds fnhe_lock (BH-disabling variant) while it walks the hash chain
 * selected by fnhe_hashfun(daddr) and unlinks the first entry whose
 * fnhe_daddr equals @daddr. Does nothing if no entry matches.
 *
 * NOTE(review): nh->nh_exceptions is not checked for NULL before being
 * indexed; presumably callers guarantee the table exists - confirm.
 */
static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
struct fnhe_hash_bucket *hash;
/* fnhe_p tracks the link that currently references fnhe. */
struct fib_nh_exception *fnhe, __rcu **fnhe_p;
u32 hval = fnhe_hashfun(daddr);
spin_lock_bh(&fnhe_lock);
/* Read under fnhe_lock, so the lockdep-checked accessor is used. */
hash = rcu_dereference_protected(nh->nh_exceptions,
lockdep_is_held(&fnhe_lock));
/* Select the bucket for this destination. */
hash += hval;
fnhe_p = &hash->chain;
fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
while (fnhe) {
if (fnhe->fnhe_daddr == daddr) {
/* Unlink the entry by redirecting the link that points at it. */
rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
/* Helper defined elsewhere; presumably releases the routes
 * cached on this exception - confirm.
 */
fnhe_flush_routes(fnhe);
/* Freed via kfree_rcu() instead of an immediate kfree(). */
kfree_rcu(fnhe, rcu);
break;
}
/* Advance to the next entry in the chain. */
fnhe_p = &fnhe->fnhe_next;
fnhe = rcu_dereference_protected(fnhe->fnhe_next,
lockdep_is_held(&fnhe_lock));
}
spin_unlock_bh(&fnhe_lock);
}
/* called in rcu_read_lock() section */ /* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb, static int __mkroute_input(struct sk_buff *skb,
const struct fib_result *res, const struct fib_result *res,
...@@ -1719,20 +1725,10 @@ static int __mkroute_input(struct sk_buff *skb, ...@@ -1719,20 +1725,10 @@ static int __mkroute_input(struct sk_buff *skb,
fnhe = find_exception(&FIB_RES_NH(*res), daddr); fnhe = find_exception(&FIB_RES_NH(*res), daddr);
if (do_cache) { if (do_cache) {
if (fnhe) { if (fnhe)
rth = rcu_dereference(fnhe->fnhe_rth_input); rth = rcu_dereference(fnhe->fnhe_rth_input);
if (rth && rth->dst.expires && else
time_after(jiffies, rth->dst.expires)) { rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
ip_del_fnhe(&FIB_RES_NH(*res), daddr);
fnhe = NULL;
} else {
goto rt_cache;
}
}
rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
rt_cache:
if (rt_cache_valid(rth)) { if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst); skb_dst_set_noref(skb, &rth->dst);
goto out; goto out;
...@@ -2216,39 +2212,31 @@ static struct rtable *__mkroute_output(const struct fib_result *res, ...@@ -2216,39 +2212,31 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
* the loopback interface and the IP_PKTINFO ipi_ifindex will * the loopback interface and the IP_PKTINFO ipi_ifindex will
* be set to the loopback interface as well. * be set to the loopback interface as well.
*/ */
fi = NULL; do_cache = false;
} }
fnhe = NULL; fnhe = NULL;
do_cache &= fi != NULL; do_cache &= fi != NULL;
if (do_cache) { if (fi) {
struct rtable __rcu **prth; struct rtable __rcu **prth;
struct fib_nh *nh = &FIB_RES_NH(*res); struct fib_nh *nh = &FIB_RES_NH(*res);
fnhe = find_exception(nh, fl4->daddr); fnhe = find_exception(nh, fl4->daddr);
if (!do_cache)
goto add;
if (fnhe) { if (fnhe) {
prth = &fnhe->fnhe_rth_output; prth = &fnhe->fnhe_rth_output;
rth = rcu_dereference(*prth); } else {
if (rth && rth->dst.expires && if (unlikely(fl4->flowi4_flags &
time_after(jiffies, rth->dst.expires)) { FLOWI_FLAG_KNOWN_NH &&
ip_del_fnhe(nh, fl4->daddr); !(nh->nh_gw &&
fnhe = NULL; nh->nh_scope == RT_SCOPE_LINK))) {
} else { do_cache = false;
goto rt_cache; goto add;
} }
prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
} }
if (unlikely(fl4->flowi4_flags &
FLOWI_FLAG_KNOWN_NH &&
!(nh->nh_gw &&
nh->nh_scope == RT_SCOPE_LINK))) {
do_cache = false;
goto add;
}
prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
rth = rcu_dereference(*prth); rth = rcu_dereference(*prth);
rt_cache:
if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
return rth; return rth;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment