Commit a6db4494 authored by David Ahern's avatar David Ahern Committed by David S. Miller

net: ipv4: Consider failed nexthops in multipath routes

Multipath route lookups should consider knowledge about next hops and not
select a hop that is known to be failed.

Example:

                     [h2]                   [h3]   15.0.0.5
                      |                      |
                     3|                     3|
                    [SP1]                  [SP2]--+
                     1  2                   1     2
                     |  |     /-------------+     |
                     |   \   /                    |
                     |     X                      |
                     |    / \                     |
                     |   /   \---------------\    |
                     1  2                     1   2
         12.0.0.2  [TOR1] 3-----------------3 [TOR2] 12.0.0.3
                     4                         4
                      \                       /
                        \                    /
                         \                  /
                          -------|   |-----/
                                 1   2
                                [TOR3]
                                  3|
                                   |
                                  [h1]  12.0.0.1

host h1 with IP 12.0.0.1 has 2 paths to host h3 at 15.0.0.5:

    root@h1:~# ip ro ls
    ...
    12.0.0.0/24 dev swp1  proto kernel  scope link  src 12.0.0.1
    15.0.0.0/16
            nexthop via 12.0.0.2  dev swp1 weight 1
            nexthop via 12.0.0.3  dev swp1 weight 1
    ...

If the link between tor3 and tor1 is down and the link between tor1
and tor2 then tor1 is effectively cut-off from h1. Yet the route lookups
in h1 are alternating between the 2 routes: ping 15.0.0.5 gets one and
ssh 15.0.0.5 gets the other. Connections that attempt to use the
12.0.0.2 nexthop fail since that neighbor is not reachable:

    root@h1:~# ip neigh show
    ...
    12.0.0.3 dev swp1 lladdr 00:02:00:00:00:1b REACHABLE
    12.0.0.2 dev swp1  FAILED
    ...

The failed path can be avoided by considering known neighbor information
when selecting next hops. If the neighbor lookup fails we have no
knowledge about the nexthop, so give it a shot. If there is an entry
then only select the nexthop if the state is sane. This is similar to
what fib_detect_death does.

To maintain backward compatibility use of the neighbor information is
based on a new sysctl, fib_multipath_use_neigh.
Signed-off-by: default avatarDavid Ahern <dsa@cumulusnetworks.com>
Reviewed-by: default avatarJulian Anastasov <ja@ssi.bg>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 0b0e30c6
...@@ -63,6 +63,16 @@ fwmark_reflect - BOOLEAN ...@@ -63,6 +63,16 @@ fwmark_reflect - BOOLEAN
fwmark of the packet they are replying to. fwmark of the packet they are replying to.
Default: 0 Default: 0
fib_multipath_use_neigh - BOOLEAN
Use status of existing neighbor entry when determining nexthop for
multipath routes. If disabled, neighbor information is not used and
packets could be directed to a failed nexthop. Only valid for kernels
built with CONFIG_IP_ROUTE_MULTIPATH enabled.
Default: 0 (disabled)
Possible values:
0 - disabled
1 - enabled
route/max_size - INTEGER route/max_size - INTEGER
Maximum number of routes allowed in the kernel. Increase Maximum number of routes allowed in the kernel. Increase
this when using large numbers of interfaces and/or routes. this when using large numbers of interfaces and/or routes.
......
...@@ -132,6 +132,9 @@ struct netns_ipv4 { ...@@ -132,6 +132,9 @@ struct netns_ipv4 {
struct list_head mr_tables; struct list_head mr_tables;
struct fib_rules_ops *mr_rules_ops; struct fib_rules_ops *mr_rules_ops;
#endif #endif
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
int sysctl_fib_multipath_use_neigh;
#endif #endif
atomic_t rt_genid; atomic_t rt_genid;
}; };
......
...@@ -1559,21 +1559,45 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags) ...@@ -1559,21 +1559,45 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
} }
#ifdef CONFIG_IP_ROUTE_MULTIPATH #ifdef CONFIG_IP_ROUTE_MULTIPATH
static bool fib_good_nh(const struct fib_nh *nh)
{
int state = NUD_REACHABLE;
if (nh->nh_scope == RT_SCOPE_LINK) {
struct neighbour *n;
rcu_read_lock_bh();
n = __ipv4_neigh_lookup_noref(nh->nh_dev, nh->nh_gw);
if (n)
state = n->nud_state;
rcu_read_unlock_bh();
}
return !!(state & NUD_VALID);
}
void fib_select_multipath(struct fib_result *res, int hash) void fib_select_multipath(struct fib_result *res, int hash)
{ {
struct fib_info *fi = res->fi; struct fib_info *fi = res->fi;
struct net *net = fi->fib_net;
bool first = false;
for_nexthops(fi) { for_nexthops(fi) {
if (hash > atomic_read(&nh->nh_upper_bound)) if (hash > atomic_read(&nh->nh_upper_bound))
continue; continue;
res->nh_sel = nhsel; if (!net->ipv4.sysctl_fib_multipath_use_neigh ||
return; fib_good_nh(nh)) {
res->nh_sel = nhsel;
return;
}
if (!first) {
res->nh_sel = nhsel;
first = true;
}
} endfor_nexthops(fi); } endfor_nexthops(fi);
/* Race condition: route has just become dead. */
res->nh_sel = 0;
} }
#endif #endif
......
...@@ -960,6 +960,17 @@ static struct ctl_table ipv4_net_table[] = { ...@@ -960,6 +960,17 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec, .proc_handler = proc_dointvec,
}, },
#ifdef CONFIG_IP_ROUTE_MULTIPATH
{
.procname = "fib_multipath_use_neigh",
.data = &init_net.ipv4.sysctl_fib_multipath_use_neigh,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &one,
},
#endif
{ } { }
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment