Commit 07355737 authored by David S. Miller

Merge branch 'ipv4-multipath-hash'

Peter Nørlund says:

====================
ipv4: Hash-based multipath routing

When the routing cache was removed in 3.6, the IPv4 multipath algorithm changed
from more or less being destination-based into being quasi-random per-packet
scheduling. This increases the risk of out-of-order packets and makes it
impossible to use multipath together with anycast services.

This patch series replaces the old implementation with flow-based load
balancing based on a hash over the source and destination addresses.

Distribution of the hash is done with thresholds as described in RFC 2992.
This reduces the disruption when a path is added or removed while there are
more than two paths.

To further the chance of successful usage in conjunction with anycast, ICMP
error packets are hashed over the inner IP addresses. This ensures that PMTU
will work together with anycast or load-balancers such as IPVS.

Port numbers are not considered since fragments could cause problems with
anycast and IPVS. Relying on the DF-flag for TCP packets is also insufficient,
since ICMP inspection effectively extracts information from the opposite
flow which might have a different state of the DF-flag. This is also why the
RSS hash is not used. These are typically based on the NDIS RSS spec which
mandates TCP support.

Measurements of the additional overhead of a two-path multipath
(p_mkroute_input excl. __mkroute_input) on a Xeon X3550 (4 cores, 2.66GHz):

Original per-packet: ~394 cycles/packet
L3 hash:              ~76 cycles/packet

Changes in v5:
- Fixed compilation error

Changes in v4:
- Functions take hash directly instead of func ptr
- Added inline hash function
- Added dummy macros to minimize ifdefs
- Use upper 31 bits of hash instead of lower

Changes in v3:
- Multipath algorithm is no longer configurable (always L3)
- Added random seed to hash
- Moved ICMP inspection to isolated function
- Ignore source quench packets (deprecated as per RFC 6633)

Changes in v2:
- Replaced 8-bit xor hash with 31-bit jenkins hash
- Don't scale weights (since 31-bit)
- Avoided unnecessary renaming of variables
- Rely on DF-bit instead of fragment offset when checking for fragmentation
- upper_bound is now inclusive to avoid overflow
- Use a callback to postpone extracting flow information until necessary
- Skipped ICMP inspection entirely with L4 hashing
- Handle newly added sysctl ignore_routes_with_linkdown
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 2472186f 79a13159
......@@ -79,7 +79,7 @@ struct fib_nh {
unsigned char nh_scope;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
int nh_weight;
int nh_power;
atomic_t nh_upper_bound;
#endif
#ifdef CONFIG_IP_ROUTE_CLASSID
__u32 nh_tclassid;
......@@ -118,7 +118,7 @@ struct fib_info {
#define fib_advmss fib_metrics[RTAX_ADVMSS-1]
int fib_nhs;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
int fib_power;
int fib_weight;
#endif
struct rcu_head rcu;
struct fib_nh fib_nh[0];
......@@ -320,7 +320,15 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev);
int fib_sync_down_dev(struct net_device *dev, unsigned long event);
int fib_sync_down_addr(struct net *net, __be32 local);
int fib_sync_up(struct net_device *dev, unsigned int nh_flags);
void fib_select_multipath(struct fib_result *res);
extern u32 fib_multipath_secret __read_mostly;
/* Hash an address pair for multipath nexthop selection.
 *
 * The two addresses are run through jhash_2words() seeded with
 * fib_multipath_secret, and the result is shifted right by one bit before
 * being returned, so the lowest bit of the hash is discarded.
 */
static inline int fib_multipath_hash(__be32 saddr, __be32 daddr)
{
	const int shifted = jhash_2words(saddr, daddr,
					 fib_multipath_secret) >> 1;

	return shifted;
}
void fib_select_multipath(struct fib_result *res, int hash);
/* Exported by fib_trie.c */
void fib_trie_init(void);
......
......@@ -28,6 +28,7 @@
#include <net/inetpeer.h>
#include <net/flow.h>
#include <net/inet_sock.h>
#include <net/ip_fib.h>
#include <net/l3mdev.h>
#include <linux/in_route.h>
#include <linux/rtnetlink.h>
......@@ -113,7 +114,15 @@ struct in_device;
int ip_rt_init(void);
void rt_cache_flush(struct net *net);
void rt_flush_dev(struct net_device *dev);
struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp,
int mp_hash);
/* Look up an output route for @flp without supplying a multipath hash.
 *
 * Forwards to __ip_route_output_key_hash() with mp_hash set to -1 and
 * returns whatever that function returns.
 */
static inline struct rtable *__ip_route_output_key(struct net *net,
						   struct flowi4 *flp)
{
	struct rtable *route = __ip_route_output_key_hash(net, flp, -1);

	return route;
}
struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
const struct sock *sk);
struct dst_entry *ipv4_blackhole_route(struct net *net,
......
......@@ -57,8 +57,7 @@ static unsigned int fib_info_cnt;
static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
#ifdef CONFIG_IP_ROUTE_MULTIPATH
static DEFINE_SPINLOCK(fib_multipath_lock);
u32 fib_multipath_secret __read_mostly;
#define for_nexthops(fi) { \
int nhsel; const struct fib_nh *nh; \
......@@ -532,7 +531,67 @@ static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
return ret;
}
#endif
/* Recompute the hash threshold (nh_upper_bound) of every nexthop of @fi.
 *
 * The range [0, 2^31 - 1] is split among the usable nexthops in proportion
 * to their nh_weight; each usable nexthop stores the inclusive upper bound
 * of its slice. A nexthop that is dead, or that is link-down on a device
 * configured to ignore link-down routes, stores -1 instead.
 *
 * Nothing is done when @fi has fewer than two nexthops.
 */
static void fib_rebalance(struct fib_info *fi)
{
	struct in_device *in_dev;
	int total;
	int w;

	if (fi->fib_nhs < 2)
		return;

	/* Pass 1: total weight of the nexthops that can carry traffic. */
	total = 0;
	for_nexthops(fi) {
		if (nh->nh_flags & RTNH_F_DEAD)
			continue;

		in_dev = __in_dev_get_rcu(nh->nh_dev);

		if (in_dev &&
		    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
		    nh->nh_flags & RTNH_F_LINKDOWN)
			continue;

		total += nh->nh_weight;
	} endfor_nexthops(fi);

	/* Pass 2: assign each nexthop its cumulative, inclusive threshold. */
	w = 0;
	change_nexthops(fi) {
		int upper_bound;

		in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev);

		if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
			upper_bound = -1;
		} else if (in_dev &&
			   IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
			   nexthop_nh->nh_flags & RTNH_F_LINKDOWN) {
			upper_bound = -1;
		} else {
			w += nexthop_nh->nh_weight;
			/* The dividend (w * 2^31) needs 64 bits. A plain '/'
			 * on it would make 32-bit builds call a libgcc
			 * division helper the kernel does not link against,
			 * so use the do_div()-based rounding helper. The
			 * quotient is at most 2^31, hence the result after
			 * subtracting one always fits in an int.
			 */
			upper_bound = DIV_ROUND_CLOSEST_ULL(
					(unsigned long long)w << 31,
					total) - 1;
		}

		atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
	} endfor_nexthops(fi);

	/* Seed the multipath hash secret on first use. */
	net_get_random_once(&fib_multipath_secret,
			    sizeof(fib_multipath_secret));
}
/* Add the weight of nexthop @nh to the accumulated fib_weight of @fi. */
static inline void fib_add_weight(struct fib_info *fi,
				  const struct fib_nh *nh)
{
	fi->fib_weight = fi->fib_weight + nh->nh_weight;
}
#else /* CONFIG_IP_ROUTE_MULTIPATH */
#define fib_rebalance(fi) do { } while (0)
#define fib_add_weight(fi, nh) do { } while (0)
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
static int fib_encap_match(struct net *net, u16 encap_type,
struct nlattr *encap,
......@@ -1094,8 +1153,11 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
change_nexthops(fi) {
fib_info_update_nh_saddr(net, nexthop_nh);
fib_add_weight(fi, nexthop_nh);
} endfor_nexthops(fi)
fib_rebalance(fi);
link_it:
ofi = fib_find_info(fi);
if (ofi) {
......@@ -1317,12 +1379,6 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event)
nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
break;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
spin_lock_bh(&fib_multipath_lock);
fi->fib_power -= nexthop_nh->nh_power;
nexthop_nh->nh_power = 0;
spin_unlock_bh(&fib_multipath_lock);
#endif
dead++;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
......@@ -1345,6 +1401,8 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event)
}
ret++;
}
fib_rebalance(fi);
}
return ret;
......@@ -1467,20 +1525,15 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
!__in_dev_get_rtnl(dev))
continue;
alive++;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
spin_lock_bh(&fib_multipath_lock);
nexthop_nh->nh_power = 0;
nexthop_nh->nh_flags &= ~nh_flags;
spin_unlock_bh(&fib_multipath_lock);
#else
nexthop_nh->nh_flags &= ~nh_flags;
#endif
} endfor_nexthops(fi)
if (alive > 0) {
fi->fib_flags &= ~nh_flags;
ret++;
}
fib_rebalance(fi);
}
return ret;
......@@ -1488,62 +1541,19 @@ int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/*
* The algorithm is suboptimal, but it provides really
* fair weighted route distribution.
*/
void fib_select_multipath(struct fib_result *res)
void fib_select_multipath(struct fib_result *res, int hash)
{
struct fib_info *fi = res->fi;
struct in_device *in_dev;
int w;
spin_lock_bh(&fib_multipath_lock);
if (fi->fib_power <= 0) {
int power = 0;
change_nexthops(fi) {
in_dev = __in_dev_get_rcu(nexthop_nh->nh_dev);
if (nexthop_nh->nh_flags & RTNH_F_DEAD)
continue;
if (in_dev &&
IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
continue;
power += nexthop_nh->nh_weight;
nexthop_nh->nh_power = nexthop_nh->nh_weight;
} endfor_nexthops(fi);
fi->fib_power = power;
if (power <= 0) {
spin_unlock_bh(&fib_multipath_lock);
/* Race condition: route has just become dead. */
res->nh_sel = 0;
return;
}
}
/* w should be random number [0..fi->fib_power-1],
* it is pretty bad approximation.
*/
w = jiffies % fi->fib_power;
for_nexthops(fi) {
if (hash > atomic_read(&nh->nh_upper_bound))
continue;
change_nexthops(fi) {
if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) &&
nexthop_nh->nh_power) {
w -= nexthop_nh->nh_power;
if (w <= 0) {
nexthop_nh->nh_power--;
fi->fib_power--;
res->nh_sel = nhsel;
spin_unlock_bh(&fib_multipath_lock);
return;
}
}
res->nh_sel = nhsel;
return;
} endfor_nexthops(fi);
/* Race condition: route has just become dead. */
res->nh_sel = 0;
spin_unlock_bh(&fib_multipath_lock);
}
#endif
......@@ -440,6 +440,22 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
icmp_xmit_unlock(sk);
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* Compute the multipath hash from the IP header of @skb.
 *
 * Source and destination are swapped: the header's daddr is passed as the
 * source and its saddr as the destination. See ip_multipath_icmp_hash.
 */
static int icmp_multipath_hash_skb(const struct sk_buff *skb)
{
	const struct iphdr *hdr = ip_hdr(skb);
	int hash = fib_multipath_hash(hdr->daddr, hdr->saddr);

	return hash;
}
#else
#define icmp_multipath_hash_skb(skb) (-1)
#endif
static struct rtable *icmp_route_lookup(struct net *net,
struct flowi4 *fl4,
struct sk_buff *skb_in,
......@@ -464,7 +480,8 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev);
security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
rt = __ip_route_output_key(net, fl4);
rt = __ip_route_output_key_hash(net, fl4,
icmp_multipath_hash_skb(skb_in));
if (IS_ERR(rt))
return rt;
......
......@@ -1651,6 +1651,48 @@ static int __mkroute_input(struct sk_buff *skb,
return err;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
* calculated from the inner IP addresses in reverse order.
*/
static int ip_multipath_icmp_hash(struct sk_buff *skb)
{
const struct iphdr *outer_iph = ip_hdr(skb);
struct icmphdr _icmph;
const struct icmphdr *icmph;
struct iphdr _inner_iph;
const struct iphdr *inner_iph;
if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
goto standard_hash;
icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
&_icmph);
if (!icmph)
goto standard_hash;
if (icmph->type != ICMP_DEST_UNREACH &&
icmph->type != ICMP_REDIRECT &&
icmph->type != ICMP_TIME_EXCEEDED &&
icmph->type != ICMP_PARAMETERPROB) {
goto standard_hash;
}
inner_iph = skb_header_pointer(skb,
outer_iph->ihl * 4 + sizeof(_icmph),
sizeof(_inner_iph), &_inner_iph);
if (!inner_iph)
goto standard_hash;
return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
standard_hash:
return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
static int ip_mkroute_input(struct sk_buff *skb,
struct fib_result *res,
const struct flowi4 *fl4,
......@@ -1658,8 +1700,15 @@ static int ip_mkroute_input(struct sk_buff *skb,
__be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res->fi && res->fi->fib_nhs > 1)
fib_select_multipath(res);
if (res->fi && res->fi->fib_nhs > 1) {
int h;
if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
h = ip_multipath_icmp_hash(skb);
else
h = fib_multipath_hash(saddr, daddr);
fib_select_multipath(res, h);
}
#endif
/* create a routing cache entry */
......@@ -2026,7 +2075,8 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
* Major route resolver routine.
*/
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
int mp_hash)
{
struct net_device *dev_out = NULL;
__u8 tos = RT_FL_TOS(fl4);
......@@ -2189,8 +2239,11 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
fib_select_multipath(&res);
if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0) {
if (mp_hash < 0)
mp_hash = fib_multipath_hash(fl4->saddr, fl4->daddr);
fib_select_multipath(&res, mp_hash);
}
else
#endif
if (!res.prefixlen &&
......@@ -2212,7 +2265,7 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
rcu_read_unlock();
return rth;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment