Commit 2af48d43 authored by David S. Miller's avatar David S. Miller

Merge branch 'fib6-rcu'

Wei Wang says:

====================
ipv6: replace rwlock with rcu and spinlock in fib6 table

Currently, fib6 table is protected by rwlock. During route lookup,
reader lock is taken and during route insertion, deletion or
modification, writer lock is taken. This is a very inefficient
implementation because the fastpath always has to do the operation
to grab the reader lock.
According to my latest syn flood test on an iota ivybridage machine
with 2 10G mlx nics bonded together, each with 8 rx queues on 2 NUMA
nodes, and with the upstream net-next kernel:
ipv4 stack can handle around 4.2Mpps
ipv6 stack can handle around 1.3Mpps

In order to close the gap of the performance number between ipv4
and ipv6 stack, this patch series tries to get rid of the usage of
the rwlock and replace it with rcu and spinlock protection. This will
greatly speed up the fastpath performance as it only needs to hold
rcu which is much less expensive than grabbing the reader lock. It
also makes ipv6 fib implementation more consistent with ipv4.

In order to be able to replace the current rwlock with rcu and
spinlock, some preparation work is needed:
Patch 1-8 introduces a per-route hash table (protected by rcu and a
different spinlock) to store all cached routes created by pmtu and ip
redirect under its main route. This makes the main fib6 tree only
contain static routes.
Patch 9-14 prepares all the reader path to be ready to tolerate
concurrent writer.
Patch 15 finally does the rwlock to rcu and spinlock conversion.
Patch 16 takes care of rt6_stats.

After this patch series, in the same syn flood test,
ipv6 stack can now handle around 3.5Mpps compared to previous 1.3Mpps
in my test setup.

After this patch series, there are still some improvements that should
be done in ipv6 stack:
1. During route lookup, dst_use() is called everytime on the selected
route to update dst->__use and dst->lastuse. This dirties the cacheline
and causes extra cacheline miss and should be avoided.
2. when no route is found in the current table, net->ip6.ipv6_null_entry
is used and refcnt is taken on it. As there is no pcpu cache for this
specific route, frequent change on the refcnt for this route causes
quite some cacheline misses.
And to make things worse, if CONFIG_IPV6_MULTIPLE_TABLES is defined,
output path route lookup always starts with local table first and
guarantees to hit net->ipv6.ip6_null_entry before continuing to do
lookup in the main table.
These operations on net->ipv6.ip6_null_entry could potentially be
avoided.
3. ipv6 input path route lookup grabs refcnt on dst. This is different
from ipv4. We could potentially change this behavior to let ipv6 input
path route lookup not to grab refcnt on dst. However, it does not give
us much performance boost as we currently have pcpu route cache for
input path as well in ipv6. But this work probably is still worth doing
to unify ipv6 and ipv4 route lookup behavior.

The above issues will be addressed separately after this patch series
has been accepted.

This is a joint work with Martin KaFai Lau and Eric Dumazet. And many
many thanks to them for their inspiring ideas and big big code review
efforts.
====================
Reviewed-by: default avatarEric Dumazet <edumazet@google.com>
Reviewed-by: default avatarYOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 0d7b70e8 81eb8447
......@@ -101,7 +101,7 @@ struct dst_entry {
union {
struct dst_entry *next;
struct rtable __rcu *rt_next;
struct rt6_info *rt6_next;
struct rt6_info __rcu *rt6_next;
struct dn_route __rcu *dn_next;
};
};
......
......@@ -29,6 +29,14 @@
#define FIB6_TABLE_HASHSZ 1
#endif
#define RT6_DEBUG 2
#if RT6_DEBUG >= 3
#define RT6_TRACE(x...) pr_debug(x)
#else
#define RT6_TRACE(x...) do { ; } while (0)
#endif
struct rt6_info;
struct fib6_config {
......@@ -60,25 +68,30 @@ struct fib6_config {
};
struct fib6_node {
struct fib6_node *parent;
struct fib6_node *left;
struct fib6_node *right;
struct fib6_node __rcu *parent;
struct fib6_node __rcu *left;
struct fib6_node __rcu *right;
#ifdef CONFIG_IPV6_SUBTREES
struct fib6_node *subtree;
struct fib6_node __rcu *subtree;
#endif
struct rt6_info *leaf;
struct rt6_info __rcu *leaf;
__u16 fn_bit; /* bit key */
__u16 fn_flags;
int fn_sernum;
struct rt6_info *rr_ptr;
struct rt6_info __rcu *rr_ptr;
struct rcu_head rcu;
};
struct fib6_gc_args {
int timeout;
int more;
};
#ifndef CONFIG_IPV6_SUBTREES
#define FIB6_SUBTREE(fn) NULL
#else
#define FIB6_SUBTREE(fn) ((fn)->subtree)
#define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1))
#endif
struct mx6_config {
......@@ -98,6 +111,22 @@ struct rt6key {
struct fib6_table;
struct rt6_exception_bucket {
struct hlist_head chain;
int depth;
};
struct rt6_exception {
struct hlist_node hlist;
struct rt6_info *rt6i;
unsigned long stamp;
struct rcu_head rcu;
};
#define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10
#define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT)
#define FIB6_MAX_DEPTH 5
struct rt6_info {
struct dst_entry dst;
......@@ -134,14 +163,25 @@ struct rt6_info {
struct inet6_dev *rt6i_idev;
struct rt6_info * __percpu *rt6i_pcpu;
struct rt6_exception_bucket __rcu *rt6i_exception_bucket;
u32 rt6i_metric;
u32 rt6i_pmtu;
/* more non-fragment space at head required */
unsigned short rt6i_nfheader_len;
u8 rt6i_protocol;
u8 exception_bucket_flushed:1,
unused:7;
};
#define for_each_fib6_node_rt_rcu(fn) \
for (rt = rcu_dereference((fn)->leaf); rt; \
rt = rcu_dereference(rt->dst.rt6_next))
#define for_each_fib6_walker_rt(w) \
for (rt = (w)->leaf; rt; \
rt = rcu_dereference_protected(rt->dst.rt6_next, 1))
static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst)
{
return ((struct rt6_info *)dst)->rt6i_idev;
......@@ -188,6 +228,8 @@ static inline bool rt6_get_cookie_safe(const struct rt6_info *rt,
if (fn) {
*cookie = fn->fn_sernum;
/* pairs with smp_wmb() in fib6_update_sernum_upto_root() */
smp_rmb();
status = true;
}
......@@ -248,7 +290,6 @@ struct fib6_walker {
struct fib6_node *root, *node;
struct rt6_info *leaf;
enum fib6_walk_state state;
bool prune;
unsigned int skip;
unsigned int count;
int (*func)(struct fib6_walker *);
......@@ -256,12 +297,15 @@ struct fib6_walker {
};
struct rt6_statistics {
__u32 fib_nodes;
__u32 fib_route_nodes;
__u32 fib_rt_alloc; /* permanent routes */
__u32 fib_rt_entries; /* rt entries in table */
__u32 fib_rt_cache; /* cache routes */
__u32 fib_discarded_routes;
__u32 fib_nodes; /* all fib6 nodes */
__u32 fib_route_nodes; /* intermediate nodes */
__u32 fib_rt_entries; /* rt entries in fib table */
__u32 fib_rt_cache; /* cached rt entries in exception table */
__u32 fib_discarded_routes; /* total number of routes delete */
/* The following stats are not protected by any lock */
atomic_t fib_rt_alloc; /* total number of routes alloced */
atomic_t fib_rt_uncache; /* rt entries in uncached list */
};
#define RTN_TL_ROOT 0x0001
......@@ -277,7 +321,7 @@ struct rt6_statistics {
struct fib6_table {
struct hlist_node tb6_hlist;
u32 tb6_id;
rwlock_t tb6_lock;
spinlock_t tb6_lock;
struct fib6_node tb6_root;
struct inet_peer_base tb6_peers;
unsigned int flags;
......@@ -325,7 +369,8 @@ struct fib6_node *fib6_lookup(struct fib6_node *root,
struct fib6_node *fib6_locate(struct fib6_node *root,
const struct in6_addr *daddr, int dst_len,
const struct in6_addr *saddr, int src_len);
const struct in6_addr *saddr, int src_len,
bool exact_match);
void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
void *arg);
......@@ -358,6 +403,8 @@ void __net_exit fib6_notifier_exit(struct net *net);
unsigned int fib6_tables_seq_read(struct net *net);
int fib6_tables_dump(struct net *net, struct notifier_block *nb);
void fib6_update_sernum(struct rt6_info *rt);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
int fib6_rules_init(void);
void fib6_rules_cleanup(void);
......
......@@ -95,6 +95,11 @@ int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack);
int ip6_ins_rt(struct rt6_info *);
int ip6_del_rt(struct rt6_info *);
void rt6_flush_exceptions(struct rt6_info *rt);
int rt6_remove_exception_rt(struct rt6_info *rt);
void rt6_age_exceptions(struct rt6_info *rt, struct fib6_gc_args *gc_args,
unsigned long now);
static inline int ip6_route_get_saddr(struct net *net, struct rt6_info *rt,
const struct in6_addr *daddr,
unsigned int prefs,
......
......@@ -2321,24 +2321,24 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
if (!table)
return NULL;
read_lock_bh(&table->tb6_lock);
fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0);
rcu_read_lock();
fn = fib6_locate(&table->tb6_root, pfx, plen, NULL, 0, true);
if (!fn)
goto out;
noflags |= RTF_CACHE;
for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
for_each_fib6_node_rt_rcu(fn) {
if (rt->dst.dev->ifindex != dev->ifindex)
continue;
if ((rt->rt6i_flags & flags) != flags)
continue;
if ((rt->rt6i_flags & noflags) != 0)
continue;
dst_hold(&rt->dst);
if (!dst_hold_safe(&rt->dst))
rt = NULL;
break;
}
out:
read_unlock_bh(&table->tb6_lock);
rcu_read_unlock();
return rt;
}
......@@ -5898,10 +5898,9 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
spin_lock(&ifa->lock);
if (ifa->rt) {
struct rt6_info *rt = ifa->rt;
struct fib6_table *table = rt->rt6i_table;
int cpu;
read_lock(&table->tb6_lock);
rcu_read_lock();
addrconf_set_nopolicy(ifa->rt, val);
if (rt->rt6i_pcpu) {
for_each_possible_cpu(cpu) {
......@@ -5911,7 +5910,7 @@ void addrconf_disable_policy_idev(struct inet6_dev *idev, int val)
addrconf_set_nopolicy(*rtp, val);
}
}
read_unlock(&table->tb6_lock);
rcu_read_unlock();
}
spin_unlock(&ifa->lock);
}
......
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment