Commit e341694e, authored by Thomas Graf and committed by David S. Miller

netlink: Convert netlink_lookup() to use RCU protected hash table

Heavy Netlink users such as Open vSwitch spend a considerable amount of
time in netlink_lookup() due to the read-lock on nl_table_lock. Use of
RCU relieves the lock contention.

Makes use of the new resizable hash table to avoid locking on the
lookup.

The hash table will grow if the number of entries exceeds 75% of the table
size, up to a total table size of 64K. It will automatically shrink if
usage falls below 30%.

Also splits nl_table_lock into a separate mutex to protect hash table
mutations and allow synchronize_rcu() to sleep while waiting for readers
during expansion and shrinking.

Before:
   9.16%  kpktgend_0  [openvswitch]      [k] masked_flow_lookup
   6.42%  kpktgend_0  [pktgen]           [k] mod_cur_headers
   6.26%  kpktgend_0  [pktgen]           [k] pktgen_thread_worker
   6.23%  kpktgend_0  [kernel.kallsyms]  [k] memset
   4.79%  kpktgend_0  [kernel.kallsyms]  [k] netlink_lookup
   4.37%  kpktgend_0  [kernel.kallsyms]  [k] memcpy
   3.60%  kpktgend_0  [openvswitch]      [k] ovs_flow_extract
   2.69%  kpktgend_0  [kernel.kallsyms]  [k] jhash2

After:
  15.26%  kpktgend_0  [openvswitch]      [k] masked_flow_lookup
   8.12%  kpktgend_0  [pktgen]           [k] pktgen_thread_worker
   7.92%  kpktgend_0  [pktgen]           [k] mod_cur_headers
   5.11%  kpktgend_0  [kernel.kallsyms]  [k] memset
   4.11%  kpktgend_0  [openvswitch]      [k] ovs_flow_extract
   4.06%  kpktgend_0  [kernel.kallsyms]  [k] _raw_spin_lock
   3.90%  kpktgend_0  [kernel.kallsyms]  [k] jhash2
   [...]
   0.67%  kpktgend_0  [kernel.kallsyms]  [k] netlink_lookup
Signed-off-by: Thomas Graf <tgraf@suug.ch>
Reviewed-by: Nikolay Aleksandrov <nikolay@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 7e1e7763
This diff is collapsed.
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
 #ifndef _AF_NETLINK_H
 #define _AF_NETLINK_H

+#include <linux/rhashtable.h>
 #include <net/sock.h>

 #define NLGRPSZ(x)	(ALIGN(x, sizeof(unsigned long) * 8) / 8)
@@ -47,6 +48,8 @@ struct netlink_sock {
 	struct netlink_ring	tx_ring;
 	atomic_t		mapped;
 #endif /* CONFIG_NETLINK_MMAP */
+
+	struct rhash_head	node;
 };

 static inline struct netlink_sock *nlk_sk(struct sock *sk)
@@ -54,21 +57,8 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk)
 	return container_of(sk, struct netlink_sock, sk);
 }

-struct nl_portid_hash {
-	struct hlist_head	*table;
-	unsigned long		rehash_time;
-	unsigned int		mask;
-	unsigned int		shift;
-	unsigned int		entries;
-	unsigned int		max_shift;
-	u32			rnd;
-};
-
 struct netlink_table {
-	struct nl_portid_hash	hash;
+	struct rhashtable	hash;
 	struct hlist_head	mc_list;
 	struct listeners __rcu	*listeners;
 	unsigned int		flags;
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -4,6 +4,7 @@
 #include <linux/netlink.h>
 #include <linux/sock_diag.h>
 #include <linux/netlink_diag.h>
+#include <linux/rhashtable.h>

 #include "af_netlink.h"
@@ -101,16 +102,20 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
 			       int protocol, int s_num)
 {
 	struct netlink_table *tbl = &nl_table[protocol];
-	struct nl_portid_hash *hash = &tbl->hash;
+	struct rhashtable *ht = &tbl->hash;
+	const struct bucket_table *htbl = rht_dereference(ht->tbl, ht);
 	struct net *net = sock_net(skb->sk);
 	struct netlink_diag_req *req;
+	struct netlink_sock *nlsk;
 	struct sock *sk;
 	int ret = 0, num = 0, i;

 	req = nlmsg_data(cb->nlh);

-	for (i = 0; i <= hash->mask; i++) {
-		sk_for_each(sk, &hash->table[i]) {
+	for (i = 0; i < htbl->size; i++) {
+		rht_for_each_entry(nlsk, htbl->buckets[i], ht, node) {
+			sk = (struct sock *)nlsk;
+
 			if (!net_eq(sock_net(sk), net))
 				continue;
 			if (num < s_num) {
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment