Commit 93bb0ceb authored by Jesper Dangaard Brouer, committed by Pablo Neira Ayuso

netfilter: conntrack: remove central spinlock nf_conntrack_lock

nf_conntrack_lock is a monolithic lock and suffers from huge contention
on current-generation servers (8 or more cores/threads).

Perf shows the locking congestion clearly on the base kernel:

-  72.56%  ksoftirqd/6  [kernel.kallsyms]    [k] _raw_spin_lock_bh
   - _raw_spin_lock_bh
      + 25.33% init_conntrack
      + 24.86% nf_ct_delete_from_lists
      + 24.62% __nf_conntrack_confirm
      + 24.38% destroy_conntrack
      + 0.70% tcp_packet
+   2.21%  ksoftirqd/6  [kernel.kallsyms]    [k] fib_table_lookup
+   1.15%  ksoftirqd/6  [kernel.kallsyms]    [k] __slab_free
+   0.77%  ksoftirqd/6  [kernel.kallsyms]    [k] inet_getpeer
+   0.70%  ksoftirqd/6  [nf_conntrack]       [k] nf_ct_delete
+   0.55%  ksoftirqd/6  [ip_tables]          [k] ipt_do_table

This patch changes conntrack locking and provides a huge performance
improvement.  SYN-flood attack tested on a 24-core E5-2695v2(ES) with
10Gbit/s ixgbe (with the tool trafgen):

 Base kernel:   810,405 new conntrack/sec
 After patch: 2,233,876 new conntrack/sec

Note that other flood attacks (SYN+ACK or ACK) can easily be deflected using:
 # iptables -A INPUT -m state --state INVALID -j DROP
 # sysctl -w net/netfilter/nf_conntrack_tcp_loose=0

Use an array of hashed spinlocks to protect insertions/deletions of
conntracks into the hash table. 1024 spinlocks seem to give good
results, at minimal cost (4KB of memory). Due to lockdep's maximum
lock depth, 1024 becomes 8 if CONFIG_LOCKDEP=y.
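
For illustration, a minimal user-space C sketch of the hashed-spinlock idea
(the names locks_init(), bucket_lock() and insert_into_bucket() and the use
of pthread spinlocks are assumptions of this sketch, not part of the patch):
each hash bucket maps onto one lock in a small fixed-size array, so two
updates only contend when their buckets happen to share a lock.

  #include <pthread.h>

  #define LOCK_COUNT 1024         /* becomes 8 under lockdep in the patch */

  static pthread_spinlock_t locks[LOCK_COUNT];

  /* Initialize the lock array once, before any hash table access. */
  static void locks_init(void)
  {
          int i;

          for (i = 0; i < LOCK_COUNT; i++)
                  pthread_spin_init(&locks[i], PTHREAD_PROCESS_PRIVATE);
  }

  /* The lock that guards a given hash bucket. */
  static pthread_spinlock_t *bucket_lock(unsigned int bucket)
  {
          return &locks[bucket % LOCK_COUNT];
  }

  /* Insertion/deletion only takes the one lock covering its bucket. */
  static void insert_into_bucket(unsigned int bucket /* , entry */)
  {
          pthread_spinlock_t *l = bucket_lock(bucket);

          pthread_spin_lock(l);
          /* ... link the entry into hash chain 'bucket' ... */
          pthread_spin_unlock(l);
  }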

The hash resize is a bit tricky, because we need to take all locks in
the array. A seqcount_t is used to synchronize the hash table users
with the resizing process.
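
Continuing the user-space sketch above, a rough analogue of that scheme (the
generation counter, table_size, resize_begin()/resize_end() and
locked_update() are invented for this sketch; the patch itself uses a
seqcount_t plus the nf_conntrack_double_lock() helper shown in the diff
below): the resizer takes every lock in the array and bumps a generation
counter around the table swap, while updaters recompute their bucket and
retry whenever the counter changed underneath them.

  #include <stdatomic.h>

  static atomic_uint generation;          /* bumped twice around each resize */
  static atomic_uint table_size = 4096;   /* swapped while all locks are held */

  /* Resizer: owns every bucket lock while the table is replaced. */
  static void resize_begin(void)
  {
          int i;

          for (i = 0; i < LOCK_COUNT; i++)
                  pthread_spin_lock(&locks[i]);
          atomic_fetch_add(&generation, 1);       /* odd: resize in progress */
  }

  static void resize_end(void)    /* call after table/table_size were swapped */
  {
          int i;

          atomic_fetch_add(&generation, 1);       /* even: new table visible */
          for (i = 0; i < LOCK_COUNT; i++)
                  pthread_spin_unlock(&locks[i]);
  }

  /* Updater: recompute the bucket until no resize raced with the hashing. */
  static void locked_update(unsigned int key_hash)
  {
          unsigned int seq, bucket;

          for (;;) {
                  seq = atomic_load(&generation);
                  bucket = key_hash % atomic_load(&table_size);
                  pthread_spin_lock(bucket_lock(bucket));
                  if (!(seq & 1) && atomic_load(&generation) == seq)
                          break;          /* bucket index is still valid */
                  pthread_spin_unlock(bucket_lock(bucket));
          }
          /* ... safe to modify hash chain 'bucket' here ... */
          pthread_spin_unlock(bucket_lock(bucket));
  }
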
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
parent ca7433df
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -77,7 +77,12 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
 		 const struct nf_conntrack_l3proto *l3proto,
 		 const struct nf_conntrack_l4proto *proto);
 
-extern spinlock_t nf_conntrack_lock ;
+#ifdef CONFIG_LOCKDEP
+# define CONNTRACK_LOCKS 8
+#else
+# define CONNTRACK_LOCKS 1024
+#endif
+extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
 
 extern spinlock_t nf_conntrack_expect_lock;
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -5,6 +5,7 @@
 #include <linux/list_nulls.h>
 #include <linux/atomic.h>
 #include <linux/netfilter/nf_conntrack_tcp.h>
+#include <linux/seqlock.h>
 
 struct ctl_table_header;
 struct nf_conntrack_ecache;
@@ -90,6 +91,7 @@ struct netns_ct {
 	int			sysctl_checksum;
 
 	unsigned int		htable_size;
+	seqcount_t		generation;
 	struct kmem_cache	*nf_conntrack_cachep;
 	struct hlist_nulls_head	*hash;
 	struct hlist_head	*expect_hash;
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -60,12 +60,60 @@ int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
 				const struct nlattr *attr) __read_mostly;
 EXPORT_SYMBOL_GPL(nfnetlink_parse_nat_setup_hook);
 
-DEFINE_SPINLOCK(nf_conntrack_lock);
-EXPORT_SYMBOL_GPL(nf_conntrack_lock);
+__cacheline_aligned_in_smp spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
+EXPORT_SYMBOL_GPL(nf_conntrack_locks);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
 
+static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
+{
+	h1 %= CONNTRACK_LOCKS;
+	h2 %= CONNTRACK_LOCKS;
+	spin_unlock(&nf_conntrack_locks[h1]);
+	if (h1 != h2)
+		spin_unlock(&nf_conntrack_locks[h2]);
+}
+
+/* return true if we need to recompute hashes (in case hash table was resized) */
+static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
+				     unsigned int h2, unsigned int sequence)
+{
+	h1 %= CONNTRACK_LOCKS;
+	h2 %= CONNTRACK_LOCKS;
+	if (h1 <= h2) {
+		spin_lock(&nf_conntrack_locks[h1]);
+		if (h1 != h2)
+			spin_lock_nested(&nf_conntrack_locks[h2],
+					 SINGLE_DEPTH_NESTING);
+	} else {
+		spin_lock(&nf_conntrack_locks[h2]);
+		spin_lock_nested(&nf_conntrack_locks[h1],
+				 SINGLE_DEPTH_NESTING);
+	}
+	if (read_seqcount_retry(&net->ct.generation, sequence)) {
+		nf_conntrack_double_unlock(h1, h2);
+		return true;
+	}
+	return false;
+}
+
+static void nf_conntrack_all_lock(void)
+{
+	int i;
+
+	for (i = 0; i < CONNTRACK_LOCKS; i++)
+		spin_lock_nested(&nf_conntrack_locks[i], i);
+}
+
+static void nf_conntrack_all_unlock(void)
+{
+	int i;
+
+	for (i = 0; i < CONNTRACK_LOCKS; i++)
+		spin_unlock(&nf_conntrack_locks[i]);
+}
+
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
@@ -280,15 +328,28 @@ destroy_conntrack(struct nf_conntrack *nfct)
 static void nf_ct_delete_from_lists(struct nf_conn *ct)
 {
 	struct net *net = nf_ct_net(ct);
+	unsigned int hash, reply_hash;
+	u16 zone = nf_ct_zone(ct);
+	unsigned int sequence;
 
 	nf_ct_helper_destroy(ct);
-	spin_lock_bh(&nf_conntrack_lock);
-	/* Inside lock so preempt is disabled on module removal path.
-	 * Otherwise we can get spurious warnings. */
-	NF_CT_STAT_INC(net, delete_list);
+
+	local_bh_disable();
+	do {
+		sequence = read_seqcount_begin(&net->ct.generation);
+		hash = hash_conntrack(net, zone,
+				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		reply_hash = hash_conntrack(net, zone,
+					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
+
 	clean_from_lists(ct);
+	nf_conntrack_double_unlock(hash, reply_hash);
+
 	nf_ct_add_to_dying_list(ct);
-	spin_unlock_bh(&nf_conntrack_lock);
+
+	NF_CT_STAT_INC(net, delete_list);
+	local_bh_enable();
 }
 
 static void death_by_event(unsigned long ul_conntrack)
@@ -372,8 +433,6 @@ nf_ct_key_equal(struct nf_conntrack_tuple_hash *h,
  * Warning :
  * - Caller must take a reference on returned object
  *   and recheck nf_ct_tuple_equal(tuple, &h->tuple)
- * OR
- * - Caller must lock nf_conntrack_lock before calling this function
  */
 static struct nf_conntrack_tuple_hash *
 ____nf_conntrack_find(struct net *net, u16 zone,
@@ -467,14 +526,18 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 	struct nf_conntrack_tuple_hash *h;
 	struct hlist_nulls_node *n;
 	u16 zone;
+	unsigned int sequence;
 
 	zone = nf_ct_zone(ct);
-	hash = hash_conntrack(net, zone,
-			      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
-	reply_hash = hash_conntrack(net, zone,
-				    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
 
-	spin_lock_bh(&nf_conntrack_lock);
+	local_bh_disable();
+	do {
+		sequence = read_seqcount_begin(&net->ct.generation);
+		hash = hash_conntrack(net, zone,
+				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		reply_hash = hash_conntrack(net, zone,
+					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 
 	/* See if there's one in the list already, including reverse */
 	hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode)
@@ -493,14 +556,15 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct)
 	/* The caller holds a reference to this object */
 	atomic_set(&ct->ct_general.use, 2);
 	__nf_conntrack_hash_insert(ct, hash, reply_hash);
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 	return 0;
 
 out:
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert_failed);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 	return -EEXIST;
 }
 EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert);
@@ -540,6 +604,7 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	enum ip_conntrack_info ctinfo;
 	struct net *net;
 	u16 zone;
+	unsigned int sequence;
 
 	ct = nf_ct_get(skb, &ctinfo);
 	net = nf_ct_net(ct);
@@ -552,31 +617,37 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 		return NF_ACCEPT;
 
 	zone = nf_ct_zone(ct);
-	/* reuse the hash saved before */
-	hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
-	hash = hash_bucket(hash, net);
-	reply_hash = hash_conntrack(net, zone,
-				    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	local_bh_disable();
+
+	do {
+		sequence = read_seqcount_begin(&net->ct.generation);
+		/* reuse the hash saved before */
+		hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev;
+		hash = hash_bucket(hash, net);
+		reply_hash = hash_conntrack(net, zone,
+					    &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+	} while (nf_conntrack_double_lock(net, hash, reply_hash, sequence));
 
 	/* We're not in hash table, and we refuse to set up related
-	   connections for unconfirmed conns.  But packet copies and
-	   REJECT will give spurious warnings here. */
+	 * connections for unconfirmed conns.  But packet copies and
+	 * REJECT will give spurious warnings here.
+	 */
 	/* NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
 
 	/* No external references means no one else could have
-	   confirmed us. */
+	 * confirmed us.
+	 */
 	NF_CT_ASSERT(!nf_ct_is_confirmed(ct));
 	pr_debug("Confirming conntrack %p\n", ct);
-	spin_lock_bh(&nf_conntrack_lock);
 	/* We have to check the DYING flag inside the lock to prevent
 	   a race against nf_ct_get_next_corpse() possibly called from
 	   user context, else we insert an already 'dead' hash, blocking
 	   further use of that particular connection -JM */
 
 	if (unlikely(nf_ct_is_dying(ct))) {
-		spin_unlock_bh(&nf_conntrack_lock);
+		nf_conntrack_double_unlock(hash, reply_hash);
+		local_bh_enable();
 		return NF_ACCEPT;
 	}
 
@@ -618,8 +689,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	 * stores are visible.
 	 */
 	__nf_conntrack_hash_insert(ct, hash, reply_hash);
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 
 	help = nfct_help(ct);
 	if (help && help->helper)
@@ -630,8 +702,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
 	return NF_ACCEPT;
 
 out:
+	nf_conntrack_double_unlock(hash, reply_hash);
 	NF_CT_STAT_INC(net, insert_failed);
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 	return NF_DROP;
 }
 EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
@@ -674,39 +747,48 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
 
 /* There's a small race here where we may free a just-assured
    connection.  Too bad: we're in trouble anyway. */
-static noinline int early_drop(struct net *net, unsigned int hash)
+static noinline int early_drop(struct net *net, unsigned int _hash)
 {
 	/* Use oldest entry, which is roughly LRU */
 	struct nf_conntrack_tuple_hash *h;
 	struct nf_conn *ct = NULL, *tmp;
 	struct hlist_nulls_node *n;
-	unsigned int i, cnt = 0;
+	unsigned int i = 0, cnt = 0;
 	int dropped = 0;
+	unsigned int hash, sequence;
+	spinlock_t *lockp;
 
-	rcu_read_lock();
-	for (i = 0; i < net->ct.htable_size; i++) {
+	local_bh_disable();
+restart:
+	sequence = read_seqcount_begin(&net->ct.generation);
+	hash = hash_bucket(_hash, net);
+	for (; i < net->ct.htable_size; i++) {
+		lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
+		spin_lock(lockp);
+		if (read_seqcount_retry(&net->ct.generation, sequence)) {
+			spin_unlock(lockp);
+			goto restart;
+		}
 		hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash],
 					       hnnode) {
 			tmp = nf_ct_tuplehash_to_ctrack(h);
-			if (!test_bit(IPS_ASSURED_BIT, &tmp->status))
+			if (!test_bit(IPS_ASSURED_BIT, &tmp->status) &&
+			    !nf_ct_is_dying(tmp) &&
+			    atomic_inc_not_zero(&tmp->ct_general.use)) {
 				ct = tmp;
+				break;
+			}
 			cnt++;
 		}
 
-		if (ct != NULL) {
-			if (likely(!nf_ct_is_dying(ct) &&
-				   atomic_inc_not_zero(&ct->ct_general.use)))
-				break;
-			else
-				ct = NULL;
-		}
+		hash = (hash + 1) % net->ct.htable_size;
+		spin_unlock(lockp);
 
-		if (cnt >= NF_CT_EVICTION_RANGE)
+		if (ct || cnt >= NF_CT_EVICTION_RANGE)
 			break;
-
-		hash = (hash + 1) % net->ct.htable_size;
 	}
-	rcu_read_unlock();
+	local_bh_enable();
 
 	if (!ct)
 		return dropped;
@@ -755,7 +837,7 @@ __nf_conntrack_alloc(struct net *net, u16 zone,
 	if (nf_conntrack_max &&
 	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
-		if (!early_drop(net, hash_bucket(hash, net))) {
+		if (!early_drop(net, hash)) {
 			atomic_dec(&net->ct.count);
 			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
 			return ERR_PTR(-ENOMEM);
@@ -1304,9 +1386,13 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 	struct nf_conn *ct;
 	struct hlist_nulls_node *n;
 	int cpu;
+	spinlock_t *lockp;
 
-	spin_lock_bh(&nf_conntrack_lock);
 	for (; *bucket < net->ct.htable_size; (*bucket)++) {
-		hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
-			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
-				continue;
+		lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
+		local_bh_disable();
+		spin_lock(lockp);
+		if (*bucket < net->ct.htable_size) {
+			hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
+				if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
+					continue;
@@ -1315,7 +1401,9 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 				goto found;
 			}
 		}
-		spin_unlock_bh(&nf_conntrack_lock);
+		spin_unlock(lockp);
+		local_bh_enable();
+	}
 	for_each_possible_cpu(cpu) {
 		struct ct_pcpu *pcpu = per_cpu_ptr(net->ct.pcpu_lists, cpu);
@@ -1331,7 +1419,8 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
 	return NULL;
 found:
 	atomic_inc(&ct->ct_general.use);
-	spin_unlock_bh(&nf_conntrack_lock);
+	spin_unlock(lockp);
+	local_bh_enable();
 	return ct;
 }
 
@@ -1532,12 +1621,16 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 	if (!hash)
 		return -ENOMEM;
 
+	local_bh_disable();
+	nf_conntrack_all_lock();
+	write_seqcount_begin(&init_net.ct.generation);
+
 	/* Lookups in the old hash might happen in parallel, which means we
 	 * might get false negatives during connection lookup. New connections
 	 * created because of a false negative won't make it into the hash
-	 * though since that required taking the lock.
+	 * though since that required taking the locks.
 	 */
-	spin_lock_bh(&nf_conntrack_lock);
 	for (i = 0; i < init_net.ct.htable_size; i++) {
 		while (!hlist_nulls_empty(&init_net.ct.hash[i])) {
 			h = hlist_nulls_entry(init_net.ct.hash[i].first,
@@ -1554,7 +1647,10 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 	init_net.ct.htable_size = nf_conntrack_htable_size = hashsize;
 	init_net.ct.hash = hash;
-	spin_unlock_bh(&nf_conntrack_lock);
+
+	write_seqcount_end(&init_net.ct.generation);
+	nf_conntrack_all_unlock();
+	local_bh_enable();
 
 	nf_ct_free_hashtable(old_hash, old_size);
 	return 0;
@@ -1576,7 +1672,10 @@ EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or);
 int nf_conntrack_init_start(void)
 {
 	int max_factor = 8;
-	int ret, cpu;
+	int i, ret, cpu;
+
+	for (i = 0; i < ARRAY_SIZE(nf_conntrack_locks); i++)
+		spin_lock_init(&nf_conntrack_locks[i]);
 
 	/* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
 	 * machine has 512 buckets. >= 1GB machines have 16384 buckets. */
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -423,12 +423,16 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
 			unhelp(h, me);
 		spin_unlock_bh(&pcpu->lock);
 	}
-	spin_lock_bh(&nf_conntrack_lock);
+	local_bh_disable();
 	for (i = 0; i < net->ct.htable_size; i++) {
-		hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
-			unhelp(h, me);
+		spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+		if (i < net->ct.htable_size) {
+			hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
+				unhelp(h, me);
+		}
+		spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
 	}
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 }
 
 void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me)
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -764,14 +764,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 	struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
 	u_int8_t l3proto = nfmsg->nfgen_family;
 	int res;
+	spinlock_t *lockp;
+
 #ifdef CONFIG_NF_CONNTRACK_MARK
 	const struct ctnetlink_dump_filter *filter = cb->data;
 #endif
 
-	spin_lock_bh(&nf_conntrack_lock);
 	last = (struct nf_conn *)cb->args[1];
+
+	local_bh_disable();
 	for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
 restart:
+		lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
+		spin_lock(lockp);
+		if (cb->args[0] >= net->ct.htable_size) {
+			spin_unlock(lockp);
+			goto out;
+		}
 		hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
 					   hnnode) {
 			if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
@@ -803,16 +812,18 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
 			if (res < 0) {
 				nf_conntrack_get(&ct->ct_general);
 				cb->args[1] = (unsigned long)ct;
+				spin_unlock(lockp);
 				goto out;
 			}
 		}
+		spin_unlock(lockp);
 		if (cb->args[1]) {
 			cb->args[1] = 0;
 			goto restart;
 		}
 	}
 out:
-	spin_unlock_bh(&nf_conntrack_lock);
+	local_bh_enable();
 	if (last)
 		nf_ct_put(last);