Commit 19757ceb authored by Eric Dumazet, committed by David S. Miller

tcp: switch orphan_count to bare per-cpu counters

Use of a percpu_counter structure to track the count of orphaned
sockets is causing problems on modern hosts with 256 cpus
or more.

Stefan Bach reported serious spinlock contention in real workloads,
which I was able to reproduce with a netfilter rule dropping
incoming FIN packets.
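
For reference, such a rule could look like the command below. This is an
illustrative repro, not necessarily the exact rule used: any socket closed
while it is active stays orphaned until its retransmit/close timers give up,
so orphans accumulate quickly.

    # assumed repro command: drop every incoming TCP segment with FIN set
    iptables -I INPUT -p tcp --tcp-flags FIN FIN -j DROP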

    53.56%  server  [kernel.kallsyms]      [k] queued_spin_lock_slowpath
            |
            ---queued_spin_lock_slowpath
               |
                --53.51%--_raw_spin_lock_irqsave
                          |
                           --53.51%--__percpu_counter_sum
                                     tcp_check_oom
                                     |
                                     |--39.03%--__tcp_close
                                     |          tcp_close
                                     |          inet_release
                                     |          inet6_release
                                     |          sock_close
                                     |          __fput
                                     |          ____fput
                                     |          task_work_run
                                     |          exit_to_usermode_loop
                                     |          do_syscall_64
                                     |          entry_SYSCALL_64_after_hwframe
                                     |          __GI___libc_close
                                     |
                                      --14.48%--tcp_out_of_resources
                                                tcp_write_timeout
                                                tcp_retransmit_timer
                                                tcp_write_timer_handler
                                                tcp_write_timer
                                                call_timer_fn
                                                expire_timers
                                                __run_timers
                                                run_timer_softirq
                                                __softirqentry_text_start

As explained in commit cf86a086 ("net/dst: use a smaller percpu_counter
batch for dst entries accounting"), the default batch size is too big
for the default value of tcp_max_orphans (262144).

But even if we reduce batch sizes, there would still be cases
where the estimated count of orphans exceeds the limit,
and where tcp_too_many_orphans() has to fall back to the expensive
percpu_counter_sum_positive().
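
A rough worked example shows the scale of the problem (figures assume the
default batch computation in lib/percpu_counter.c):

    batch size       = max(32, 2 * nr_cpus)  = 512 on a 256-cpu host
    worst-case drift = batch * nr_cpus       = 512 * 256 = 131072
    default limit    = tcp_max_orphans       = 262144

So the cheap approximate read can be off by roughly half the default limit.
Once the real orphan count gets anywhere near the threshold, nearly every
check falls through to the exact sum, and all cpus serialize on the
counter's spinlock, as in the profile above.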

One solution is to use plain per-cpu counters, and a timer
that periodically refreshes a cached sum.

Updating this cache every 100ms seems about right; tcp pressure
state does not change radically over shorter periods.
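
With the cache in place, both fast paths become trivially cheap; this is
the shape of the new code, condensed here from the tcp.c hunks below:

    /* writer: bump only the local cpu's counter, no shared cacheline */
    this_cpu_inc(tcp_orphan_count);

    /* reader: plain load of the cached sum, refreshed every 100ms */
    static bool tcp_too_many_orphans(int shift)
    {
            return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans;
    }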

percpu_counter was nice 15 years ago, when hosts had fewer
than 16 cpus; it no longer holds up by current standards.

v2: Fix the build issue for CONFIG_CRYPTO_DEV_CHELSIO_TLS=m,
    reported by kernel test robot <lkp@intel.com>
    Remove unused socket argument from tcp_too_many_orphans()

Fixes: dd24c001 ("net: Use a percpu_counter for orphan_count")
Signed-off-by: Eric Dumazet <edumazet@google.com>
Reported-by: Stefan Bach <sfb@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 0b93aed2
@@ -870,7 +870,7 @@ static void do_abort_syn_rcv(struct sock *child, struct sock *parent)
		 * created only after 3 way handshake is done.
		 */
		sock_orphan(child);
-		percpu_counter_inc((child)->sk_prot->orphan_count);
+		INC_ORPHAN_COUNT(child);
		chtls_release_resources(child);
		chtls_conn_done(child);
	} else {
...
@@ -95,7 +95,7 @@ struct deferred_skb_cb {
 #define WSCALE_OK(tp) ((tp)->rx_opt.wscale_ok)
 #define TSTAMP_OK(tp) ((tp)->rx_opt.tstamp_ok)
 #define SACK_OK(tp) ((tp)->rx_opt.sack_ok)
-#define INC_ORPHAN_COUNT(sk) percpu_counter_inc((sk)->sk_prot->orphan_count)
+#define INC_ORPHAN_COUNT(sk) this_cpu_inc(*(sk)->sk_prot->orphan_count)

 /* TLS SKB */
 #define skb_ulp_tls_inline(skb) (ULP_SKB_CB(skb)->ulp.tls.ofld)
...
@@ -289,7 +289,7 @@ static inline void inet_csk_prepare_for_destroy_sock(struct sock *sk)
 {
	/* The below has to be done to allow calling inet_csk_destroy_sock */
	sock_set_flag(sk, SOCK_DEAD);
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	this_cpu_inc(*sk->sk_prot->orphan_count);
 }

 void inet_csk_destroy_sock(struct sock *sk);
...
@@ -1235,7 +1235,7 @@ struct proto {
	unsigned int		useroffset;	/* Usercopy region offset */
	unsigned int		usersize;	/* Usercopy region size */
-	struct percpu_counter	*orphan_count;
+	unsigned int __percpu	*orphan_count;
	struct request_sock_ops	*rsk_prot;
	struct timewait_sock_ops *twsk_prot;
...
@@ -48,7 +48,9 @@
 extern struct inet_hashinfo tcp_hashinfo;

-extern struct percpu_counter tcp_orphan_count;
+DECLARE_PER_CPU(unsigned int, tcp_orphan_count);
+
+int tcp_orphan_count_sum(void);

 void tcp_time_wait(struct sock *sk, int state, int timeo);

 #define MAX_TCP_HEADER	L1_CACHE_ALIGN(128 + MAX_HEADER)
@@ -290,19 +292,6 @@ static inline bool tcp_out_of_memory(struct sock *sk)

 void sk_forced_mem_schedule(struct sock *sk, int size);

-static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
-{
-	struct percpu_counter *ocp = sk->sk_prot->orphan_count;
-	int orphans = percpu_counter_read_positive(ocp);
-
-	if (orphans << shift > sysctl_tcp_max_orphans) {
-		orphans = percpu_counter_sum_positive(ocp);
-		if (orphans << shift > sysctl_tcp_max_orphans)
-			return true;
-	}
-	return false;
-}
-
 bool tcp_check_oom(struct sock *sk, int shift);
...
@@ -48,7 +48,7 @@ extern bool dccp_debug;

 extern struct inet_hashinfo dccp_hashinfo;
-extern struct percpu_counter dccp_orphan_count;
+DECLARE_PER_CPU(unsigned int, dccp_orphan_count);

 void dccp_time_wait(struct sock *sk, int state, int timeo);
...
@@ -42,8 +42,8 @@ DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
 EXPORT_SYMBOL_GPL(dccp_statistics);

-struct percpu_counter dccp_orphan_count;
-EXPORT_SYMBOL_GPL(dccp_orphan_count);
+DEFINE_PER_CPU(unsigned int, dccp_orphan_count);
+EXPORT_PER_CPU_SYMBOL_GPL(dccp_orphan_count);

 struct inet_hashinfo dccp_hashinfo;
 EXPORT_SYMBOL_GPL(dccp_hashinfo);
@@ -1055,7 +1055,7 @@ void dccp_close(struct sock *sk, long timeout)
	bh_lock_sock(sk);
	WARN_ON(sock_owned_by_user(sk));

-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	this_cpu_inc(dccp_orphan_count);

	/* Have we already been destroyed by a softirq or backlog? */
	if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
@@ -1115,13 +1115,10 @@ static int __init dccp_init(void)
	BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
		     sizeof_field(struct sk_buff, cb));
-	rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL);
-	if (rc)
-		goto out_fail;
	inet_hashinfo_init(&dccp_hashinfo);
	rc = inet_hashinfo2_init_mod(&dccp_hashinfo);
	if (rc)
-		goto out_free_percpu;
+		goto out_fail;
	rc = -ENOBUFS;
	dccp_hashinfo.bind_bucket_cachep =
		kmem_cache_create("dccp_bind_bucket",
@@ -1226,8 +1223,6 @@ static int __init dccp_init(void)
	kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
 out_free_hashinfo2:
	inet_hashinfo2_free_mod(&dccp_hashinfo);
-out_free_percpu:
-	percpu_counter_destroy(&dccp_orphan_count);
 out_fail:
	dccp_hashinfo.bhash = NULL;
	dccp_hashinfo.ehash = NULL;
@@ -1250,7 +1245,6 @@ static void __exit dccp_fini(void)
	dccp_ackvec_exit();
	dccp_sysctl_exit();
	inet_hashinfo2_free_mod(&dccp_hashinfo);
-	percpu_counter_destroy(&dccp_orphan_count);
 }

 module_init(dccp_init);
...
@@ -1015,7 +1015,7 @@ void inet_csk_destroy_sock(struct sock *sk)

	sk_refcnt_debug_release(sk);

-	percpu_counter_dec(sk->sk_prot->orphan_count);
+	this_cpu_dec(*sk->sk_prot->orphan_count);

	sock_put(sk);
 }
@@ -1074,7 +1074,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
	sock_orphan(child);

-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	this_cpu_inc(*sk->sk_prot->orphan_count);

	if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
		BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
...
@@ -598,7 +598,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
	if (ok) {
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
	} else {
-		percpu_counter_inc(sk->sk_prot->orphan_count);
+		this_cpu_inc(*sk->sk_prot->orphan_count);
		inet_sk_set_state(sk, TCP_CLOSE);
		sock_set_flag(sk, SOCK_DEAD);
		inet_csk_destroy_sock(sk);
...
@@ -53,7 +53,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
	struct net *net = seq->private;
	int orphans, sockets;

-	orphans = percpu_counter_sum_positive(&tcp_orphan_count);
+	orphans = tcp_orphan_count_sum();
	sockets = proto_sockets_allocated_sum_positive(&tcp_prot);

	socket_seq_show(seq);
...
@@ -287,8 +287,8 @@ enum {
	TCP_CMSG_TS = 2
 };

-struct percpu_counter tcp_orphan_count;
-EXPORT_SYMBOL_GPL(tcp_orphan_count);
+DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
+EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);

 long sysctl_tcp_mem[3] __read_mostly;
 EXPORT_SYMBOL(sysctl_tcp_mem);
@@ -2673,11 +2673,36 @@ void tcp_shutdown(struct sock *sk, int how)
 }
 EXPORT_SYMBOL(tcp_shutdown);

+int tcp_orphan_count_sum(void)
+{
+	int i, total = 0;
+
+	for_each_possible_cpu(i)
+		total += per_cpu(tcp_orphan_count, i);
+
+	return max(total, 0);
+}
+
+static int tcp_orphan_cache;
+static struct timer_list tcp_orphan_timer;
+
+#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)
+
+static void tcp_orphan_update(struct timer_list *unused)
+{
+	WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
+	mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
+}
+
+static bool tcp_too_many_orphans(int shift)
+{
+	return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans;
+}
+
 bool tcp_check_oom(struct sock *sk, int shift)
 {
	bool too_many_orphans, out_of_socket_memory;

-	too_many_orphans = tcp_too_many_orphans(sk, shift);
+	too_many_orphans = tcp_too_many_orphans(shift);
	out_of_socket_memory = tcp_out_of_memory(sk);

	if (too_many_orphans)
@@ -2786,7 +2811,7 @@ void __tcp_close(struct sock *sk, long timeout)
	/* remove backlog if any, without releasing ownership. */
	__release_sock(sk);

-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	this_cpu_inc(tcp_orphan_count);

	/* Have we already been destroyed by a softirq or backlog? */
	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
@@ -4479,7 +4504,10 @@ void __init tcp_init(void)
		     sizeof_field(struct sk_buff, cb));

	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
-	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
+
+	timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
+	mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
+
	inet_hashinfo_init(&tcp_hashinfo);
	inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
			    thash_entries, 21,  /* one slot per 2 MB*/
...