Commit 5d48ef3e authored by David S. Miller

Merge branch 'tcp_mem_pressure'

Eric Dumazet says:

====================
tcp: better handling of memory pressure

When testing commit 790ba456 ("tcp: set SOCK_NOSPACE under memory
pressure") using edge triggered epoll applications, I found various
issues under memory pressure and thousands of active sockets.

This patch series is a first round to solve these issues, in send
and receive paths. There are probably other fixes needed, but
with this series, my tests now all succeed.

v2: fix typo in "allow one skb to be received per socket under memory pressure",
as spotted by Jason Baron.
====================
Acked-by: Jason Baron <jbaron@akamai.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 4633c9e0 b66e91cc
...@@ -1368,7 +1368,7 @@ static inline struct inode *SOCK_INODE(struct socket *socket) ...@@ -1368,7 +1368,7 @@ static inline struct inode *SOCK_INODE(struct socket *socket)
* Functions for memory accounting * Functions for memory accounting
*/ */
int __sk_mem_schedule(struct sock *sk, int size, int kind); int __sk_mem_schedule(struct sock *sk, int size, int kind);
void __sk_mem_reclaim(struct sock *sk); void __sk_mem_reclaim(struct sock *sk, int amount);
#define SK_MEM_QUANTUM ((int)PAGE_SIZE) #define SK_MEM_QUANTUM ((int)PAGE_SIZE)
#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM) #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM)
...@@ -1409,7 +1409,7 @@ static inline void sk_mem_reclaim(struct sock *sk) ...@@ -1409,7 +1409,7 @@ static inline void sk_mem_reclaim(struct sock *sk)
if (!sk_has_account(sk)) if (!sk_has_account(sk))
return; return;
if (sk->sk_forward_alloc >= SK_MEM_QUANTUM) if (sk->sk_forward_alloc >= SK_MEM_QUANTUM)
__sk_mem_reclaim(sk); __sk_mem_reclaim(sk, sk->sk_forward_alloc);
} }
static inline void sk_mem_reclaim_partial(struct sock *sk) static inline void sk_mem_reclaim_partial(struct sock *sk)
...@@ -1417,7 +1417,7 @@ static inline void sk_mem_reclaim_partial(struct sock *sk) ...@@ -1417,7 +1417,7 @@ static inline void sk_mem_reclaim_partial(struct sock *sk)
if (!sk_has_account(sk)) if (!sk_has_account(sk))
return; return;
if (sk->sk_forward_alloc > SK_MEM_QUANTUM) if (sk->sk_forward_alloc > SK_MEM_QUANTUM)
__sk_mem_reclaim(sk); __sk_mem_reclaim(sk, sk->sk_forward_alloc - 1);
} }
static inline void sk_mem_charge(struct sock *sk, int size) static inline void sk_mem_charge(struct sock *sk, int size)
......
...@@ -286,6 +286,14 @@ extern atomic_long_t tcp_memory_allocated; ...@@ -286,6 +286,14 @@ extern atomic_long_t tcp_memory_allocated;
extern struct percpu_counter tcp_sockets_allocated; extern struct percpu_counter tcp_sockets_allocated;
extern int tcp_memory_pressure; extern int tcp_memory_pressure;
/*
 * Optimized version of sk_under_memory_pressure() for TCP sockets.
 *
 * Reports whether @sk should be treated as being under memory pressure.
 * When mem_cgroup_sockets_enabled is set and the socket has a cgroup
 * attached (sk->sk_cgrp is non-NULL), the cgroup's own memory_pressure
 * field decides; otherwise the global tcp_memory_pressure flag is used.
 */
static inline bool tcp_under_memory_pressure(const struct sock *sk)
{
if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
return !!sk->sk_cgrp->memory_pressure; /* !! collapses any non-zero value to 1 */
return tcp_memory_pressure;
}
/* /*
* The next routines deal with comparing 32 bit unsigned ints * The next routines deal with comparing 32 bit unsigned ints
* and worry about wraparound (automatic with unsigned arithmetic). * and worry about wraparound (automatic with unsigned arithmetic).
...@@ -311,6 +319,8 @@ static inline bool tcp_out_of_memory(struct sock *sk) ...@@ -311,6 +319,8 @@ static inline bool tcp_out_of_memory(struct sock *sk)
return false; return false;
} }
void sk_forced_mem_schedule(struct sock *sk, int size);
static inline bool tcp_too_many_orphans(struct sock *sk, int shift) static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
{ {
struct percpu_counter *ocp = sk->sk_prot->orphan_count; struct percpu_counter *ocp = sk->sk_prot->orphan_count;
......
...@@ -2069,12 +2069,13 @@ EXPORT_SYMBOL(__sk_mem_schedule); ...@@ -2069,12 +2069,13 @@ EXPORT_SYMBOL(__sk_mem_schedule);
/** /**
* __sk_reclaim - reclaim memory_allocated * __sk_reclaim - reclaim memory_allocated
* @sk: socket * @sk: socket
* @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
*/ */
void __sk_mem_reclaim(struct sock *sk) void __sk_mem_reclaim(struct sock *sk, int amount)
{ {
sk_memory_allocated_sub(sk, amount >>= SK_MEM_QUANTUM_SHIFT;
sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); sk_memory_allocated_sub(sk, amount);
sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
if (sk_under_memory_pressure(sk) && if (sk_under_memory_pressure(sk) &&
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
......
...@@ -815,9 +815,20 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) ...@@ -815,9 +815,20 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
/* The TCP header must be at least 32-bit aligned. */ /* The TCP header must be at least 32-bit aligned. */
size = ALIGN(size, 4); size = ALIGN(size, 4);
if (unlikely(tcp_under_memory_pressure(sk)))
sk_mem_reclaim_partial(sk);
skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
if (skb) { if (likely(skb)) {
if (sk_wmem_schedule(sk, skb->truesize)) { bool mem_schedule;
if (skb_queue_len(&sk->sk_write_queue) == 0) {
mem_schedule = true;
sk_forced_mem_schedule(sk, skb->truesize);
} else {
mem_schedule = sk_wmem_schedule(sk, skb->truesize);
}
if (likely(mem_schedule)) {
skb_reserve(skb, sk->sk_prot->max_header); skb_reserve(skb, sk->sk_prot->max_header);
/* /*
* Make sure that we have exactly size bytes * Make sure that we have exactly size bytes
...@@ -3057,11 +3068,12 @@ __setup("thash_entries=", set_thash_entries); ...@@ -3057,11 +3068,12 @@ __setup("thash_entries=", set_thash_entries);
static void __init tcp_init_mem(void) static void __init tcp_init_mem(void)
{ {
unsigned long limit = nr_free_buffer_pages() / 8; unsigned long limit = nr_free_buffer_pages() / 16;
limit = max(limit, 128UL); limit = max(limit, 128UL);
sysctl_tcp_mem[0] = limit / 4 * 3; sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */
sysctl_tcp_mem[1] = limit; sysctl_tcp_mem[1] = limit; /* 6.25 % */
sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */
} }
void __init tcp_init(void) void __init tcp_init(void)
......
...@@ -359,7 +359,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) ...@@ -359,7 +359,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
/* Check #1 */ /* Check #1 */
if (tp->rcv_ssthresh < tp->window_clamp && if (tp->rcv_ssthresh < tp->window_clamp &&
(int)tp->rcv_ssthresh < tcp_space(sk) && (int)tp->rcv_ssthresh < tcp_space(sk) &&
!sk_under_memory_pressure(sk)) { !tcp_under_memory_pressure(sk)) {
int incr; int incr;
/* Check #2. Increase window, if skb with such overhead /* Check #2. Increase window, if skb with such overhead
...@@ -446,7 +446,7 @@ static void tcp_clamp_window(struct sock *sk) ...@@ -446,7 +446,7 @@ static void tcp_clamp_window(struct sock *sk)
if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!sk_under_memory_pressure(sk) && !tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
sysctl_tcp_rmem[2]); sysctl_tcp_rmem[2]);
...@@ -4507,10 +4507,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) ...@@ -4507,10 +4507,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
if (eaten <= 0) { if (eaten <= 0) {
queue_and_out: queue_and_out:
if (eaten < 0 && if (eaten < 0) {
tcp_try_rmem_schedule(sk, skb, skb->truesize)) if (skb_queue_len(&sk->sk_receive_queue) == 0)
goto drop; sk_forced_mem_schedule(sk, skb->truesize);
else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
goto drop;
}
eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
} }
tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq);
...@@ -4781,7 +4783,7 @@ static int tcp_prune_queue(struct sock *sk) ...@@ -4781,7 +4783,7 @@ static int tcp_prune_queue(struct sock *sk)
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
tcp_clamp_window(sk); tcp_clamp_window(sk);
else if (sk_under_memory_pressure(sk)) else if (tcp_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
tcp_collapse_ofo_queue(sk); tcp_collapse_ofo_queue(sk);
...@@ -4825,7 +4827,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) ...@@ -4825,7 +4827,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk)
return false; return false;
/* If we are under global TCP memory pressure, do not expand. */ /* If we are under global TCP memory pressure, do not expand. */
if (sk_under_memory_pressure(sk)) if (tcp_under_memory_pressure(sk))
return false; return false;
/* If we are under soft global TCP memory pressure, do not expand. */ /* If we are under soft global TCP memory pressure, do not expand. */
......
...@@ -2392,7 +2392,7 @@ u32 __tcp_select_window(struct sock *sk) ...@@ -2392,7 +2392,7 @@ u32 __tcp_select_window(struct sock *sk)
if (free_space < (full_space >> 1)) { if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0; icsk->icsk_ack.quick = 0;
if (sk_under_memory_pressure(sk)) if (tcp_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->rcv_ssthresh = min(tp->rcv_ssthresh,
4U * tp->advmss); 4U * tp->advmss);
...@@ -2816,8 +2816,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk) ...@@ -2816,8 +2816,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
* connection tear down and (memory) recovery. * connection tear down and (memory) recovery.
* Otherwise tcp_send_fin() could be tempted to either delay FIN * Otherwise tcp_send_fin() could be tempted to either delay FIN
* or even be forced to close flow without any FIN. * or even be forced to close flow without any FIN.
* In general, we want to allow one skb per socket to avoid hangs
* with edge trigger epoll()
*/ */
static void sk_forced_wmem_schedule(struct sock *sk, int size) void sk_forced_mem_schedule(struct sock *sk, int size)
{ {
int amt, status; int amt, status;
...@@ -2841,7 +2843,7 @@ void tcp_send_fin(struct sock *sk) ...@@ -2841,7 +2843,7 @@ void tcp_send_fin(struct sock *sk)
* Note: in the latter case, FIN packet will be sent after a timeout, * Note: in the latter case, FIN packet will be sent after a timeout,
* as TCP stack thinks it has already been transmitted. * as TCP stack thinks it has already been transmitted.
*/ */
if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) { if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) {
coalesce: coalesce:
TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
TCP_SKB_CB(tskb)->end_seq++; TCP_SKB_CB(tskb)->end_seq++;
...@@ -2864,7 +2866,7 @@ void tcp_send_fin(struct sock *sk) ...@@ -2864,7 +2866,7 @@ void tcp_send_fin(struct sock *sk)
return; return;
} }
skb_reserve(skb, MAX_TCP_HEADER); skb_reserve(skb, MAX_TCP_HEADER);
sk_forced_wmem_schedule(sk, skb->truesize); sk_forced_mem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
tcp_init_nondata_skb(skb, tp->write_seq, tcp_init_nondata_skb(skb, tp->write_seq,
TCPHDR_ACK | TCPHDR_FIN); TCPHDR_ACK | TCPHDR_FIN);
......
...@@ -247,7 +247,7 @@ void tcp_delack_timer_handler(struct sock *sk) ...@@ -247,7 +247,7 @@ void tcp_delack_timer_handler(struct sock *sk)
} }
out: out:
if (sk_under_memory_pressure(sk)) if (tcp_under_memory_pressure(sk))
sk_mem_reclaim(sk); sk_mem_reclaim(sk);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment