Commit 180d8cd9 authored by Glauber Costa's avatar Glauber Costa Committed by David S. Miller

foundations of per-cgroup memory pressure controlling.

This patch replaces all uses of struct sock fields' memory_pressure,
memory_allocated, sockets_allocated, and sysctl_mem to accessor
macros. Those macros can either receive a socket argument, or a mem_cgroup
argument, depending on the context they live in.

Since this is only a macro wrapping, no performance impact at all is
expected in the case where the cgroup memory controller is not in use.
Signed-off-by: default avatarGlauber Costa <glommer@parallels.com>
Reviewed-by: default avatarHiroyuki Kamezawa <kamezawa.hiroyu@jp.fujitsu.com>
CC: David S. Miller <davem@davemloft.net>
CC: Eric W. Biederman <ebiederm@xmission.com>
CC: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent e5671dfa
...@@ -53,6 +53,7 @@ ...@@ -53,6 +53,7 @@
#include <linux/security.h> #include <linux/security.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include <linux/memcontrol.h>
#include <linux/filter.h> #include <linux/filter.h>
#include <linux/rculist_nulls.h> #include <linux/rculist_nulls.h>
...@@ -867,6 +868,99 @@ static inline void sk_refcnt_debug_release(const struct sock *sk) ...@@ -867,6 +868,99 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
#define sk_refcnt_debug_release(sk) do { } while (0) #define sk_refcnt_debug_release(sk) do { } while (0)
#endif /* SOCK_REFCNT_DEBUG */ #endif /* SOCK_REFCNT_DEBUG */
static inline bool sk_has_memory_pressure(const struct sock *sk)
{
return sk->sk_prot->memory_pressure != NULL;
}
/*
 * True when the socket's protocol both tracks memory pressure and is
 * currently under it; false for protocols that do not track it.
 */
static inline bool sk_under_memory_pressure(const struct sock *sk)
{
	int *pressure = sk->sk_prot->memory_pressure;

	return pressure ? *pressure != 0 : false;
}
static inline void sk_leave_memory_pressure(struct sock *sk)
{
int *memory_pressure = sk->sk_prot->memory_pressure;
if (memory_pressure && *memory_pressure)
*memory_pressure = 0;
}
static inline void sk_enter_memory_pressure(struct sock *sk)
{
if (sk->sk_prot->enter_memory_pressure)
sk->sk_prot->enter_memory_pressure(sk);
}
/*
 * Fetch one of the protocol's sysctl memory limits.
 * Per the callers in __sk_mem_schedule(): index 0 = low (under limit),
 * 1 = pressure threshold, 2 = hard limit.
 */
static inline long sk_prot_mem_limits(const struct sock *sk, int index)
{
	return sk->sk_prot->sysctl_mem[index];
}
/* Current protocol-wide allocated memory, in SK_MEM_QUANTUM pages. */
static inline long
sk_memory_allocated(const struct sock *sk)
{
	return atomic_long_read(sk->sk_prot->memory_allocated);
}
/*
 * Charge @amt to the protocol-wide memory counter and return the new
 * total, so callers can compare it against sk_prot_mem_limits().
 */
static inline long
sk_memory_allocated_add(struct sock *sk, int amt)
{
	return atomic_long_add_return(amt, sk->sk_prot->memory_allocated);
}
/* Uncharge @amt from the protocol-wide memory counter. */
static inline void
sk_memory_allocated_sub(struct sock *sk, int amt)
{
	atomic_long_sub(amt, sk->sk_prot->memory_allocated);
}
/* One fewer socket of this protocol is allocated. */
static inline void sk_sockets_allocated_dec(struct sock *sk)
{
	percpu_counter_dec(sk->sk_prot->sockets_allocated);
}
/* One more socket of this protocol is allocated. */
static inline void sk_sockets_allocated_inc(struct sock *sk)
{
	percpu_counter_inc(sk->sk_prot->sockets_allocated);
}
/* Non-negative count of sockets allocated for this socket's protocol. */
static inline int
sk_sockets_allocated_read_positive(struct sock *sk)
{
	return percpu_counter_sum_positive(sk->sk_prot->sockets_allocated);
}
/*
 * Non-negative socket count for @proto, addressed by protocol rather
 * than by socket (for e.g. /proc reporting).
 */
static inline int
proto_sockets_allocated_sum_positive(struct proto *proto)
{
	return percpu_counter_sum_positive(proto->sockets_allocated);
}
/* Protocol-wide allocated memory, addressed by protocol rather than socket. */
static inline long
proto_memory_allocated(struct proto *proto)
{
	return atomic_long_read(proto->memory_allocated);
}
static inline bool
proto_memory_pressure(struct proto *prot)
{
if (!prot->memory_pressure)
return false;
return !!*prot->memory_pressure;
}
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
/* Called with local bh disabled */ /* Called with local bh disabled */
...@@ -1674,7 +1768,7 @@ static inline struct page *sk_stream_alloc_page(struct sock *sk) ...@@ -1674,7 +1768,7 @@ static inline struct page *sk_stream_alloc_page(struct sock *sk)
page = alloc_pages(sk->sk_allocation, 0); page = alloc_pages(sk->sk_allocation, 0);
if (!page) { if (!page) {
sk->sk_prot->enter_memory_pressure(sk); sk_enter_memory_pressure(sk);
sk_stream_moderate_sndbuf(sk); sk_stream_moderate_sndbuf(sk);
} }
return page; return page;
......
...@@ -44,6 +44,7 @@ ...@@ -44,6 +44,7 @@
#include <net/dst.h> #include <net/dst.h>
#include <linux/seq_file.h> #include <linux/seq_file.h>
#include <linux/memcontrol.h>
extern struct inet_hashinfo tcp_hashinfo; extern struct inet_hashinfo tcp_hashinfo;
...@@ -285,7 +286,7 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift) ...@@ -285,7 +286,7 @@ static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
} }
if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
atomic_long_read(&tcp_memory_allocated) > sysctl_tcp_mem[2]) sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2))
return true; return true;
return false; return false;
} }
......
...@@ -1323,7 +1323,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) ...@@ -1323,7 +1323,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk->sk_wq = NULL; newsk->sk_wq = NULL;
if (newsk->sk_prot->sockets_allocated) if (newsk->sk_prot->sockets_allocated)
percpu_counter_inc(newsk->sk_prot->sockets_allocated); sk_sockets_allocated_inc(newsk);
if (newsk->sk_flags & SK_FLAGS_TIMESTAMP) if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
net_enable_timestamp(); net_enable_timestamp();
...@@ -1713,28 +1713,28 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) ...@@ -1713,28 +1713,28 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
long allocated; long allocated;
sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
allocated = atomic_long_add_return(amt, prot->memory_allocated);
allocated = sk_memory_allocated_add(sk, amt);
/* Under limit. */ /* Under limit. */
if (allocated <= prot->sysctl_mem[0]) { if (allocated <= sk_prot_mem_limits(sk, 0)) {
if (prot->memory_pressure && *prot->memory_pressure) sk_leave_memory_pressure(sk);
*prot->memory_pressure = 0;
return 1; return 1;
} }
/* Under pressure. */ /* Under pressure. */
if (allocated > prot->sysctl_mem[1]) if (allocated > sk_prot_mem_limits(sk, 1))
if (prot->enter_memory_pressure) sk_enter_memory_pressure(sk);
prot->enter_memory_pressure(sk);
/* Over hard limit. */ /* Over hard limit. */
if (allocated > prot->sysctl_mem[2]) if (allocated > sk_prot_mem_limits(sk, 2))
goto suppress_allocation; goto suppress_allocation;
/* guarantee minimum buffer size under pressure */ /* guarantee minimum buffer size under pressure */
if (kind == SK_MEM_RECV) { if (kind == SK_MEM_RECV) {
if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0]) if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
return 1; return 1;
} else { /* SK_MEM_SEND */ } else { /* SK_MEM_SEND */
if (sk->sk_type == SOCK_STREAM) { if (sk->sk_type == SOCK_STREAM) {
if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
...@@ -1744,13 +1744,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) ...@@ -1744,13 +1744,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
return 1; return 1;
} }
if (prot->memory_pressure) { if (sk_has_memory_pressure(sk)) {
int alloc; int alloc;
if (!*prot->memory_pressure) if (!sk_under_memory_pressure(sk))
return 1; return 1;
alloc = percpu_counter_read_positive(prot->sockets_allocated); alloc = sk_sockets_allocated_read_positive(sk);
if (prot->sysctl_mem[2] > alloc * if (sk_prot_mem_limits(sk, 2) > alloc *
sk_mem_pages(sk->sk_wmem_queued + sk_mem_pages(sk->sk_wmem_queued +
atomic_read(&sk->sk_rmem_alloc) + atomic_read(&sk->sk_rmem_alloc) +
sk->sk_forward_alloc)) sk->sk_forward_alloc))
...@@ -1773,7 +1773,9 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind) ...@@ -1773,7 +1773,9 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
/* Alas. Undo changes. */ /* Alas. Undo changes. */
sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
atomic_long_sub(amt, prot->memory_allocated);
sk_memory_allocated_sub(sk, amt);
return 0; return 0;
} }
EXPORT_SYMBOL(__sk_mem_schedule); EXPORT_SYMBOL(__sk_mem_schedule);
...@@ -1784,15 +1786,13 @@ EXPORT_SYMBOL(__sk_mem_schedule); ...@@ -1784,15 +1786,13 @@ EXPORT_SYMBOL(__sk_mem_schedule);
*/ */
void __sk_mem_reclaim(struct sock *sk) void __sk_mem_reclaim(struct sock *sk)
{ {
struct proto *prot = sk->sk_prot; sk_memory_allocated_sub(sk,
sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
prot->memory_allocated);
sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
if (prot->memory_pressure && *prot->memory_pressure && if (sk_under_memory_pressure(sk) &&
(atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0])) (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
*prot->memory_pressure = 0; sk_leave_memory_pressure(sk);
} }
EXPORT_SYMBOL(__sk_mem_reclaim); EXPORT_SYMBOL(__sk_mem_reclaim);
...@@ -2507,16 +2507,27 @@ static char proto_method_implemented(const void *method) ...@@ -2507,16 +2507,27 @@ static char proto_method_implemented(const void *method)
{ {
return method == NULL ? 'n' : 'y'; return method == NULL ? 'n' : 'y';
} }
/*
 * Memory allocated by @proto for /proc reporting, or -1 when the
 * protocol does not account memory ("not implemented" sentinel).
 */
static long sock_prot_memory_allocated(struct proto *proto)
{
	if (proto->memory_allocated == NULL)
		return -1L;

	return proto_memory_allocated(proto);
}
/*
 * Memory-pressure state of @proto for /proc reporting: "yes"/"no" when
 * the protocol tracks pressure, "NI" (not implemented) when it doesn't.
 *
 * Returns string literals, which are immutable, so the pointer is
 * const-qualified; the only caller merely feeds it to seq_printf("%s").
 */
static const char *sock_prot_memory_pressure(struct proto *proto)
{
	if (proto->memory_pressure == NULL)
		return "NI";

	return proto_memory_pressure(proto) ? "yes" : "no";
}
static void proto_seq_printf(struct seq_file *seq, struct proto *proto) static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{ {
seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
proto->name, proto->name,
proto->obj_size, proto->obj_size,
sock_prot_inuse_get(seq_file_net(seq), proto), sock_prot_inuse_get(seq_file_net(seq), proto),
proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L, sock_prot_memory_allocated(proto),
proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", sock_prot_memory_pressure(proto),
proto->max_header, proto->max_header,
proto->slab == NULL ? "no" : "yes", proto->slab == NULL ? "no" : "yes",
module_name(proto->owner), module_name(proto->owner),
......
...@@ -56,17 +56,17 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) ...@@ -56,17 +56,17 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
local_bh_disable(); local_bh_disable();
orphans = percpu_counter_sum_positive(&tcp_orphan_count); orphans = percpu_counter_sum_positive(&tcp_orphan_count);
sockets = percpu_counter_sum_positive(&tcp_sockets_allocated); sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
local_bh_enable(); local_bh_enable();
socket_seq_show(seq); socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
sock_prot_inuse_get(net, &tcp_prot), orphans, sock_prot_inuse_get(net, &tcp_prot), orphans,
tcp_death_row.tw_count, sockets, tcp_death_row.tw_count, sockets,
atomic_long_read(&tcp_memory_allocated)); proto_memory_allocated(&tcp_prot));
seq_printf(seq, "UDP: inuse %d mem %ld\n", seq_printf(seq, "UDP: inuse %d mem %ld\n",
sock_prot_inuse_get(net, &udp_prot), sock_prot_inuse_get(net, &udp_prot),
atomic_long_read(&udp_memory_allocated)); proto_memory_allocated(&udp_prot));
seq_printf(seq, "UDPLITE: inuse %d\n", seq_printf(seq, "UDPLITE: inuse %d\n",
sock_prot_inuse_get(net, &udplite_prot)); sock_prot_inuse_get(net, &udplite_prot));
seq_printf(seq, "RAW: inuse %d\n", seq_printf(seq, "RAW: inuse %d\n",
......
...@@ -322,7 +322,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) ...@@ -322,7 +322,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb)
/* Check #1 */ /* Check #1 */
if (tp->rcv_ssthresh < tp->window_clamp && if (tp->rcv_ssthresh < tp->window_clamp &&
(int)tp->rcv_ssthresh < tcp_space(sk) && (int)tp->rcv_ssthresh < tcp_space(sk) &&
!tcp_memory_pressure) { !sk_under_memory_pressure(sk)) {
int incr; int incr;
/* Check #2. Increase window, if skb with such overhead /* Check #2. Increase window, if skb with such overhead
...@@ -411,8 +411,8 @@ static void tcp_clamp_window(struct sock *sk) ...@@ -411,8 +411,8 @@ static void tcp_clamp_window(struct sock *sk)
if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!tcp_memory_pressure && !sk_under_memory_pressure(sk) &&
atomic_long_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
sysctl_tcp_rmem[2]); sysctl_tcp_rmem[2]);
} }
...@@ -4866,7 +4866,7 @@ static int tcp_prune_queue(struct sock *sk) ...@@ -4866,7 +4866,7 @@ static int tcp_prune_queue(struct sock *sk)
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
tcp_clamp_window(sk); tcp_clamp_window(sk);
else if (tcp_memory_pressure) else if (sk_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
tcp_collapse_ofo_queue(sk); tcp_collapse_ofo_queue(sk);
...@@ -4932,11 +4932,11 @@ static int tcp_should_expand_sndbuf(const struct sock *sk) ...@@ -4932,11 +4932,11 @@ static int tcp_should_expand_sndbuf(const struct sock *sk)
return 0; return 0;
/* If we are under global TCP memory pressure, do not expand. */ /* If we are under global TCP memory pressure, do not expand. */
if (tcp_memory_pressure) if (sk_under_memory_pressure(sk))
return 0; return 0;
/* If we are under soft global TCP memory pressure, do not expand. */ /* If we are under soft global TCP memory pressure, do not expand. */
if (atomic_long_read(&tcp_memory_allocated) >= sysctl_tcp_mem[0]) if (sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
return 0; return 0;
/* If we filled the congestion window, do not expand. */ /* If we filled the congestion window, do not expand. */
......
...@@ -1917,7 +1917,7 @@ static int tcp_v4_init_sock(struct sock *sk) ...@@ -1917,7 +1917,7 @@ static int tcp_v4_init_sock(struct sock *sk)
sk->sk_rcvbuf = sysctl_tcp_rmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1];
local_bh_disable(); local_bh_disable();
percpu_counter_inc(&tcp_sockets_allocated); sk_sockets_allocated_inc(sk);
local_bh_enable(); local_bh_enable();
return 0; return 0;
...@@ -1973,7 +1973,7 @@ void tcp_v4_destroy_sock(struct sock *sk) ...@@ -1973,7 +1973,7 @@ void tcp_v4_destroy_sock(struct sock *sk)
tp->cookie_values = NULL; tp->cookie_values = NULL;
} }
percpu_counter_dec(&tcp_sockets_allocated); sk_sockets_allocated_dec(sk);
} }
EXPORT_SYMBOL(tcp_v4_destroy_sock); EXPORT_SYMBOL(tcp_v4_destroy_sock);
......
...@@ -1922,7 +1922,7 @@ u32 __tcp_select_window(struct sock *sk) ...@@ -1922,7 +1922,7 @@ u32 __tcp_select_window(struct sock *sk)
if (free_space < (full_space >> 1)) { if (free_space < (full_space >> 1)) {
icsk->icsk_ack.quick = 0; icsk->icsk_ack.quick = 0;
if (tcp_memory_pressure) if (sk_under_memory_pressure(sk))
tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->rcv_ssthresh = min(tp->rcv_ssthresh,
4U * tp->advmss); 4U * tp->advmss);
......
...@@ -261,7 +261,7 @@ static void tcp_delack_timer(unsigned long data) ...@@ -261,7 +261,7 @@ static void tcp_delack_timer(unsigned long data)
} }
out: out:
if (tcp_memory_pressure) if (sk_under_memory_pressure(sk))
sk_mem_reclaim(sk); sk_mem_reclaim(sk);
out_unlock: out_unlock:
bh_unlock_sock(sk); bh_unlock_sock(sk);
......
...@@ -1994,7 +1994,7 @@ static int tcp_v6_init_sock(struct sock *sk) ...@@ -1994,7 +1994,7 @@ static int tcp_v6_init_sock(struct sock *sk)
sk->sk_rcvbuf = sysctl_tcp_rmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1];
local_bh_disable(); local_bh_disable();
percpu_counter_inc(&tcp_sockets_allocated); sk_sockets_allocated_inc(sk);
local_bh_enable(); local_bh_enable();
return 0; return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment