Commit f8506c57 authored by Alexei Starovoitov's avatar Alexei Starovoitov

Merge branch 'bpf-reduce-memory-usage-for-bpf_global_percpu_ma'

Yonghong Song says:

====================
bpf: Reduce memory usage for bpf_global_percpu_ma

Currently when a bpf program intends to allocate memory for percpu kptr,
the verifier will call bpf_mem_alloc_init() to prefill all supported
unit sizes and this caused memory consumption very big for large number
of cpus. For example, for 128-cpu system, the total memory consumption
with initial prefill is ~175MB. Things will become worse for systems
with even more cpus.

Patch 1 avoids unnecessary extra percpu memory allocation.
Patch 2 adds objcg to bpf_mem_alloc at init stage so objcg can be
associated with root cgroup and objcg can be passed to later
bpf_mem_alloc_percpu_unit_init().
Patch 3 addresses memory consumption issue by avoiding to prefill
with all unit sizes, i.e. only prefilling with user specified size.
Patch 4 further reduces memory consumption by limiting the
number of prefill entries for percpu memory allocation.
Patch 5 has much smaller low/high watermarks for percpu allocation
to reduce memory consumption.
Patch 6 rejects percpu memory allocation with bpf_global_percpu_ma
when allocation size is greater than 512 bytes.
Patch 7 fixed test_bpf_ma test due to Patch 5.
Patch 8 added one test to show the verification failure log message.

Changelogs:
  v5 -> v6:
    . Change bpf_mem_alloc_percpu_init() to add objcg as one of parameters.
      For bpf_global_percpu_ma, the objcg is NULL, corresponding root memcg.
  v4 -> v5:
    . Do not do bpf_global_percpu_ma initialization at init stage, instead
      doing initialization when the verifier knows it is going to be used
      by bpf prog.
    . Using much smaller low/high watermarks for percpu allocation.
  v3 -> v4:
    . Add objcg to bpf_mem_alloc during init stage.
    . Initialize objcg at init stage but use it in bpf_mem_alloc_percpu_unit_init().
    . Remove check_obj_size() in bpf_mem_alloc_percpu_unit_init().
  v2 -> v3:
    . Clear the bpf_mem_cache if prefill fails.
    . Change test_bpf_ma percpu allocation tests to use bucket_size
      as allocation size instead of bucket_size - 8.
    . Remove __GFP_ZERO flag from __alloc_percpu_gfp() call.
  v1 -> v2:
    . Avoid unnecessary extra percpu memory allocation.
    . Add a separate function to do bpf_global_percpu_ma initialization
    . promote.
    . Promote function static 'sizes' array to file static.
    . Add comments to explain to refill only one item for percpu alloc.
====================

Link: https://lore.kernel.org/r/20231222031729.1287957-1-yonghong.song@linux.devSigned-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents 417fa6d1 adc8c454
......@@ -11,6 +11,7 @@ struct bpf_mem_caches;
struct bpf_mem_alloc {
struct bpf_mem_caches __percpu *caches;
struct bpf_mem_cache __percpu *cache;
struct obj_cgroup *objcg;
bool percpu;
struct work_struct work;
};
......@@ -21,8 +22,15 @@ struct bpf_mem_alloc {
* 'size = 0' is for bpf_mem_alloc which manages many fixed-size objects.
* Alloc and free are done with bpf_mem_{alloc,free}() and the size of
* the returned object is given by the size argument of bpf_mem_alloc().
* If percpu equals true, error will be returned in order to avoid
* large memory consumption and the below bpf_mem_alloc_percpu_unit_init()
* should be used to do on-demand per-cpu allocation for each size.
*/
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu);
/* Initialize a non-fix-size percpu memory allocator */
int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg);
/* The percpu allocation with a specific unit size. */
int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size);
void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma);
/* kmalloc/kfree equivalent: */
......
......@@ -121,6 +121,8 @@ struct bpf_mem_caches {
struct bpf_mem_cache cache[NUM_CACHES];
};
static const u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
static struct llist_node notrace *__llist_del_first(struct llist_head *head)
{
struct llist_node *entry, *next;
......@@ -462,11 +464,17 @@ static void notrace irq_work_raise(struct bpf_mem_cache *c)
* consume ~ 11 Kbyte per cpu.
* Typical case will be between 11K and 116K closer to 11K.
* bpf progs can and should share bpf_mem_cache when possible.
*
* Percpu allocation is typically rare. To avoid potential unnecessary large
* memory consumption, set low_mark = 1 and high_mark = 3, resulting in c->batch = 1.
*/
static void init_refill_work(struct bpf_mem_cache *c)
{
init_irq_work(&c->refill_work, bpf_mem_refill);
if (c->unit_size <= 256) {
if (c->percpu_size) {
c->low_watermark = 1;
c->high_watermark = 3;
} else if (c->unit_size <= 256) {
c->low_watermark = 32;
c->high_watermark = 96;
} else {
......@@ -483,11 +491,16 @@ static void init_refill_work(struct bpf_mem_cache *c)
static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
{
/* To avoid consuming memory assume that 1st run of bpf
* prog won't be doing more than 4 map_update_elem from
* irq disabled region
int cnt = 1;
/* To avoid consuming memory, for non-percpu allocation, assume that
* 1st run of bpf prog won't be doing more than 4 map_update_elem from
* irq disabled region if unit size is less than or equal to 256.
* For all other cases, let us just do one allocation.
*/
alloc_bulk(c, c->unit_size <= 256 ? 4 : 1, cpu_to_node(cpu), false);
if (!c->percpu_size && c->unit_size <= 256)
cnt = 4;
alloc_bulk(c, cnt, cpu_to_node(cpu), false);
}
/* When size != 0 bpf_mem_cache for each cpu.
......@@ -499,12 +512,14 @@ static void prefill_mem_cache(struct bpf_mem_cache *c, int cpu)
*/
int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
{
static u16 sizes[NUM_CACHES] = {96, 192, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
struct bpf_mem_caches *cc, __percpu *pcc;
struct bpf_mem_cache *c, __percpu *pc;
struct obj_cgroup *objcg = NULL;
int cpu, i, unit_size, percpu_size = 0;
if (percpu && size == 0)
return -EINVAL;
/* room for llist_node and per-cpu pointer */
if (percpu)
percpu_size = LLIST_NODE_SZ + sizeof(void *);
......@@ -523,6 +538,8 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
if (memcg_bpf_enabled())
objcg = get_obj_cgroup_from_current();
#endif
ma->objcg = objcg;
for_each_possible_cpu(cpu) {
c = per_cpu_ptr(pc, cpu);
c->unit_size = unit_size;
......@@ -542,6 +559,7 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
#ifdef CONFIG_MEMCG_KMEM
objcg = get_obj_cgroup_from_current();
#endif
ma->objcg = objcg;
for_each_possible_cpu(cpu) {
cc = per_cpu_ptr(pcc, cpu);
for (i = 0; i < NUM_CACHES; i++) {
......@@ -560,6 +578,56 @@ int bpf_mem_alloc_init(struct bpf_mem_alloc *ma, int size, bool percpu)
return 0;
}
int bpf_mem_alloc_percpu_init(struct bpf_mem_alloc *ma, struct obj_cgroup *objcg)
{
struct bpf_mem_caches __percpu *pcc;
pcc = __alloc_percpu_gfp(sizeof(struct bpf_mem_caches), 8, GFP_KERNEL);
if (!pcc)
return -ENOMEM;
ma->caches = pcc;
ma->objcg = objcg;
ma->percpu = true;
return 0;
}
int bpf_mem_alloc_percpu_unit_init(struct bpf_mem_alloc *ma, int size)
{
struct bpf_mem_caches *cc, __percpu *pcc;
int cpu, i, unit_size, percpu_size;
struct obj_cgroup *objcg;
struct bpf_mem_cache *c;
i = bpf_mem_cache_idx(size);
if (i < 0)
return -EINVAL;
/* room for llist_node and per-cpu pointer */
percpu_size = LLIST_NODE_SZ + sizeof(void *);
unit_size = sizes[i];
objcg = ma->objcg;
pcc = ma->caches;
for_each_possible_cpu(cpu) {
cc = per_cpu_ptr(pcc, cpu);
c = &cc->cache[i];
if (cpu == 0 && c->unit_size)
break;
c->unit_size = unit_size;
c->objcg = objcg;
c->percpu_size = percpu_size;
c->tgt = c;
init_refill_work(c);
prefill_mem_cache(c, cpu);
}
return 0;
}
static void drain_mem_cache(struct bpf_mem_cache *c)
{
bool percpu = !!c->percpu_size;
......@@ -691,9 +759,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
rcu_in_progress += atomic_read(&c->call_rcu_ttrace_in_progress);
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
/* objcg is the same across cpus */
if (c->objcg)
obj_cgroup_put(c->objcg);
if (ma->objcg)
obj_cgroup_put(ma->objcg);
destroy_mem_alloc(ma, rcu_in_progress);
}
if (ma->caches) {
......@@ -709,8 +776,8 @@ void bpf_mem_alloc_destroy(struct bpf_mem_alloc *ma)
rcu_in_progress += atomic_read(&c->call_rcu_in_progress);
}
}
if (c->objcg)
obj_cgroup_put(c->objcg);
if (ma->objcg)
obj_cgroup_put(ma->objcg);
destroy_mem_alloc(ma, rcu_in_progress);
}
}
......@@ -833,7 +900,9 @@ void notrace *bpf_mem_alloc(struct bpf_mem_alloc *ma, size_t size)
if (!size)
return NULL;
idx = bpf_mem_cache_idx(size + LLIST_NODE_SZ);
if (!ma->percpu)
size += LLIST_NODE_SZ;
idx = bpf_mem_cache_idx(size);
if (idx < 0)
return NULL;
......
......@@ -195,6 +195,8 @@ struct bpf_verifier_stack_elem {
POISON_POINTER_DELTA))
#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV))
#define BPF_GLOBAL_PERCPU_MA_MAX_SIZE 512
static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx);
static int release_reference(struct bpf_verifier_env *env, int ref_obj_id);
static void invalidate_non_owning_refs(struct bpf_verifier_env *env);
......@@ -12139,20 +12141,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
if (meta.func_id == special_kfunc_list[KF_bpf_obj_new_impl] && !bpf_global_ma_set)
return -ENOMEM;
if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
if (!bpf_global_percpu_ma_set) {
mutex_lock(&bpf_percpu_ma_lock);
if (!bpf_global_percpu_ma_set) {
err = bpf_mem_alloc_init(&bpf_global_percpu_ma, 0, true);
if (!err)
bpf_global_percpu_ma_set = true;
}
mutex_unlock(&bpf_percpu_ma_lock);
if (err)
return err;
}
}
if (((u64)(u32)meta.arg_constant.value) != meta.arg_constant.value) {
verbose(env, "local type ID argument must be in range [0, U32_MAX]\n");
return -EINVAL;
......@@ -12173,6 +12161,35 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EINVAL;
}
if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
if (ret_t->size > BPF_GLOBAL_PERCPU_MA_MAX_SIZE) {
verbose(env, "bpf_percpu_obj_new type size (%d) is greater than %d\n",
ret_t->size, BPF_GLOBAL_PERCPU_MA_MAX_SIZE);
return -EINVAL;
}
if (!bpf_global_percpu_ma_set) {
mutex_lock(&bpf_percpu_ma_lock);
if (!bpf_global_percpu_ma_set) {
/* Charge memory allocated with bpf_global_percpu_ma to
* root memcg. The obj_cgroup for root memcg is NULL.
*/
err = bpf_mem_alloc_percpu_init(&bpf_global_percpu_ma, NULL);
if (!err)
bpf_global_percpu_ma_set = true;
}
mutex_unlock(&bpf_percpu_ma_lock);
if (err)
return err;
}
mutex_lock(&bpf_percpu_ma_lock);
err = bpf_mem_alloc_percpu_unit_init(&bpf_global_percpu_ma, ret_t->size);
mutex_unlock(&bpf_percpu_ma_lock);
if (err)
return err;
}
struct_meta = btf_find_struct_meta(ret_btf, ret_btf_id);
if (meta.func_id == special_kfunc_list[KF_bpf_percpu_obj_new_impl]) {
if (!__btf_type_is_scalar_struct(env, ret_btf, ret_t, 0)) {
......
......@@ -14,7 +14,8 @@ static void do_bpf_ma_test(const char *name)
struct test_bpf_ma *skel;
struct bpf_program *prog;
struct btf *btf;
int i, err;
int i, err, id;
char tname[32];
skel = test_bpf_ma__open();
if (!ASSERT_OK_PTR(skel, "open"))
......@@ -25,16 +26,21 @@ static void do_bpf_ma_test(const char *name)
goto out;
for (i = 0; i < ARRAY_SIZE(skel->rodata->data_sizes); i++) {
char name[32];
int id;
snprintf(name, sizeof(name), "bin_data_%u", skel->rodata->data_sizes[i]);
id = btf__find_by_name_kind(btf, name, BTF_KIND_STRUCT);
if (!ASSERT_GT(id, 0, "bin_data"))
snprintf(tname, sizeof(tname), "bin_data_%u", skel->rodata->data_sizes[i]);
id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT);
if (!ASSERT_GT(id, 0, tname))
goto out;
skel->rodata->data_btf_ids[i] = id;
}
for (i = 0; i < ARRAY_SIZE(skel->rodata->percpu_data_sizes); i++) {
snprintf(tname, sizeof(tname), "percpu_bin_data_%u", skel->rodata->percpu_data_sizes[i]);
id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT);
if (!ASSERT_GT(id, 0, tname))
goto out;
skel->rodata->percpu_data_btf_ids[i] = id;
}
prog = bpf_object__find_program_by_name(skel->obj, name);
if (!ASSERT_OK_PTR(prog, "invalid prog name"))
goto out;
......
......@@ -17,6 +17,10 @@ struct val_with_rb_root_t {
struct bpf_spin_lock lock;
};
struct val_600b_t {
char b[600];
};
struct elem {
long sum;
struct val_t __percpu_kptr *pc;
......@@ -161,4 +165,18 @@ int BPF_PROG(test_array_map_7)
return 0;
}
SEC("?fentry.s/bpf_fentry_test1")
__failure __msg("bpf_percpu_obj_new type size (600) is greater than 512")
int BPF_PROG(test_array_map_8)
{
struct val_600b_t __percpu_kptr *p;
p = bpf_percpu_obj_new(struct val_600b_t);
if (!p)
return 0;
bpf_percpu_obj_drop(p);
return 0;
}
char _license[] SEC("license") = "GPL";
......@@ -20,6 +20,9 @@ char _license[] SEC("license") = "GPL";
const unsigned int data_sizes[] = {16, 32, 64, 96, 128, 192, 256, 512, 1024, 2048, 4096};
const volatile unsigned int data_btf_ids[ARRAY_SIZE(data_sizes)] = {};
const unsigned int percpu_data_sizes[] = {8, 16, 32, 64, 96, 128, 192, 256, 512};
const volatile unsigned int percpu_data_btf_ids[ARRAY_SIZE(data_sizes)] = {};
int err = 0;
u32 pid = 0;
......@@ -27,10 +30,10 @@ u32 pid = 0;
struct bin_data_##_size { \
char data[_size - sizeof(void *)]; \
}; \
/* See Commit 5d8d6634ccc, force btf generation for type bin_data_##_size */ \
struct bin_data_##_size *__bin_data_##_size; \
struct map_value_##_size { \
struct bin_data_##_size __kptr * data; \
/* To emit BTF info for bin_data_xx */ \
struct bin_data_##_size not_used; \
}; \
struct { \
__uint(type, BPF_MAP_TYPE_ARRAY); \
......@@ -40,8 +43,12 @@ u32 pid = 0;
} array_##_size SEC(".maps")
#define DEFINE_ARRAY_WITH_PERCPU_KPTR(_size) \
struct percpu_bin_data_##_size { \
char data[_size]; \
}; \
struct percpu_bin_data_##_size *__percpu_bin_data_##_size; \
struct map_value_percpu_##_size { \
struct bin_data_##_size __percpu_kptr * data; \
struct percpu_bin_data_##_size __percpu_kptr * data; \
}; \
struct { \
__uint(type, BPF_MAP_TYPE_ARRAY); \
......@@ -114,7 +121,7 @@ static __always_inline void batch_percpu_alloc(struct bpf_map *map, unsigned int
return;
}
/* per-cpu allocator may not be able to refill in time */
new = bpf_percpu_obj_new_impl(data_btf_ids[idx], NULL);
new = bpf_percpu_obj_new_impl(percpu_data_btf_ids[idx], NULL);
if (!new)
continue;
......@@ -179,7 +186,7 @@ DEFINE_ARRAY_WITH_KPTR(1024);
DEFINE_ARRAY_WITH_KPTR(2048);
DEFINE_ARRAY_WITH_KPTR(4096);
/* per-cpu kptr doesn't support bin_data_8 which is a zero-sized array */
DEFINE_ARRAY_WITH_PERCPU_KPTR(8);
DEFINE_ARRAY_WITH_PERCPU_KPTR(16);
DEFINE_ARRAY_WITH_PERCPU_KPTR(32);
DEFINE_ARRAY_WITH_PERCPU_KPTR(64);
......@@ -188,9 +195,6 @@ DEFINE_ARRAY_WITH_PERCPU_KPTR(128);
DEFINE_ARRAY_WITH_PERCPU_KPTR(192);
DEFINE_ARRAY_WITH_PERCPU_KPTR(256);
DEFINE_ARRAY_WITH_PERCPU_KPTR(512);
DEFINE_ARRAY_WITH_PERCPU_KPTR(1024);
DEFINE_ARRAY_WITH_PERCPU_KPTR(2048);
DEFINE_ARRAY_WITH_PERCPU_KPTR(4096);
SEC("?fentry/" SYS_PREFIX "sys_nanosleep")
int test_batch_alloc_free(void *ctx)
......@@ -246,20 +250,18 @@ int test_batch_percpu_alloc_free(void *ctx)
if ((u32)bpf_get_current_pid_tgid() != pid)
return 0;
/* Alloc 128 16-bytes per-cpu objects in batch to trigger refilling,
* then free 128 16-bytes per-cpu objects in batch to trigger freeing.
/* Alloc 128 8-bytes per-cpu objects in batch to trigger refilling,
* then free 128 8-bytes per-cpu objects in batch to trigger freeing.
*/
CALL_BATCH_PERCPU_ALLOC_FREE(16, 128, 0);
CALL_BATCH_PERCPU_ALLOC_FREE(32, 128, 1);
CALL_BATCH_PERCPU_ALLOC_FREE(64, 128, 2);
CALL_BATCH_PERCPU_ALLOC_FREE(96, 128, 3);
CALL_BATCH_PERCPU_ALLOC_FREE(128, 128, 4);
CALL_BATCH_PERCPU_ALLOC_FREE(192, 128, 5);
CALL_BATCH_PERCPU_ALLOC_FREE(256, 128, 6);
CALL_BATCH_PERCPU_ALLOC_FREE(512, 64, 7);
CALL_BATCH_PERCPU_ALLOC_FREE(1024, 32, 8);
CALL_BATCH_PERCPU_ALLOC_FREE(2048, 16, 9);
CALL_BATCH_PERCPU_ALLOC_FREE(4096, 8, 10);
CALL_BATCH_PERCPU_ALLOC_FREE(8, 128, 0);
CALL_BATCH_PERCPU_ALLOC_FREE(16, 128, 1);
CALL_BATCH_PERCPU_ALLOC_FREE(32, 128, 2);
CALL_BATCH_PERCPU_ALLOC_FREE(64, 128, 3);
CALL_BATCH_PERCPU_ALLOC_FREE(96, 128, 4);
CALL_BATCH_PERCPU_ALLOC_FREE(128, 128, 5);
CALL_BATCH_PERCPU_ALLOC_FREE(192, 128, 6);
CALL_BATCH_PERCPU_ALLOC_FREE(256, 128, 7);
CALL_BATCH_PERCPU_ALLOC_FREE(512, 64, 8);
return 0;
}
......@@ -270,20 +272,18 @@ int test_percpu_free_through_map_free(void *ctx)
if ((u32)bpf_get_current_pid_tgid() != pid)
return 0;
/* Alloc 128 16-bytes per-cpu objects in batch to trigger refilling,
/* Alloc 128 8-bytes per-cpu objects in batch to trigger refilling,
* then free these object through map free.
*/
CALL_BATCH_PERCPU_ALLOC(16, 128, 0);
CALL_BATCH_PERCPU_ALLOC(32, 128, 1);
CALL_BATCH_PERCPU_ALLOC(64, 128, 2);
CALL_BATCH_PERCPU_ALLOC(96, 128, 3);
CALL_BATCH_PERCPU_ALLOC(128, 128, 4);
CALL_BATCH_PERCPU_ALLOC(192, 128, 5);
CALL_BATCH_PERCPU_ALLOC(256, 128, 6);
CALL_BATCH_PERCPU_ALLOC(512, 64, 7);
CALL_BATCH_PERCPU_ALLOC(1024, 32, 8);
CALL_BATCH_PERCPU_ALLOC(2048, 16, 9);
CALL_BATCH_PERCPU_ALLOC(4096, 8, 10);
CALL_BATCH_PERCPU_ALLOC(8, 128, 0);
CALL_BATCH_PERCPU_ALLOC(16, 128, 1);
CALL_BATCH_PERCPU_ALLOC(32, 128, 2);
CALL_BATCH_PERCPU_ALLOC(64, 128, 3);
CALL_BATCH_PERCPU_ALLOC(96, 128, 4);
CALL_BATCH_PERCPU_ALLOC(128, 128, 5);
CALL_BATCH_PERCPU_ALLOC(192, 128, 6);
CALL_BATCH_PERCPU_ALLOC(256, 128, 7);
CALL_BATCH_PERCPU_ALLOC(512, 64, 8);
return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment