Commit 35897c3c authored by Alexei Starovoitov's avatar Alexei Starovoitov

Merge branch 'bpf-enable-irq-after-irq_work_raise-completes'

Hou Tao says:

====================
bpf: Enable IRQ after irq_work_raise() completes

From: Hou Tao <houtao1@huawei.com>

Hi,

The patchset aims to fix the problem that bpf_mem_alloc() may return
NULL unexpectedly when multiple bpf_mem_alloc() are invoked concurrently
under process context and there is still free memory available. The
problem was found when doing stress test for qp-trie but the same
problem also exists for bpf_obj_new() as demonstrated in patch #3.

As pointed out by Alexei, the patchset can only fix ENOMEM problem for
normal process context and can not fix the problem for irq-disabled
context or RT-enabled kernel.

Patch #1 fixes the race between unit_alloc() and unit_alloc(). Patch #2
fixes the race between unit_alloc() and unit_free(). And patch #3 adds
a selftest for the problem. The major change compared with v1 is using
local_irq_{save,restore)() pair to disable and enable preemption
instead of preempt_{disable,enable}_notrace pair. The main reason is to
prevent potential overhead from __preempt_schedule_notrace(). I also
run htab_mem benchmark and hash_map_perf on a 8-CPUs KVM VM to compare
the performance between local_irq_{save,restore} and
preempt_{disable,enable}_notrace(), but the results are similar as shown
below:

(1) use preempt_{disable,enable}_notrace()

[root@hello bpf]# ./map_perf_test 4 8
0:hash_map_perf kmalloc 652179 events per sec
1:hash_map_perf kmalloc 651880 events per sec
2:hash_map_perf kmalloc 651382 events per sec
3:hash_map_perf kmalloc 650791 events per sec
5:hash_map_perf kmalloc 650140 events per sec
6:hash_map_perf kmalloc 652773 events per sec
7:hash_map_perf kmalloc 652751 events per sec
4:hash_map_perf kmalloc 648199 events per sec

[root@hello bpf]# ./benchs/run_bench_htab_mem.sh
normal bpf ma
=============
overwrite            per-prod-op: 110.82 ± 0.02k/s, avg mem: 2.00 ± 0.00MiB, peak mem: 2.73MiB
batch_add_batch_del  per-prod-op: 89.79 ± 0.75k/s, avg mem: 1.68 ± 0.38MiB, peak mem: 2.73MiB
add_del_on_diff_cpu  per-prod-op: 17.83 ± 0.07k/s, avg mem: 25.68 ± 2.92MiB, peak mem: 35.10MiB

(2) use local_irq_{save,restore}

[root@hello bpf]# ./map_perf_test 4 8
0:hash_map_perf kmalloc 656299 events per sec
1:hash_map_perf kmalloc 656397 events per sec
2:hash_map_perf kmalloc 656046 events per sec
3:hash_map_perf kmalloc 655723 events per sec
5:hash_map_perf kmalloc 655221 events per sec
4:hash_map_perf kmalloc 654617 events per sec
6:hash_map_perf kmalloc 650269 events per sec
7:hash_map_perf kmalloc 653665 events per sec

[root@hello bpf]# ./benchs/run_bench_htab_mem.sh
normal bpf ma
=============
overwrite            per-prod-op: 116.10 ± 0.02k/s, avg mem: 2.00 ± 0.00MiB, peak mem: 2.74MiB
batch_add_batch_del  per-prod-op: 88.76 ± 0.61k/s, avg mem: 1.94 ± 0.33MiB, peak mem: 2.74MiB
add_del_on_diff_cpu  per-prod-op: 18.12 ± 0.08k/s, avg mem: 25.10 ± 2.70MiB, peak mem: 34.78MiB

As ususal comments are always welcome.

Change Log:
v2:
  * Use local_irq_save to disable preemption instead of using
    preempt_{disable,enable}_notrace pair to prevent potential overhead

v1: https://lore.kernel.org/bpf/20230822133807.3198625-1-houtao@huaweicloud.com/
====================

Link: https://lore.kernel.org/r/20230901111954.1804721-1-houtao@huaweicloud.comSigned-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents 1e4a6d97 29c11aa8
...@@ -732,12 +732,17 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c) ...@@ -732,12 +732,17 @@ static void notrace *unit_alloc(struct bpf_mem_cache *c)
} }
} }
local_dec(&c->active); local_dec(&c->active);
local_irq_restore(flags);
WARN_ON(cnt < 0); WARN_ON(cnt < 0);
if (cnt < c->low_watermark) if (cnt < c->low_watermark)
irq_work_raise(c); irq_work_raise(c);
/* Enable IRQ after the enqueue of irq work completes, so irq work
* will run after IRQ is enabled and free_llist may be refilled by
* irq work before other task preempts current task.
*/
local_irq_restore(flags);
return llnode; return llnode;
} }
...@@ -773,11 +778,16 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr) ...@@ -773,11 +778,16 @@ static void notrace unit_free(struct bpf_mem_cache *c, void *ptr)
llist_add(llnode, &c->free_llist_extra); llist_add(llnode, &c->free_llist_extra);
} }
local_dec(&c->active); local_dec(&c->active);
local_irq_restore(flags);
if (cnt > c->high_watermark) if (cnt > c->high_watermark)
/* free few objects from current cpu into global kmalloc pool */ /* free few objects from current cpu into global kmalloc pool */
irq_work_raise(c); irq_work_raise(c);
/* Enable IRQ after irq_work_raise() completes, otherwise when current
* task is preempted by task which does unit_alloc(), unit_alloc() may
* return NULL unexpectedly because irq work is already pending but can
* not been triggered and free_llist can not be refilled timely.
*/
local_irq_restore(flags);
} }
static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
...@@ -795,10 +805,10 @@ static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr) ...@@ -795,10 +805,10 @@ static void notrace unit_free_rcu(struct bpf_mem_cache *c, void *ptr)
llist_add(llnode, &c->free_llist_extra_rcu); llist_add(llnode, &c->free_llist_extra_rcu);
} }
local_dec(&c->active); local_dec(&c->active);
local_irq_restore(flags);
if (!atomic_read(&c->call_rcu_in_progress)) if (!atomic_read(&c->call_rcu_in_progress))
irq_work_raise(c); irq_work_raise(c);
local_irq_restore(flags);
} }
/* Called from BPF program or from sys_bpf syscall. /* Called from BPF program or from sys_bpf syscall.
......
// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
#define _GNU_SOURCE
#include <sched.h>
#include <pthread.h>
#include <stdbool.h>
#include <test_progs.h>
#include "preempted_bpf_ma_op.skel.h"
#define ALLOC_THREAD_NR 4
#define ALLOC_LOOP_NR 512
struct alloc_ctx {
/* output */
int run_err;
/* input */
int fd;
bool *nomem_err;
};
static void *run_alloc_prog(void *data)
{
struct alloc_ctx *ctx = data;
cpu_set_t cpu_set;
int i;
CPU_ZERO(&cpu_set);
CPU_SET(0, &cpu_set);
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
for (i = 0; i < ALLOC_LOOP_NR && !*ctx->nomem_err; i++) {
LIBBPF_OPTS(bpf_test_run_opts, topts);
int err;
err = bpf_prog_test_run_opts(ctx->fd, &topts);
ctx->run_err |= err | topts.retval;
}
return NULL;
}
void test_preempted_bpf_ma_op(void)
{
struct alloc_ctx ctx[ALLOC_THREAD_NR];
struct preempted_bpf_ma_op *skel;
pthread_t tid[ALLOC_THREAD_NR];
int i, err;
skel = preempted_bpf_ma_op__open_and_load();
if (!ASSERT_OK_PTR(skel, "open_and_load"))
return;
err = preempted_bpf_ma_op__attach(skel);
if (!ASSERT_OK(err, "attach"))
goto out;
for (i = 0; i < ARRAY_SIZE(ctx); i++) {
struct bpf_program *prog;
char name[8];
snprintf(name, sizeof(name), "test%d", i);
prog = bpf_object__find_program_by_name(skel->obj, name);
if (!ASSERT_OK_PTR(prog, "no test prog"))
goto out;
ctx[i].run_err = 0;
ctx[i].fd = bpf_program__fd(prog);
ctx[i].nomem_err = &skel->bss->nomem_err;
}
memset(tid, 0, sizeof(tid));
for (i = 0; i < ARRAY_SIZE(tid); i++) {
err = pthread_create(&tid[i], NULL, run_alloc_prog, &ctx[i]);
if (!ASSERT_OK(err, "pthread_create"))
break;
}
for (i = 0; i < ARRAY_SIZE(tid); i++) {
if (!tid[i])
break;
pthread_join(tid[i], NULL);
ASSERT_EQ(ctx[i].run_err, 0, "run prog err");
}
ASSERT_FALSE(skel->bss->nomem_err, "ENOMEM");
out:
preempted_bpf_ma_op__destroy(skel);
}
// SPDX-License-Identifier: GPL-2.0
/* Copyright (C) 2023. Huawei Technologies Co., Ltd */
#include <vmlinux.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_helpers.h>
#include "bpf_experimental.h"
struct bin_data {
char data[256];
struct bpf_spin_lock lock;
};
struct map_value {
struct bin_data __kptr * data;
};
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__type(key, int);
__type(value, struct map_value);
__uint(max_entries, 2048);
} array SEC(".maps");
char _license[] SEC("license") = "GPL";
bool nomem_err = false;
static int del_array(unsigned int i, int *from)
{
struct map_value *value;
struct bin_data *old;
value = bpf_map_lookup_elem(&array, from);
if (!value)
return 1;
old = bpf_kptr_xchg(&value->data, NULL);
if (old)
bpf_obj_drop(old);
(*from)++;
return 0;
}
static int add_array(unsigned int i, int *from)
{
struct bin_data *old, *new;
struct map_value *value;
value = bpf_map_lookup_elem(&array, from);
if (!value)
return 1;
new = bpf_obj_new(typeof(*new));
if (!new) {
nomem_err = true;
return 1;
}
old = bpf_kptr_xchg(&value->data, new);
if (old)
bpf_obj_drop(old);
(*from)++;
return 0;
}
static void del_then_add_array(int from)
{
int i;
i = from;
bpf_loop(512, del_array, &i, 0);
i = from;
bpf_loop(512, add_array, &i, 0);
}
SEC("fentry/bpf_fentry_test1")
int BPF_PROG2(test0, int, a)
{
del_then_add_array(0);
return 0;
}
SEC("fentry/bpf_fentry_test2")
int BPF_PROG2(test1, int, a, u64, b)
{
del_then_add_array(512);
return 0;
}
SEC("fentry/bpf_fentry_test3")
int BPF_PROG2(test2, char, a, int, b, u64, c)
{
del_then_add_array(1024);
return 0;
}
SEC("fentry/bpf_fentry_test4")
int BPF_PROG2(test3, void *, a, char, b, int, c, u64, d)
{
del_then_add_array(1536);
return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment