Commit a819ff0c authored by Paolo Abeni's avatar Paolo Abeni

Merge tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf

Daniel Borkmann says:

====================
pull-request: bpf 2024-07-11

The following pull-request contains BPF updates for your *net* tree.

We've added 4 non-merge commits during the last 2 day(s) which contain
a total of 4 files changed, 262 insertions(+), 19 deletions(-).

The main changes are:

1) Fixes for a BPF timer lockup and a use-after-free scenario when timers
   are used concurrently, from Kumar Kartikeya Dwivedi.

2) Fix the argument order in the call to bpf_map_kvcalloc() which could
   otherwise lead to a compilation error, from Mohammad Shehar Yaar Tausif.

bpf-for-netdev

* tag 'for-netdev' of https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf:
  selftests/bpf: Add timer lockup selftest
  bpf: Defer work in bpf_timer_cancel_and_free
  bpf: Fail bpf_timer_cancel when callback is being cancelled
  bpf: fix order of args in call to bpf_map_kvcalloc
====================

Link: https://patch.msgid.link/20240711084016.25757-1-daniel@iogearbox.netSigned-off-by: default avatarPaolo Abeni <pabeni@redhat.com>
parents 626dfed5 50bd5a0c
......@@ -782,8 +782,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
nbuckets = max_t(u32, 2, nbuckets);
smap->bucket_log = ilog2(nbuckets);
smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
nbuckets, GFP_USER | __GFP_NOWARN);
smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets,
sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN);
if (!smap->buckets) {
err = -ENOMEM;
goto free_smap;
......
......@@ -1084,7 +1084,10 @@ struct bpf_async_cb {
struct bpf_prog *prog;
void __rcu *callback_fn;
void *value;
union {
struct rcu_head rcu;
struct work_struct delete_work;
};
u64 flags;
};
......@@ -1107,6 +1110,7 @@ struct bpf_async_cb {
struct bpf_hrtimer {
struct bpf_async_cb cb;
struct hrtimer timer;
atomic_t cancelling;
};
struct bpf_work {
......@@ -1219,6 +1223,21 @@ static void bpf_wq_delete_work(struct work_struct *work)
kfree_rcu(w, cb.rcu);
}
static void bpf_timer_delete_work(struct work_struct *work)
{
struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, cb.delete_work);
/* Cancel the timer and wait for callback to complete if it was running.
* If hrtimer_cancel() can be safely called it's safe to call
* kfree_rcu(t) right after for both preallocated and non-preallocated
* maps. The async->cb = NULL was already done and no code path can see
* address 't' anymore. Timer if armed for existing bpf_hrtimer before
* bpf_timer_cancel_and_free will have been cancelled.
*/
hrtimer_cancel(&t->timer);
kfree_rcu(t, cb.rcu);
}
static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u64 flags,
enum bpf_async_type type)
{
......@@ -1262,6 +1281,8 @@ static int __bpf_async_init(struct bpf_async_kern *async, struct bpf_map *map, u
clockid = flags & (MAX_CLOCKS - 1);
t = (struct bpf_hrtimer *)cb;
atomic_set(&t->cancelling, 0);
INIT_WORK(&t->cb.delete_work, bpf_timer_delete_work);
hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
t->timer.function = bpf_timer_cb;
cb->value = (void *)async - map->record->timer_off;
......@@ -1440,7 +1461,8 @@ static void drop_prog_refcnt(struct bpf_async_cb *async)
BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
{
struct bpf_hrtimer *t;
struct bpf_hrtimer *t, *cur_t;
bool inc = false;
int ret = 0;
if (in_nmi())
......@@ -1452,14 +1474,41 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
ret = -EINVAL;
goto out;
}
if (this_cpu_read(hrtimer_running) == t) {
cur_t = this_cpu_read(hrtimer_running);
if (cur_t == t) {
/* If bpf callback_fn is trying to bpf_timer_cancel()
* its own timer the hrtimer_cancel() will deadlock
* since it waits for callback_fn to finish
* since it waits for callback_fn to finish.
*/
ret = -EDEADLK;
goto out;
}
/* Only account in-flight cancellations when invoked from a timer
* callback, since we want to avoid waiting only if other _callbacks_
* are waiting on us, to avoid introducing lockups. Non-callback paths
* are ok, since nobody would synchronously wait for their completion.
*/
if (!cur_t)
goto drop;
atomic_inc(&t->cancelling);
/* Need full barrier after relaxed atomic_inc */
smp_mb__after_atomic();
inc = true;
if (atomic_read(&cur_t->cancelling)) {
/* We're cancelling timer t, while some other timer callback is
* attempting to cancel us. In such a case, it might be possible
* that timer t belongs to the other callback, or some other
* callback waiting upon it (creating transitive dependencies
* upon us), and we will enter a deadlock if we continue
* cancelling and waiting for it synchronously, since it might
* do the same. Bail!
*/
ret = -EDEADLK;
goto out;
}
drop:
drop_prog_refcnt(&t->cb);
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
......@@ -1467,6 +1516,8 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_async_kern *, timer)
* if it was running.
*/
ret = ret ?: hrtimer_cancel(&t->timer);
if (inc)
atomic_dec(&t->cancelling);
rcu_read_unlock();
return ret;
}
......@@ -1512,25 +1563,39 @@ void bpf_timer_cancel_and_free(void *val)
if (!t)
return;
/* Cancel the timer and wait for callback to complete if it was running.
* If hrtimer_cancel() can be safely called it's safe to call kfree(t)
* right after for both preallocated and non-preallocated maps.
* The async->cb = NULL was already done and no code path can
* see address 't' anymore.
*
* Check that bpf_map_delete/update_elem() wasn't called from timer
* callback_fn. In such case don't call hrtimer_cancel() (since it will
* deadlock) and don't call hrtimer_try_to_cancel() (since it will just
* return -1). Though callback_fn is still running on this cpu it's
/* We check that bpf_map_delete/update_elem() was called from timer
* callback_fn. In such case we don't call hrtimer_cancel() (since it
* will deadlock) and don't call hrtimer_try_to_cancel() (since it will
* just return -1). Though callback_fn is still running on this cpu it's
* safe to do kfree(t) because bpf_timer_cb() read everything it needed
* from 't'. The bpf subprog callback_fn won't be able to access 't',
* since async->cb = NULL was already done. The timer will be
* effectively cancelled because bpf_timer_cb() will return
* HRTIMER_NORESTART.
*
* However, it is possible the timer callback_fn calling us armed the
* timer _before_ calling us, such that failing to cancel it here will
* cause it to possibly use struct hrtimer after freeing bpf_hrtimer.
* Therefore, we _need_ to cancel any outstanding timers before we do
* kfree_rcu, even though no more timers can be armed.
*
* Moreover, we need to schedule work even if timer does not belong to
* the calling callback_fn, as on two different CPUs, we can end up in a
* situation where both sides run in parallel, try to cancel one
* another, and we end up waiting on both sides in hrtimer_cancel
* without making forward progress, since timer1 depends on time2
* callback to finish, and vice versa.
*
* CPU 1 (timer1_cb) CPU 2 (timer2_cb)
* bpf_timer_cancel_and_free(timer2) bpf_timer_cancel_and_free(timer1)
*
* To avoid these issues, punt to workqueue context when we are in a
* timer callback.
*/
if (this_cpu_read(hrtimer_running) != t)
hrtimer_cancel(&t->timer);
kfree_rcu(t, cb.rcu);
if (this_cpu_read(hrtimer_running))
queue_work(system_unbound_wq, &t->cb.delete_work);
else
bpf_timer_delete_work(&t->cb.delete_work);
}
/* This function is called by map_delete/update_elem for individual element and
......
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <sched.h>
#include <test_progs.h>
#include <pthread.h>
#include <network_helpers.h>
#include "timer_lockup.skel.h"
static long cpu;
static int *timer1_err;
static int *timer2_err;
static bool skip;
volatile int k = 0;
static void *timer_lockup_thread(void *arg)
{
LIBBPF_OPTS(bpf_test_run_opts, opts,
.data_in = &pkt_v4,
.data_size_in = sizeof(pkt_v4),
.repeat = 1000,
);
int i, prog_fd = *(int *)arg;
cpu_set_t cpuset;
CPU_ZERO(&cpuset);
CPU_SET(__sync_fetch_and_add(&cpu, 1), &cpuset);
ASSERT_OK(pthread_setaffinity_np(pthread_self(), sizeof(cpuset),
&cpuset),
"cpu affinity");
for (i = 0; !READ_ONCE(*timer1_err) && !READ_ONCE(*timer2_err); i++) {
bpf_prog_test_run_opts(prog_fd, &opts);
/* Skip the test if we can't reproduce the race in a reasonable
* amount of time.
*/
if (i > 50) {
WRITE_ONCE(skip, true);
break;
}
}
return NULL;
}
void test_timer_lockup(void)
{
int timer1_prog, timer2_prog;
struct timer_lockup *skel;
pthread_t thrds[2];
void *ret;
skel = timer_lockup__open_and_load();
if (!ASSERT_OK_PTR(skel, "timer_lockup__open_and_load"))
return;
timer1_prog = bpf_program__fd(skel->progs.timer1_prog);
timer2_prog = bpf_program__fd(skel->progs.timer2_prog);
timer1_err = &skel->bss->timer1_err;
timer2_err = &skel->bss->timer2_err;
if (!ASSERT_OK(pthread_create(&thrds[0], NULL, timer_lockup_thread,
&timer1_prog),
"pthread_create thread1"))
goto out;
if (!ASSERT_OK(pthread_create(&thrds[1], NULL, timer_lockup_thread,
&timer2_prog),
"pthread_create thread2")) {
pthread_exit(&thrds[0]);
goto out;
}
pthread_join(thrds[1], &ret);
pthread_join(thrds[0], &ret);
if (skip) {
test__skip();
goto out;
}
if (*timer1_err != -EDEADLK && *timer1_err != 0)
ASSERT_FAIL("timer1_err bad value");
if (*timer2_err != -EDEADLK && *timer2_err != 0)
ASSERT_FAIL("timer2_err bad value");
out:
timer_lockup__destroy(skel);
return;
}
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <time.h>
#include <errno.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include "bpf_misc.h"
char _license[] SEC("license") = "GPL";
struct elem {
struct bpf_timer t;
};
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 1);
__type(key, int);
__type(value, struct elem);
} timer1_map SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_ARRAY);
__uint(max_entries, 1);
__type(key, int);
__type(value, struct elem);
} timer2_map SEC(".maps");
int timer1_err;
int timer2_err;
static int timer_cb1(void *map, int *k, struct elem *v)
{
struct bpf_timer *timer;
int key = 0;
timer = bpf_map_lookup_elem(&timer2_map, &key);
if (timer)
timer2_err = bpf_timer_cancel(timer);
return 0;
}
static int timer_cb2(void *map, int *k, struct elem *v)
{
struct bpf_timer *timer;
int key = 0;
timer = bpf_map_lookup_elem(&timer1_map, &key);
if (timer)
timer1_err = bpf_timer_cancel(timer);
return 0;
}
SEC("tc")
int timer1_prog(void *ctx)
{
struct bpf_timer *timer;
int key = 0;
timer = bpf_map_lookup_elem(&timer1_map, &key);
if (timer) {
bpf_timer_init(timer, &timer1_map, CLOCK_BOOTTIME);
bpf_timer_set_callback(timer, timer_cb1);
bpf_timer_start(timer, 1, BPF_F_TIMER_CPU_PIN);
}
return 0;
}
SEC("tc")
int timer2_prog(void *ctx)
{
struct bpf_timer *timer;
int key = 0;
timer = bpf_map_lookup_elem(&timer2_map, &key);
if (timer) {
bpf_timer_init(timer, &timer2_map, CLOCK_BOOTTIME);
bpf_timer_set_callback(timer, timer_cb2);
bpf_timer_start(timer, 1, BPF_F_TIMER_CPU_PIN);
}
return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment