Commit 80a836c2 authored by Alexei Starovoitov's avatar Alexei Starovoitov

Merge branch 'BPF_and_RT'

Thomas Gleixner says:

====================
This is the third version of the BPF/RT patch set which makes both coexist
nicely. The long explanation can be found in the cover letter of the V1
submission:

  https://lore.kernel.org/r/20200214133917.304937432@linutronix.de

V2 is here:

  https://lore.kernel.org/r/20200220204517.863202864@linutronix.de

The following changes vs. V2 have been made:

  - Rebased to bpf-next, adjusted to the lock changes in the hashmap code.

  - Split the preallocation enforcement patch for instrumentation type BPF
    programs into two pieces:

    1) Emit a one-time warning on !RT kernels when any instrumentation type
       BPF program uses run-time allocation. Emit also a corresponding
       warning in the verifier log. But allow the program to run for
       backward compatibility sake. After a grace period this should be
       enforced.

    2) On RT reject such programs because on RT the memory allocator cannot
       be called from truly atomic contexts.

  - Fixed the fallout from V2 as reported by Alexei and 0-day

  - Removed the redundant preempt_disable() from trace_call_bpf()

  - Removed the unused export of trace_call_bpf()
====================
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents 8eece07c 099bfaa7
......@@ -885,7 +885,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
struct bpf_prog *_prog; \
struct bpf_prog_array *_array; \
u32 _ret = 1; \
preempt_disable(); \
migrate_disable(); \
rcu_read_lock(); \
_array = rcu_dereference(array); \
if (unlikely(check_non_null && !_array))\
......@@ -898,7 +898,7 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array,
} \
_out: \
rcu_read_unlock(); \
preempt_enable(); \
migrate_enable(); \
_ret; \
})
......@@ -932,7 +932,7 @@ _out: \
u32 ret; \
u32 _ret = 1; \
u32 _cn = 0; \
preempt_disable(); \
migrate_disable(); \
rcu_read_lock(); \
_array = rcu_dereference(array); \
_item = &_array->items[0]; \
......@@ -944,7 +944,7 @@ _out: \
_item++; \
} \
rcu_read_unlock(); \
preempt_enable(); \
migrate_enable(); \
if (_ret) \
_ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \
else \
......@@ -961,6 +961,36 @@ _out: \
#ifdef CONFIG_BPF_SYSCALL
DECLARE_PER_CPU(int, bpf_prog_active);
/*
* Block execution of BPF programs attached to instrumentation (perf,
* kprobes, tracepoints) to prevent deadlocks on map operations as any of
* these events can happen inside a region which holds a map bucket lock
* and can deadlock on it.
*
* Use the preemption safe inc/dec variants on RT because migrate disable
* is preemptible on RT and preemption in the middle of the RMW operation
* might lead to inconsistent state. Use the raw variants for non RT
* kernels as migrate_disable() maps to preempt_disable() so the slightly
* more expensive save operation can be avoided.
*/
static inline void bpf_disable_instrumentation(void)
{
migrate_disable();
if (IS_ENABLED(CONFIG_PREEMPT_RT))
this_cpu_inc(bpf_prog_active);
else
__this_cpu_inc(bpf_prog_active);
}
static inline void bpf_enable_instrumentation(void)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT))
this_cpu_dec(bpf_prog_active);
else
__this_cpu_dec(bpf_prog_active);
migrate_enable();
}
extern const struct file_operations bpf_map_fops;
extern const struct file_operations bpf_prog_fops;
......
......@@ -561,7 +561,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
#define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \
u32 ret; \
cant_sleep(); \
cant_migrate(); \
if (static_branch_unlikely(&bpf_stats_enabled_key)) { \
struct bpf_prog_stats *stats; \
u64 start = sched_clock(); \
......@@ -576,8 +576,30 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
} \
ret; })
#define BPF_PROG_RUN(prog, ctx) __BPF_PROG_RUN(prog, ctx, \
bpf_dispatcher_nopfunc)
#define BPF_PROG_RUN(prog, ctx) \
__BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc)
/*
* Use in preemptible and therefore migratable context to make sure that
* the execution of the BPF program runs on one CPU.
*
* This uses migrate_disable/enable() explicitly to document that the
* invocation of a BPF program does not require reentrancy protection
* against a BPF program which is invoked from a preempting task.
*
* For non RT enabled kernels migrate_disable/enable() maps to
* preempt_disable/enable(), i.e. it disables also preemption.
*/
static inline u32 bpf_prog_run_pin_on_cpu(const struct bpf_prog *prog,
const void *ctx)
{
u32 ret;
migrate_disable();
ret = __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nopfunc);
migrate_enable();
return ret;
}
#define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
......@@ -655,6 +677,7 @@ static inline u8 *bpf_skb_cb(struct sk_buff *skb)
return qdisc_skb_cb(skb)->data;
}
/* Must be invoked with migration disabled */
static inline u32 __bpf_prog_run_save_cb(const struct bpf_prog *prog,
struct sk_buff *skb)
{
......@@ -680,9 +703,9 @@ static inline u32 bpf_prog_run_save_cb(const struct bpf_prog *prog,
{
u32 res;
preempt_disable();
migrate_disable();
res = __bpf_prog_run_save_cb(prog, skb);
preempt_enable();
migrate_enable();
return res;
}
......@@ -695,9 +718,7 @@ static inline u32 bpf_prog_run_clear_cb(const struct bpf_prog *prog,
if (unlikely(prog->cb_access))
memset(cb_data, 0, BPF_SKB_CB_LEN);
preempt_disable();
res = BPF_PROG_RUN(prog, skb);
preempt_enable();
res = bpf_prog_run_pin_on_cpu(prog, skb);
return res;
}
......
This diff is collapsed.
......@@ -34,7 +34,7 @@ struct lpm_trie {
size_t n_entries;
size_t max_prefixlen;
size_t data_size;
raw_spinlock_t lock;
spinlock_t lock;
};
/* This trie implements a longest prefix match algorithm that can be used to
......@@ -315,7 +315,7 @@ static int trie_update_elem(struct bpf_map *map,
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;
raw_spin_lock_irqsave(&trie->lock, irq_flags);
spin_lock_irqsave(&trie->lock, irq_flags);
/* Allocate and fill a new node */
......@@ -422,7 +422,7 @@ static int trie_update_elem(struct bpf_map *map,
kfree(im_node);
}
raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
spin_unlock_irqrestore(&trie->lock, irq_flags);
return ret;
}
......@@ -442,7 +442,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
if (key->prefixlen > trie->max_prefixlen)
return -EINVAL;
raw_spin_lock_irqsave(&trie->lock, irq_flags);
spin_lock_irqsave(&trie->lock, irq_flags);
/* Walk the tree looking for an exact key/length match and keeping
* track of the path we traverse. We will need to know the node
......@@ -518,7 +518,7 @@ static int trie_delete_elem(struct bpf_map *map, void *_key)
kfree_rcu(node, rcu);
out:
raw_spin_unlock_irqrestore(&trie->lock, irq_flags);
spin_unlock_irqrestore(&trie->lock, irq_flags);
return ret;
}
......@@ -575,7 +575,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
if (ret)
goto out_err;
raw_spin_lock_init(&trie->lock);
spin_lock_init(&trie->lock);
return &trie->map;
out_err:
......
......@@ -25,12 +25,18 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s)
free_percpu(s->freelist);
}
static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
struct pcpu_freelist_node *node)
{
node->next = head->first;
head->first = node;
}
static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
struct pcpu_freelist_node *node)
{
raw_spin_lock(&head->lock);
node->next = head->first;
head->first = node;
pcpu_freelist_push_node(head, node);
raw_spin_unlock(&head->lock);
}
......@@ -56,21 +62,16 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
u32 nr_elems)
{
struct pcpu_freelist_head *head;
unsigned long flags;
int i, cpu, pcpu_entries;
pcpu_entries = nr_elems / num_possible_cpus() + 1;
i = 0;
/* disable irq to workaround lockdep false positive
* in bpf usage pcpu_freelist_populate() will never race
* with pcpu_freelist_push()
*/
local_irq_save(flags);
for_each_possible_cpu(cpu) {
again:
head = per_cpu_ptr(s->freelist, cpu);
___pcpu_freelist_push(head, buf);
/* No locking required as this is not visible yet. */
pcpu_freelist_push_node(head, buf);
i++;
buf += elem_size;
if (i == nr_elems)
......@@ -78,7 +79,6 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
if (i % pcpu_entries)
goto again;
}
local_irq_restore(flags);
}
struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
......
......@@ -40,6 +40,9 @@ static void do_up_read(struct irq_work *entry)
{
struct stack_map_irq_work *work;
if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
return;
work = container_of(entry, struct stack_map_irq_work, irq_work);
up_read_non_owner(work->sem);
work->sem = NULL;
......@@ -288,10 +291,19 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
struct stack_map_irq_work *work = NULL;
if (irqs_disabled()) {
work = this_cpu_ptr(&up_read_work);
if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY)
/* cannot queue more up_read, fallback */
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
work = this_cpu_ptr(&up_read_work);
if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) {
/* cannot queue more up_read, fallback */
irq_work_busy = true;
}
} else {
/*
* PREEMPT_RT does not allow to trylock mmap sem in
* interrupt disabled context. Force the fallback code.
*/
irq_work_busy = true;
}
}
/*
......
......@@ -171,11 +171,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
flags);
}
/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
* inside bpf map update or delete otherwise deadlocks are possible
*/
preempt_disable();
__this_cpu_inc(bpf_prog_active);
bpf_disable_instrumentation();
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_update(map, key, value, flags);
......@@ -206,8 +202,7 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
err = map->ops->map_update_elem(map, key, value, flags);
rcu_read_unlock();
}
__this_cpu_dec(bpf_prog_active);
preempt_enable();
bpf_enable_instrumentation();
maybe_wait_bpf_programs(map);
return err;
......@@ -222,8 +217,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
if (bpf_map_is_dev_bound(map))
return bpf_map_offload_lookup_elem(map, key, value);
preempt_disable();
this_cpu_inc(bpf_prog_active);
bpf_disable_instrumentation();
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_copy(map, key, value);
......@@ -268,8 +262,7 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
rcu_read_unlock();
}
this_cpu_dec(bpf_prog_active);
preempt_enable();
bpf_enable_instrumentation();
maybe_wait_bpf_programs(map);
return err;
......@@ -1136,13 +1129,11 @@ static int map_delete_elem(union bpf_attr *attr)
goto out;
}
preempt_disable();
__this_cpu_inc(bpf_prog_active);
bpf_disable_instrumentation();
rcu_read_lock();
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
__this_cpu_dec(bpf_prog_active);
preempt_enable();
bpf_enable_instrumentation();
maybe_wait_bpf_programs(map);
out:
kfree(key);
......@@ -1254,13 +1245,11 @@ int generic_map_delete_batch(struct bpf_map *map,
break;
}
preempt_disable();
__this_cpu_inc(bpf_prog_active);
bpf_disable_instrumentation();
rcu_read_lock();
err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
__this_cpu_dec(bpf_prog_active);
preempt_enable();
bpf_enable_instrumentation();
maybe_wait_bpf_programs(map);
if (err)
break;
......
......@@ -367,8 +367,9 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
mutex_unlock(&trampoline_mutex);
}
/* The logic is similar to BPF_PROG_RUN, but with explicit rcu and preempt that
* are needed for trampoline. The macro is split into
/* The logic is similar to BPF_PROG_RUN, but with an explicit
* rcu_read_lock() and migrate_disable() which are required
* for the trampoline. The macro is split into
* call _bpf_prog_enter
* call prog->bpf_func
* call __bpf_prog_exit
......@@ -378,7 +379,7 @@ u64 notrace __bpf_prog_enter(void)
u64 start = 0;
rcu_read_lock();
preempt_disable();
migrate_disable();
if (static_branch_unlikely(&bpf_stats_enabled_key))
start = sched_clock();
return start;
......@@ -401,7 +402,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
stats->nsecs += sched_clock() - start;
u64_stats_update_end(&stats->syncp);
}
preempt_enable();
migrate_enable();
rcu_read_unlock();
}
......
......@@ -8143,26 +8143,48 @@ static bool is_tracing_prog_type(enum bpf_prog_type type)
}
}
static bool is_preallocated_map(struct bpf_map *map)
{
if (!check_map_prealloc(map))
return false;
if (map->inner_map_meta && !check_map_prealloc(map->inner_map_meta))
return false;
return true;
}
static int check_map_prog_compatibility(struct bpf_verifier_env *env,
struct bpf_map *map,
struct bpf_prog *prog)
{
/* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use
* preallocated hash maps, since doing memory allocation
* in overflow_handler can crash depending on where nmi got
* triggered.
/*
* Validate that trace type programs use preallocated hash maps.
*
* For programs attached to PERF events this is mandatory as the
* perf NMI can hit any arbitrary code sequence.
*
* All other trace types using preallocated hash maps are unsafe as
* well because tracepoint or kprobes can be inside locked regions
* of the memory allocator or at a place where a recursion into the
* memory allocator would see inconsistent state.
*
* On RT enabled kernels run-time allocation of all trace type
* programs is strictly prohibited due to lock type constraints. On
* !RT kernels it is allowed for backwards compatibility reasons for
* now, but warnings are emitted so developers are made aware of
* the unsafety and can fix their programs before this is enforced.
*/
if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
if (!check_map_prealloc(map)) {
if (is_tracing_prog_type(prog->type) && !is_preallocated_map(map)) {
if (prog->type == BPF_PROG_TYPE_PERF_EVENT) {
verbose(env, "perf_event programs can only use preallocated hash map\n");
return -EINVAL;
}
if (map->inner_map_meta &&
!check_map_prealloc(map->inner_map_meta)) {
verbose(env, "perf_event programs can only use preallocated inner hash map\n");
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
verbose(env, "trace type programs can only use preallocated hash map\n");
return -EINVAL;
}
WARN_ONCE(1, "trace type BPF program uses run-time allocation\n");
verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n");
}
if ((is_tracing_prog_type(prog->type) ||
......
......@@ -9206,7 +9206,6 @@ static void bpf_overflow_handler(struct perf_event *event,
int ret = 0;
ctx.regs = perf_arch_bpf_user_pt_regs(regs);
preempt_disable();
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
goto out;
rcu_read_lock();
......@@ -9214,7 +9213,6 @@ static void bpf_overflow_handler(struct perf_event *event,
rcu_read_unlock();
out:
__this_cpu_dec(bpf_prog_active);
preempt_enable();
if (!ret)
return;
......
......@@ -268,16 +268,14 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd,
* All filters in the list are evaluated and the lowest BPF return
* value always takes priority (ignoring the DATA).
*/
preempt_disable();
for (; f; f = f->prev) {
u32 cur_ret = BPF_PROG_RUN(f->prog, sd);
u32 cur_ret = bpf_prog_run_pin_on_cpu(f->prog, sd);
if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
ret = cur_ret;
*match = f;
}
}
preempt_enable();
return ret;
}
#endif /* CONFIG_SECCOMP_FILTER */
......
......@@ -83,7 +83,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
if (in_nmi()) /* not supported yet */
return 1;
preempt_disable();
cant_sleep();
if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) {
/*
......@@ -115,11 +115,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx)
out:
__this_cpu_dec(bpf_prog_active);
preempt_enable();
return ret;
}
EXPORT_SYMBOL_GPL(trace_call_bpf);
#ifdef CONFIG_BPF_KPROBE_OVERRIDE
BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc)
......@@ -1516,10 +1514,9 @@ void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp)
static __always_inline
void __bpf_trace_run(struct bpf_prog *prog, u64 *args)
{
cant_sleep();
rcu_read_lock();
preempt_disable();
(void) BPF_PROG_RUN(prog, args);
preempt_enable();
rcu_read_unlock();
}
......
......@@ -1333,8 +1333,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
int size, esize;
int rctx;
if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs))
return;
if (bpf_prog_array_valid(call)) {
u32 ret;
preempt_disable();
ret = trace_call_bpf(call, regs);
preempt_enable();
if (!ret)
return;
}
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
......
......@@ -6660,14 +6660,14 @@ static int __run_one(const struct bpf_prog *fp, const void *data,
u64 start, finish;
int ret = 0, i;
preempt_disable();
migrate_disable();
start = ktime_get_ns();
for (i = 0; i < runs; i++)
ret = BPF_PROG_RUN(fp, data);
finish = ktime_get_ns();
preempt_enable();
migrate_enable();
*duration = finish - start;
do_div(*duration, runs);
......
......@@ -37,7 +37,7 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
repeat = 1;
rcu_read_lock();
preempt_disable();
migrate_disable();
time_start = ktime_get_ns();
for (i = 0; i < repeat; i++) {
bpf_cgroup_storage_set(storage);
......@@ -54,18 +54,18 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
if (need_resched()) {
time_spent += ktime_get_ns() - time_start;
preempt_enable();
migrate_enable();
rcu_read_unlock();
cond_resched();
rcu_read_lock();
preempt_disable();
migrate_disable();
time_start = ktime_get_ns();
}
}
time_spent += ktime_get_ns() - time_start;
preempt_enable();
migrate_enable();
rcu_read_unlock();
do_div(time_spent, repeat);
......
......@@ -920,9 +920,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
(int)FLOW_DISSECTOR_F_STOP_AT_ENCAP);
flow_keys->flags = flags;
preempt_disable();
result = BPF_PROG_RUN(prog, ctx);
preempt_enable();
result = bpf_prog_run_pin_on_cpu(prog, ctx);
flow_keys->nhoff = clamp_t(u16, flow_keys->nhoff, nhoff, hlen);
flow_keys->thoff = clamp_t(u16, flow_keys->thoff,
......
......@@ -628,7 +628,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
struct bpf_prog *prog;
int ret;
preempt_disable();
rcu_read_lock();
prog = READ_ONCE(psock->progs.msg_parser);
if (unlikely(!prog)) {
......@@ -638,7 +637,7 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
sk_msg_compute_data_pointers(msg);
msg->sk = sk;
ret = BPF_PROG_RUN(prog, msg);
ret = bpf_prog_run_pin_on_cpu(prog, msg);
ret = sk_psock_map_verd(ret, msg->sk_redir);
psock->apply_bytes = msg->apply_bytes;
if (ret == __SK_REDIRECT) {
......@@ -653,7 +652,6 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock,
}
out:
rcu_read_unlock();
preempt_enable();
return ret;
}
EXPORT_SYMBOL_GPL(sk_psock_msg_verdict);
......@@ -665,9 +663,7 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog,
skb->sk = psock->sk;
bpf_compute_data_end_sk_skb(skb);
preempt_disable();
ret = BPF_PROG_RUN(prog, skb);
preempt_enable();
ret = bpf_prog_run_pin_on_cpu(prog, skb);
/* strparser clones the skb before handing it to a upper layer,
* meaning skb_orphan has been called. We NULL sk on the way out
* to ensure we don't trigger a BUG_ON() in skb/sk operations
......
......@@ -380,9 +380,7 @@ static int kcm_parse_func_strparser(struct strparser *strp, struct sk_buff *skb)
struct bpf_prog *prog = psock->bpf_prog;
int res;
preempt_disable();
res = BPF_PROG_RUN(prog, skb);
preempt_enable();
res = bpf_prog_run_pin_on_cpu(prog, skb);
return res;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment