Commit d83525ca authored by Alexei Starovoitov, committed by Daniel Borkmann

bpf: introduce bpf_spin_lock

Introduce 'struct bpf_spin_lock' and the bpf_spin_lock/unlock() helpers to let
bpf programs serialize access to other variables.

Example:
struct hash_elem {
    int cnt;
    struct bpf_spin_lock lock;
};
struct hash_elem *val = bpf_map_lookup_elem(&hash_map, &key);
if (val) {
    bpf_spin_lock(&val->lock);
    val->cnt++;
    bpf_spin_unlock(&val->lock);
}

Restrictions and safety checks (a usage sketch follows this list):
- bpf_spin_lock is only allowed inside HASH and ARRAY maps.
- BTF description of the map is mandatory for safety analysis.
- a bpf program can take only one bpf_spin_lock at a time, since holding two or
  more can cause deadlocks.
- only one 'struct bpf_spin_lock' is allowed per map element.
  This drastically simplifies the implementation yet still allows a bpf program
  to use any number of bpf_spin_locks, since each map element has its own lock.
- while a bpf_spin_lock is held, calls (either bpf2bpf or helpers) are not allowed.
- a bpf program must call bpf_spin_unlock() before returning.
- bpf program can access 'struct bpf_spin_lock' only via
  bpf_spin_lock()/bpf_spin_unlock() helpers.
- load/store into 'struct bpf_spin_lock lock;' field is not allowed.
- to use the bpf_spin_lock() helper, the BTF description of the map value must be
  a struct with a 'struct bpf_spin_lock anyname;' field at the top level.
  A lock nested inside another struct is not allowed.
- syscall map_lookup doesn't copy bpf_spin_lock field to user space.
- syscall map_update and program map_update do not update bpf_spin_lock field.
- bpf_spin_lock cannot be on the stack or inside networking packet.
  bpf_spin_lock can only be inside HASH or ARRAY map value.
- bpf_spin_lock is available to root only and to all program types.
- bpf_spin_lock is not allowed in inner maps of map-in-map.
- ld_abs is not allowed inside spin_lock-ed region.
- tracing progs and socket filter progs cannot use bpf_spin_lock due to
  insufficient preemption checks.
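
For completeness, a minimal sketch (not part of this patch) of a program that
follows the rules above. The map declaration uses the later libbpf BTF-defined
map convention purely for illustration; section, map and function names are
placeholders — the only hard requirement is that the map is created with BTF
describing the value type:

/* Sketch only: one top-level bpf_spin_lock per value, no calls inside the
 * critical section, unlock before return.
 */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct hash_elem {
	int cnt;
	struct bpf_spin_lock lock;	/* exactly one lock, at the top level */
};

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 1);
	__type(key, int);
	__type(value, struct hash_elem);
} hash_map SEC(".maps");

SEC("tc")
int bump_counter(struct __sk_buff *skb)
{
	struct hash_elem *val;
	int key = 0;

	val = bpf_map_lookup_elem(&hash_map, &key);
	if (!val)
		return 0;

	bpf_spin_lock(&val->lock);	/* at most one lock held at a time */
	val->cnt++;			/* plain loads/stores only, no calls */
	bpf_spin_unlock(&val->lock);	/* must unlock before returning */
	return 0;
}

char _license[] SEC("license") = "GPL";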

Implementation details:
- the cgroup-bpf class of programs can nest with xdp/tc programs, hence
  bpf_spin_lock is equivalent to spin_lock_irqsave.
  Other solutions to avoid nested bpf_spin_lock are possible, like making sure
  that all networking progs run with softirqs disabled, but spin_lock_irqsave
  is the simplest and doesn't add overhead to programs that don't use it.
- arch_spinlock_t is used when it's implemented as a queued_spin_lock
- archs can force their own arch_spinlock_t
- on architectures where queued_spin_lock is not available and
  sizeof(arch_spinlock_t) != sizeof(__u32), a trivial lock is used.
- the presence of bpf_spin_lock inside a map value could have been indicated via
  an extra flag during map_create, but specifying it via BTF is cleaner.
  It provides introspection for map key/value and reduces user mistakes
  (see the sketch after this list).
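
To illustrate the last point, a hedged sketch of what "specifying it via BTF"
means at map_create time. Loading the BTF blob (BPF_BTF_LOAD) and resolving the
key/value type ids are assumed to have happened already; libbpf normally does
all of this automatically, and the helper name below is made up:

/* Sketch only: hand the kernel the BTF that describes the value type so
 * that btf_find_spin_lock() can locate the lock inside it.
 */
#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

struct hash_elem {
	int cnt;
	struct bpf_spin_lock lock;
};

static int create_locked_hash(int btf_fd, __u32 key_type_id, __u32 value_type_id)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type		= BPF_MAP_TYPE_HASH;
	attr.key_size		= sizeof(int);
	attr.value_size		= sizeof(struct hash_elem);
	attr.max_entries	= 1;
	attr.btf_fd		= btf_fd;		/* fd from BPF_BTF_LOAD */
	attr.btf_key_type_id	= key_type_id;		/* BTF id of 'int' */
	attr.btf_value_type_id	= value_type_id;	/* BTF id of 'struct hash_elem' */

	/* Without the three btf_* fields the map still works, but any program
	 * that calls bpf_spin_lock() on its values is rejected by the verifier.
	 */
	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}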

Next steps:
- allow bpf_spin_lock in other map types (like cgroup local storage)
- introduce a BPF_F_LOCK flag for the bpf_map_update() syscall and helper
  to request that the kernel grab bpf_spin_lock before rewriting the value.
  That will serialize access to map elements (a rough usage sketch follows).
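
A rough sketch of how such a BPF_F_LOCK update could look from user space, via
libbpf's low-level bpf_map_update_elem() wrapper. This is future work proposed
above, not part of this patch; the flag name and semantics are tentative:

/* Future-work sketch only: BPF_F_LOCK is not defined by this patch. */
#include <linux/bpf.h>
#include <bpf/bpf.h>

struct hash_elem {
	int cnt;
	struct bpf_spin_lock lock;	/* the in-element lock is managed by the kernel */
};

static int update_locked(int map_fd)
{
	struct hash_elem val = { .cnt = 42 };
	int key = 0;

	/* Ask the kernel to take the element's bpf_spin_lock around the copy,
	 * serializing against programs that lock the same element.
	 */
	return bpf_map_update_elem(map_fd, &key, &val, BPF_F_LOCK);
}
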
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parent 1832f4ef
@@ -72,14 +72,15 @@ struct bpf_map {
 	u32 value_size;
 	u32 max_entries;
 	u32 map_flags;
-	u32 pages;
+	int spin_lock_off; /* >=0 valid offset, <0 error */
 	u32 id;
 	int numa_node;
 	u32 btf_key_type_id;
 	u32 btf_value_type_id;
 	struct btf *btf;
+	u32 pages;
 	bool unpriv_array;
-	/* 55 bytes hole */
+	/* 51 bytes hole */
 
 	/* The 3rd and 4th cacheline with misc members to avoid false sharing
 	 * particularly with refcounting.
@@ -91,6 +92,34 @@ struct bpf_map {
 	char name[BPF_OBJ_NAME_LEN];
 };
 
+static inline bool map_value_has_spin_lock(const struct bpf_map *map)
+{
+	return map->spin_lock_off >= 0;
+}
+
+static inline void check_and_init_map_lock(struct bpf_map *map, void *dst)
+{
+	if (likely(!map_value_has_spin_lock(map)))
+		return;
+	*(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
+		(struct bpf_spin_lock){};
+}
+
+/* copy everything but bpf_spin_lock */
+static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
+{
+	if (unlikely(map_value_has_spin_lock(map))) {
+		u32 off = map->spin_lock_off;
+
+		memcpy(dst, src, off);
+		memcpy(dst + off + sizeof(struct bpf_spin_lock),
+		       src + off + sizeof(struct bpf_spin_lock),
+		       map->value_size - off - sizeof(struct bpf_spin_lock));
+	} else {
+		memcpy(dst, src, map->value_size);
+	}
+}
+
 struct bpf_offload_dev;
 struct bpf_offloaded_map;
 
@@ -162,6 +191,7 @@ enum bpf_arg_type {
 	ARG_PTR_TO_CTX,		/* pointer to context */
 	ARG_ANYTHING,		/* any (initialized) argument is ok */
 	ARG_PTR_TO_SOCKET,	/* pointer to bpf_sock */
+	ARG_PTR_TO_SPIN_LOCK,	/* pointer to bpf_spin_lock */
 };
 
 /* type of values returned from helper functions */
@@ -879,7 +909,8 @@ extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
 extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
 extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
 extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
+extern const struct bpf_func_proto bpf_spin_lock_proto;
+extern const struct bpf_func_proto bpf_spin_unlock_proto;
 extern const struct bpf_func_proto bpf_get_local_storage_proto;
 
 /* Shared helpers among cBPF and eBPF. */
...
@@ -148,6 +148,7 @@ struct bpf_verifier_state {
 	/* call stack tracking */
 	struct bpf_func_state *frame[MAX_CALL_FRAMES];
 	u32 curframe;
+	u32 active_spin_lock;
 	bool speculative;
 };
...
@@ -50,6 +50,7 @@ u32 btf_id(const struct btf *btf);
 bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
			   const struct btf_member *m,
			   u32 expected_offset, u32 expected_size);
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
 
 #ifdef CONFIG_BPF_SYSCALL
 const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
...
@@ -2422,7 +2422,9 @@ union bpf_attr {
 	FN(map_peek_elem),		\
 	FN(msg_push_data),		\
 	FN(msg_pop_data),		\
-	FN(rc_pointer_rel),
+	FN(rc_pointer_rel),		\
+	FN(spin_lock),			\
+	FN(spin_unlock),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
  * function eBPF program intends to call
@@ -3056,4 +3058,7 @@ struct bpf_line_info {
 	__u32	line_col;
 };
 
+struct bpf_spin_lock {
+	__u32	val;
+};
 #endif /* _UAPI__LINUX_BPF_H__ */
@@ -242,6 +242,9 @@ config QUEUED_SPINLOCKS
 	def_bool y if ARCH_USE_QUEUED_SPINLOCKS
 	depends on SMP
 
+config BPF_ARCH_SPINLOCK
+	bool
+
 config ARCH_USE_QUEUED_RWLOCKS
 	bool
...
@@ -270,9 +270,10 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
 		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
 		       value, map->value_size);
 	else
-		memcpy(array->value +
-		       array->elem_size * (index & array->index_mask),
-		       value, map->value_size);
+		copy_map_value(map,
+			       array->value +
+			       array->elem_size * (index & array->index_mask),
+			       value);
 
 	return 0;
 }
...
@@ -355,6 +355,11 @@ static bool btf_type_is_struct(const struct btf_type *t)
 	return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION;
 }
 
+static bool __btf_type_is_struct(const struct btf_type *t)
+{
+	return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT;
+}
+
 static bool btf_type_is_array(const struct btf_type *t)
 {
 	return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY;
@@ -2045,6 +2050,43 @@ static void btf_struct_log(struct btf_verifier_env *env,
 	btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t));
 }
 
+/* find 'struct bpf_spin_lock' in map value.
+ * return >= 0 offset if found
+ * and < 0 in case of error
+ */
+int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t)
+{
+	const struct btf_member *member;
+	u32 i, off = -ENOENT;
+
+	if (!__btf_type_is_struct(t))
+		return -EINVAL;
+
+	for_each_member(i, t, member) {
+		const struct btf_type *member_type = btf_type_by_id(btf,
+								    member->type);
+		if (!__btf_type_is_struct(member_type))
+			continue;
+		if (member_type->size != sizeof(struct bpf_spin_lock))
+			continue;
+		if (strcmp(__btf_name_by_offset(btf, member_type->name_off),
+			   "bpf_spin_lock"))
+			continue;
+		if (off != -ENOENT)
+			/* only one 'struct bpf_spin_lock' is allowed */
+			return -E2BIG;
+		off = btf_member_bit_offset(t, member);
+		if (off % 8)
+			/* valid C code cannot generate such BTF */
+			return -EINVAL;
+		off /= 8;
+		if (off % __alignof__(struct bpf_spin_lock))
+			/* valid struct bpf_spin_lock will be 4 byte aligned */
+			return -EINVAL;
+	}
+	return off;
+}
+
 static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t,
 				u32 type_id, void *data, u8 bits_offset,
 				struct seq_file *m)
...
@@ -2002,6 +2002,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak;
 const struct bpf_func_proto bpf_map_push_elem_proto __weak;
 const struct bpf_func_proto bpf_map_pop_elem_proto __weak;
 const struct bpf_func_proto bpf_map_peek_elem_proto __weak;
+const struct bpf_func_proto bpf_spin_lock_proto __weak;
+const struct bpf_func_proto bpf_spin_unlock_proto __weak;
 
 const struct bpf_func_proto bpf_get_prandom_u32_proto __weak;
 const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak;
...
@@ -718,21 +718,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
 	       BITS_PER_LONG == 64;
 }
 
-static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
-{
-	u32 size = htab->map.value_size;
-
-	if (percpu || fd_htab_map_needs_adjust(htab))
-		size = round_up(size, 8);
-	return size;
-}
-
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
 					 bool percpu, bool onallcpus,
 					 struct htab_elem *old_elem)
 {
-	u32 size = htab_size_value(htab, percpu);
+	u32 size = htab->map.value_size;
 	bool prealloc = htab_is_prealloc(htab);
 	struct htab_elem *l_new, **pl_new;
 	void __percpu *pptr;
@@ -770,10 +761,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 			l_new = ERR_PTR(-ENOMEM);
 			goto dec_count;
 		}
+		check_and_init_map_lock(&htab->map,
+					l_new->key + round_up(key_size, 8));
 	}
 
 	memcpy(l_new->key, key, key_size);
 	if (percpu) {
+		size = round_up(size, 8);
 		if (prealloc) {
 			pptr = htab_elem_get_ptr(l_new, key_size);
 		} else {
@@ -791,8 +785,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 		if (!prealloc)
 			htab_elem_set_ptr(l_new, key_size, pptr);
-	} else {
+	} else if (fd_htab_map_needs_adjust(htab)) {
+		size = round_up(size, 8);
 		memcpy(l_new->key + round_up(key_size, 8), value, size);
+	} else {
+		copy_map_value(&htab->map,
+			       l_new->key + round_up(key_size, 8),
+			       value);
 	}
 
 	l_new->hash = hash;
...
@@ -221,6 +221,86 @@ const struct bpf_func_proto bpf_get_current_comm_proto = {
 	.arg2_type	= ARG_CONST_SIZE,
 };
 
+#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK)
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+	arch_spinlock_t *l = (void *)lock;
+	union {
+		__u32 val;
+		arch_spinlock_t lock;
+	} u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED };
+
+	compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0");
+	BUILD_BUG_ON(sizeof(*l) != sizeof(__u32));
+	BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32));
+	arch_spin_lock(l);
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+	arch_spinlock_t *l = (void *)lock;
+
+	arch_spin_unlock(l);
+}
+
+#else
+
+static inline void __bpf_spin_lock(struct bpf_spin_lock *lock)
+{
+	atomic_t *l = (void *)lock;
+
+	BUILD_BUG_ON(sizeof(*l) != sizeof(*lock));
+	do {
+		atomic_cond_read_relaxed(l, !VAL);
+	} while (atomic_xchg(l, 1));
+}
+
+static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock)
+{
+	atomic_t *l = (void *)lock;
+
+	atomic_set_release(l, 0);
+}
+
+#endif
+
+static DEFINE_PER_CPU(unsigned long, irqsave_flags);
+
+notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	__bpf_spin_lock(lock);
+	__this_cpu_write(irqsave_flags, flags);
+	return 0;
+}
+
+const struct bpf_func_proto bpf_spin_lock_proto = {
+	.func		= bpf_spin_lock,
+	.gpl_only	= false,
+	.ret_type	= RET_VOID,
+	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
+};
+
+notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock)
+{
+	unsigned long flags;
+
+	flags = __this_cpu_read(irqsave_flags);
+	__bpf_spin_unlock(lock);
+	local_irq_restore(flags);
+	return 0;
+}
+
+const struct bpf_func_proto bpf_spin_unlock_proto = {
+	.func		= bpf_spin_unlock,
+	.gpl_only	= false,
+	.ret_type	= RET_VOID,
+	.arg1_type	= ARG_PTR_TO_SPIN_LOCK,
+};
+
 #ifdef CONFIG_CGROUPS
 BPF_CALL_0(bpf_get_current_cgroup_id)
 {
...
@@ -37,6 +37,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
 		return ERR_PTR(-EINVAL);
 	}
 
+	if (map_value_has_spin_lock(inner_map)) {
+		fdput(f);
+		return ERR_PTR(-ENOTSUPP);
+	}
+
 	inner_map_meta_size = sizeof(*inner_map_meta);
 	/* In some cases verifier needs to access beyond just base map. */
 	if (inner_map->ops == &array_map_ops)
...
@@ -463,7 +463,7 @@ int map_check_no_btf(const struct bpf_map *map,
 	return -ENOTSUPP;
 }
 
-static int map_check_btf(const struct bpf_map *map, const struct btf *btf,
+static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 			 u32 btf_key_id, u32 btf_value_id)
 {
 	const struct btf_type *key_type, *value_type;
@@ -478,6 +478,21 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
 	if (!value_type || value_size != map->value_size)
 		return -EINVAL;
 
+	map->spin_lock_off = btf_find_spin_lock(btf, value_type);
+
+	if (map_value_has_spin_lock(map)) {
+		if (map->map_type != BPF_MAP_TYPE_HASH &&
+		    map->map_type != BPF_MAP_TYPE_ARRAY)
+			return -ENOTSUPP;
+		if (map->spin_lock_off + sizeof(struct bpf_spin_lock) >
+		    map->value_size) {
+			WARN_ONCE(1,
+				  "verifier bug spin_lock_off %d value_size %d\n",
+				  map->spin_lock_off, map->value_size);
+			return -EFAULT;
+		}
+	}
+
 	if (map->ops->map_check_btf)
 		ret = map->ops->map_check_btf(map, btf, key_type, value_type);
@@ -542,6 +557,8 @@ static int map_create(union bpf_attr *attr)
 		map->btf = btf;
 		map->btf_key_type_id = attr->btf_key_type_id;
 		map->btf_value_type_id = attr->btf_value_type_id;
+	} else {
+		map->spin_lock_off = -EINVAL;
 	}
 
 	err = security_bpf_map_alloc(map);
@@ -740,7 +757,7 @@ static int map_lookup_elem(union bpf_attr *attr)
 			err = -ENOENT;
 		} else {
 			err = 0;
-			memcpy(value, ptr, value_size);
+			copy_map_value(map, value, ptr);
 		}
 		rcu_read_unlock();
 	}
...
@@ -213,6 +213,7 @@ struct bpf_call_arg_meta {
 	s64 msize_smax_value;
 	u64 msize_umax_value;
 	int ptr_id;
+	int func_id;
 };
 
 static DEFINE_MUTEX(bpf_verifier_lock);
@@ -351,6 +352,12 @@ static bool reg_is_refcounted(const struct bpf_reg_state *reg)
 	return type_is_refcounted(reg->type);
 }
 
+static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg)
+{
+	return reg->type == PTR_TO_MAP_VALUE &&
+		map_value_has_spin_lock(reg->map_ptr);
+}
+
 static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg)
 {
 	return type_is_refcounted_or_null(reg->type);
@@ -712,6 +719,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
 	}
 	dst_state->speculative = src->speculative;
 	dst_state->curframe = src->curframe;
+	dst_state->active_spin_lock = src->active_spin_lock;
 	for (i = 0; i <= src->curframe; i++) {
 		dst = dst_state->frame[i];
 		if (!dst) {
@@ -1483,6 +1491,21 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
 	if (err)
 		verbose(env, "R%d max value is outside of the array range\n",
 			regno);
+
+	if (map_value_has_spin_lock(reg->map_ptr)) {
+		u32 lock = reg->map_ptr->spin_lock_off;
+
+		/* if any part of struct bpf_spin_lock can be touched by
+		 * load/store reject this program.
+		 * To check that [x1, x2) overlaps with [y1, y2)
+		 * it is sufficient to check x1 < y2 && y1 < x2.
+		 */
+		if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) &&
+		     lock < reg->umax_value + off + size) {
+			verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n");
+			return -EACCES;
+		}
+	}
 	return err;
 }
@@ -2192,6 +2215,91 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 	}
 }
 
+/* Implementation details:
+ * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL
+ * Two bpf_map_lookups (even with the same key) will have different reg->id.
+ * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after
+ * value_or_null->value transition, since the verifier only cares about
+ * the range of access to valid map value pointer and doesn't care about actual
+ * address of the map element.
+ * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps
+ * reg->id > 0 after value_or_null->value transition. By doing so
+ * two bpf_map_lookups will be considered two different pointers that
+ * point to different bpf_spin_locks.
+ * The verifier allows taking only one bpf_spin_lock at a time to avoid
+ * dead-locks.
+ * Since only one bpf_spin_lock is allowed the checks are simpler than
+ * reg_is_refcounted() logic. The verifier needs to remember only
+ * one spin_lock instead of array of acquired_refs.
+ * cur_state->active_spin_lock remembers which map value element got locked
+ * and clears it after bpf_spin_unlock.
+ */
+static int process_spin_lock(struct bpf_verifier_env *env, int regno,
+			     bool is_lock)
+{
+	struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
+	struct bpf_verifier_state *cur = env->cur_state;
+	bool is_const = tnum_is_const(reg->var_off);
+	struct bpf_map *map = reg->map_ptr;
+	u64 val = reg->var_off.value;
+
+	if (reg->type != PTR_TO_MAP_VALUE) {
+		verbose(env, "R%d is not a pointer to map_value\n", regno);
+		return -EINVAL;
+	}
+	if (!is_const) {
+		verbose(env,
+			"R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n",
+			regno);
+		return -EINVAL;
+	}
+	if (!map->btf) {
+		verbose(env,
+			"map '%s' has to have BTF in order to use bpf_spin_lock\n",
+			map->name);
+		return -EINVAL;
+	}
+	if (!map_value_has_spin_lock(map)) {
+		if (map->spin_lock_off == -E2BIG)
+			verbose(env,
+				"map '%s' has more than one 'struct bpf_spin_lock'\n",
+				map->name);
+		else if (map->spin_lock_off == -ENOENT)
+			verbose(env,
+				"map '%s' doesn't have 'struct bpf_spin_lock'\n",
+				map->name);
+		else
+			verbose(env,
+				"map '%s' is not a struct type or bpf_spin_lock is mangled\n",
+				map->name);
+		return -EINVAL;
+	}
+	if (map->spin_lock_off != val + reg->off) {
+		verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n",
+			val + reg->off);
+		return -EINVAL;
+	}
+	if (is_lock) {
+		if (cur->active_spin_lock) {
+			verbose(env,
+				"Locking two bpf_spin_locks are not allowed\n");
+			return -EINVAL;
+		}
+		cur->active_spin_lock = reg->id;
+	} else {
+		if (!cur->active_spin_lock) {
+			verbose(env, "bpf_spin_unlock without taking a lock\n");
+			return -EINVAL;
+		}
+		if (cur->active_spin_lock != reg->id) {
+			verbose(env, "bpf_spin_unlock of different lock\n");
+			return -EINVAL;
+		}
+		cur->active_spin_lock = 0;
+	}
+	return 0;
+}
+
 static bool arg_type_is_mem_ptr(enum bpf_arg_type type)
 {
 	return type == ARG_PTR_TO_MEM ||
@@ -2268,6 +2376,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
 			return -EFAULT;
 		}
 		meta->ptr_id = reg->id;
+	} else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
+		if (meta->func_id == BPF_FUNC_spin_lock) {
+			if (process_spin_lock(env, regno, true))
+				return -EACCES;
+		} else if (meta->func_id == BPF_FUNC_spin_unlock) {
+			if (process_spin_lock(env, regno, false))
+				return -EACCES;
+		} else {
+			verbose(env, "verifier internal error\n");
+			return -EFAULT;
+		}
 	} else if (arg_type_is_mem_ptr(arg_type)) {
 		expected_type = PTR_TO_STACK;
 		/* One exception here. In case function allows for NULL to be
@@ -2887,6 +3006,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		return err;
 	}
 
+	meta.func_id = func_id;
 	/* check args */
 	err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta);
 	if (err)
@@ -4473,7 +4593,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 	} else if (reg->type == PTR_TO_SOCKET_OR_NULL) {
 		reg->type = PTR_TO_SOCKET;
 	}
-	if (is_null || !reg_is_refcounted(reg)) {
+	if (is_null || !(reg_is_refcounted(reg) ||
+			 reg_may_point_to_spin_lock(reg))) {
 		/* We don't need id from this point onwards anymore,
 		 * thus we should better reset it, so that state
 		 * pruning has chances to take effect.
@@ -4871,6 +4992,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
 			return err;
 	}
 
+	if (env->cur_state->active_spin_lock) {
+		verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n");
+		return -EINVAL;
+	}
+
 	if (regs[BPF_REG_6].type != PTR_TO_CTX) {
 		verbose(env,
 			"at the time of BPF_LD_ABS|IND R6 != pointer to skb\n");
@@ -5607,8 +5733,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
 	case PTR_TO_MAP_VALUE:
 		/* If the new min/max/var_off satisfy the old ones and
 		 * everything else matches, we are OK.
-		 * We don't care about the 'id' value, because nothing
-		 * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL)
+		 * 'id' is not compared, since it's only used for maps with
+		 * bpf_spin_lock inside map element and in such cases if
+		 * the rest of the prog is valid for one map element then
+		 * it's valid for all map elements regardless of the key
+		 * used in bpf_map_lookup()
 		 */
 		return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 &&
 		       range_within(rold, rcur) &&
@@ -5811,6 +5940,9 @@ static bool states_equal(struct bpf_verifier_env *env,
 	if (old->speculative && !cur->speculative)
 		return false;
 
+	if (old->active_spin_lock != cur->active_spin_lock)
+		return false;
+
 	/* for states to be equal callsites have to be the same
 	 * and all frame states need to be equivalent
 	 */
@@ -6229,6 +6361,12 @@ static int do_check(struct bpf_verifier_env *env)
 				return -EINVAL;
 			}
 
+			if (env->cur_state->active_spin_lock &&
+			    (insn->src_reg == BPF_PSEUDO_CALL ||
+			     insn->imm != BPF_FUNC_spin_unlock)) {
+				verbose(env, "function calls are not allowed while holding a lock\n");
+				return -EINVAL;
+			}
 			if (insn->src_reg == BPF_PSEUDO_CALL)
 				err = check_func_call(env, insn, &env->insn_idx);
 			else
@@ -6259,6 +6397,11 @@ static int do_check(struct bpf_verifier_env *env)
 					return -EINVAL;
 				}
 
+				if (env->cur_state->active_spin_lock) {
+					verbose(env, "bpf_spin_unlock is missing\n");
+					return -EINVAL;
+				}
+
 				if (state->curframe) {
 					/* exit from nested function */
 					env->prev_insn_idx = env->insn_idx;
@@ -6356,6 +6499,19 @@ static int check_map_prealloc(struct bpf_map *map)
 	       !(map->map_flags & BPF_F_NO_PREALLOC);
 }
 
+static bool is_tracing_prog_type(enum bpf_prog_type type)
+{
+	switch (type) {
+	case BPF_PROG_TYPE_KPROBE:
+	case BPF_PROG_TYPE_TRACEPOINT:
+	case BPF_PROG_TYPE_PERF_EVENT:
+	case BPF_PROG_TYPE_RAW_TRACEPOINT:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 					struct bpf_map *map,
 					struct bpf_prog *prog)
@@ -6378,6 +6534,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
 		}
 	}
 
+	if ((is_tracing_prog_type(prog->type) ||
+	     prog->type == BPF_PROG_TYPE_SOCKET_FILTER) &&
+	    map_value_has_spin_lock(map)) {
+		verbose(env, "tracing progs cannot use bpf_spin_lock yet\n");
+		return -EINVAL;
+	}
+
 	if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) &&
 	    !bpf_offload_prog_map_match(prog, map)) {
 		verbose(env, "offload device mismatch between prog and map\n");
...
@@ -5314,10 +5314,20 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		return &bpf_tail_call_proto;
 	case BPF_FUNC_ktime_get_ns:
 		return &bpf_ktime_get_ns_proto;
+	default:
+		break;
+	}
+
+	if (!capable(CAP_SYS_ADMIN))
+		return NULL;
+
+	switch (func_id) {
+	case BPF_FUNC_spin_lock:
+		return &bpf_spin_lock_proto;
+	case BPF_FUNC_spin_unlock:
+		return &bpf_spin_unlock_proto;
 	case BPF_FUNC_trace_printk:
-		if (capable(CAP_SYS_ADMIN))
-			return bpf_get_trace_printk_proto();
-		/* else, fall through */
+		return bpf_get_trace_printk_proto();
 	default:
 		return NULL;
 	}
...