Commit 20c09d92 authored by Alexei Starovoitov's avatar Alexei Starovoitov Committed by Daniel Borkmann

bpf: Introduce kptr_rcu.

The lifetime of certain kernel structures like 'struct cgroup' is protected by RCU.
Hence it's safe to dereference them directly from __kptr tagged pointers in bpf maps.
The resulting pointer is MEM_RCU and can be passed to kfuncs that expect KF_RCU.
Dereference of other kptrs returns PTR_UNTRUSTED.

For example:
struct map_value {
   struct cgroup __kptr *cgrp;
};

SEC("tp_btf/cgroup_mkdir")
int BPF_PROG(test_cgrp_get_ancestors, struct cgroup *cgrp_arg, const char *path)
{
  struct cgroup *cg, *cg2;

  cg = bpf_cgroup_acquire(cgrp_arg); // cg is PTR_TRUSTED and ref_obj_id > 0
  bpf_kptr_xchg(&v->cgrp, cg);

  cg2 = v->cgrp; // This is new feature introduced by this patch.
  // cg2 is PTR_MAYBE_NULL | MEM_RCU.
  // When cg2 != NULL, it's a valid cgroup, but its percpu_ref could be zero

  if (cg2)
    bpf_cgroup_ancestor(cg2, level); // safe to do.
}
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
Acked-by: default avatarTejun Heo <tj@kernel.org>
Acked-by: default avatarDavid Vernet <void@manifault.com>
Link: https://lore.kernel.org/bpf/20230303041446.3630-4-alexei.starovoitov@gmail.com
parent 8d093b4e
...@@ -249,11 +249,13 @@ added later. ...@@ -249,11 +249,13 @@ added later.
2.4.8 KF_RCU flag 2.4.8 KF_RCU flag
----------------- -----------------
The KF_RCU flag is used for kfuncs which have a rcu ptr as its argument. The KF_RCU flag is a weaker version of KF_TRUSTED_ARGS. The kfuncs marked with
When used together with KF_ACQUIRE, it indicates the kfunc should have a KF_RCU expect either PTR_TRUSTED or MEM_RCU arguments. The verifier guarantees
single argument which must be a trusted argument or a MEM_RCU pointer. that the objects are valid and there is no use-after-free. The pointers are not
The argument may have reference count of 0 and the kfunc must take this NULL, but the object's refcount could have reached zero. The kfuncs need to
into consideration. consider doing refcnt != 0 check, especially when returning a KF_ACQUIRE
pointer. Note as well that a KF_ACQUIRE kfunc that is KF_RCU should very likely
also be KF_RET_NULL.
.. _KF_deprecated_flag: .. _KF_deprecated_flag:
......
...@@ -70,7 +70,7 @@ ...@@ -70,7 +70,7 @@
#define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */ #define KF_TRUSTED_ARGS (1 << 4) /* kfunc only takes trusted pointer arguments */
#define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */ #define KF_SLEEPABLE (1 << 5) /* kfunc may sleep */
#define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */ #define KF_DESTRUCTIVE (1 << 6) /* kfunc performs destructive actions */
#define KF_RCU (1 << 7) /* kfunc only takes rcu pointer arguments */ #define KF_RCU (1 << 7) /* kfunc takes either rcu or trusted pointer arguments */
/* /*
* Tag marking a kernel function as a kfunc. This is meant to minimize the * Tag marking a kernel function as a kfunc. This is meant to minimize the
......
...@@ -2163,8 +2163,10 @@ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) ...@@ -2163,8 +2163,10 @@ __bpf_kfunc struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level)
if (level > cgrp->level || level < 0) if (level > cgrp->level || level < 0)
return NULL; return NULL;
/* cgrp's refcnt could be 0 here, but ancestors can still be accessed */
ancestor = cgrp->ancestors[level]; ancestor = cgrp->ancestors[level];
cgroup_get(ancestor); if (!cgroup_tryget(ancestor))
return NULL;
return ancestor; return ancestor;
} }
...@@ -2382,7 +2384,7 @@ BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL) ...@@ -2382,7 +2384,7 @@ BTF_ID_FLAGS(func, bpf_rbtree_first, KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_cgroup_acquire, KF_ACQUIRE | KF_TRUSTED_ARGS)
BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_kptr_get, KF_ACQUIRE | KF_KPTR_GET | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_cgroup_release, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_ancestor, KF_ACQUIRE | KF_RCU | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_cgroup_from_id, KF_ACQUIRE | KF_RET_NULL)
#endif #endif
BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_task_from_pid, KF_ACQUIRE | KF_RET_NULL)
......
...@@ -4218,7 +4218,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, ...@@ -4218,7 +4218,7 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, u32 regno) struct bpf_reg_state *reg, u32 regno)
{ {
const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id); const char *targ_name = kernel_type_name(kptr_field->kptr.btf, kptr_field->kptr.btf_id);
int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED; int perm_flags = PTR_MAYBE_NULL | PTR_TRUSTED | MEM_RCU;
const char *reg_name = ""; const char *reg_name = "";
/* Only unreferenced case accepts untrusted pointers */ /* Only unreferenced case accepts untrusted pointers */
...@@ -4285,6 +4285,34 @@ static int map_kptr_match_type(struct bpf_verifier_env *env, ...@@ -4285,6 +4285,34 @@ static int map_kptr_match_type(struct bpf_verifier_env *env,
return -EINVAL; return -EINVAL;
} }
/* The non-sleepable programs and sleepable programs with explicit bpf_rcu_read_lock()
* can dereference RCU protected pointers and result is PTR_TRUSTED.
*/
static bool in_rcu_cs(struct bpf_verifier_env *env)
{
return env->cur_state->active_rcu_lock || !env->prog->aux->sleepable;
}
/* Once GCC supports btf_type_tag the following mechanism will be replaced with tag check */
BTF_SET_START(rcu_protected_types)
BTF_ID(struct, prog_test_ref_kfunc)
BTF_ID(struct, cgroup)
BTF_SET_END(rcu_protected_types)
static bool rcu_protected_object(const struct btf *btf, u32 btf_id)
{
if (!btf_is_kernel(btf))
return false;
return btf_id_set_contains(&rcu_protected_types, btf_id);
}
static bool rcu_safe_kptr(const struct btf_field *field)
{
const struct btf_field_kptr *kptr = &field->kptr;
return field->type == BPF_KPTR_REF && rcu_protected_object(kptr->btf, kptr->btf_id);
}
static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
int value_regno, int insn_idx, int value_regno, int insn_idx,
struct btf_field *kptr_field) struct btf_field *kptr_field)
...@@ -4319,7 +4347,10 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno, ...@@ -4319,7 +4347,10 @@ static int check_map_kptr_access(struct bpf_verifier_env *env, u32 regno,
* value from map as PTR_TO_BTF_ID, with the correct type. * value from map as PTR_TO_BTF_ID, with the correct type.
*/ */
mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf, mark_btf_ld_reg(env, cur_regs(env), value_regno, PTR_TO_BTF_ID, kptr_field->kptr.btf,
kptr_field->kptr.btf_id, PTR_MAYBE_NULL | PTR_UNTRUSTED); kptr_field->kptr.btf_id,
rcu_safe_kptr(kptr_field) && in_rcu_cs(env) ?
PTR_MAYBE_NULL | MEM_RCU :
PTR_MAYBE_NULL | PTR_UNTRUSTED);
/* For mark_ptr_or_null_reg */ /* For mark_ptr_or_null_reg */
val_reg->id = ++env->id_gen; val_reg->id = ++env->id_gen;
} else if (class == BPF_STX) { } else if (class == BPF_STX) {
...@@ -5163,10 +5194,17 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, ...@@ -5163,10 +5194,17 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
* An RCU-protected pointer can also be deemed trusted if we are in an * An RCU-protected pointer can also be deemed trusted if we are in an
* RCU read region. This case is handled below. * RCU read region. This case is handled below.
*/ */
if (nested_ptr_is_trusted(env, reg, off)) if (nested_ptr_is_trusted(env, reg, off)) {
flag |= PTR_TRUSTED; flag |= PTR_TRUSTED;
else /*
* task->cgroups is trusted. It provides a stronger guarantee
* than __rcu tag on 'cgroups' field in 'struct task_struct'.
* Clear MEM_RCU in such case.
*/
flag &= ~MEM_RCU;
} else {
flag &= ~PTR_TRUSTED; flag &= ~PTR_TRUSTED;
}
if (flag & MEM_RCU) { if (flag & MEM_RCU) {
/* Mark value register as MEM_RCU only if it is protected by /* Mark value register as MEM_RCU only if it is protected by
...@@ -5175,11 +5213,10 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, ...@@ -5175,11 +5213,10 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env,
* read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since * read lock region. Also mark rcu pointer as PTR_MAYBE_NULL since
* it could be null in some cases. * it could be null in some cases.
*/ */
if (!env->cur_state->active_rcu_lock || if (in_rcu_cs(env) && (is_trusted_reg(reg) || is_rcu_reg(reg)))
!(is_trusted_reg(reg) || is_rcu_reg(reg)))
flag &= ~MEM_RCU;
else
flag |= PTR_MAYBE_NULL; flag |= PTR_MAYBE_NULL;
else
flag &= ~MEM_RCU;
} else if (reg->type & MEM_RCU) { } else if (reg->type & MEM_RCU) {
/* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged /* ptr (reg) is marked as MEM_RCU, but the struct field is not tagged
* with __rcu. Mark the flag as PTR_UNTRUSTED conservatively. * with __rcu. Mark the flag as PTR_UNTRUSTED conservatively.
...@@ -9676,7 +9713,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_ ...@@ -9676,7 +9713,7 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL; return -EINVAL;
} }
if (is_kfunc_trusted_args(meta) && if ((is_kfunc_trusted_args(meta) || is_kfunc_rcu(meta)) &&
(register_is_null(reg) || type_may_be_null(reg->type))) { (register_is_null(reg) || type_may_be_null(reg->type))) {
verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i); verbose(env, "Possibly NULL pointer passed to trusted arg%d\n", i);
return -EACCES; return -EACCES;
......
...@@ -737,6 +737,7 @@ __bpf_kfunc void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len) ...@@ -737,6 +737,7 @@ __bpf_kfunc void bpf_kfunc_call_test_mem_len_fail2(u64 *mem, int len)
__bpf_kfunc void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p) __bpf_kfunc void bpf_kfunc_call_test_ref(struct prog_test_ref_kfunc *p)
{ {
/* p != NULL, but p->cnt could be 0 */
} }
__bpf_kfunc void bpf_kfunc_call_test_destructive(void) __bpf_kfunc void bpf_kfunc_call_test_destructive(void)
...@@ -784,7 +785,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3) ...@@ -784,7 +785,7 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_test_fail3)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_pass1) BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_pass1)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1) BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail1)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2) BTF_ID_FLAGS(func, bpf_kfunc_call_test_mem_len_fail2)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_kfunc_call_test_ref, KF_TRUSTED_ARGS | KF_RCU)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_destructive, KF_DESTRUCTIVE)
BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg) BTF_ID_FLAGS(func, bpf_kfunc_call_test_static_unused_arg)
BTF_SET8_END(test_sk_check_kfunc_ids) BTF_SET8_END(test_sk_check_kfunc_ids)
......
...@@ -205,7 +205,7 @@ int BPF_PROG(cgrp_kfunc_get_unreleased, struct cgroup *cgrp, const char *path) ...@@ -205,7 +205,7 @@ int BPF_PROG(cgrp_kfunc_get_unreleased, struct cgroup *cgrp, const char *path)
} }
SEC("tp_btf/cgroup_mkdir") SEC("tp_btf/cgroup_mkdir")
__failure __msg("arg#0 is untrusted_ptr_or_null_ expected ptr_ or socket") __failure __msg("expects refcounted")
int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path) int BPF_PROG(cgrp_kfunc_release_untrusted, struct cgroup *cgrp, const char *path)
{ {
struct __cgrps_kfunc_map_value *v; struct __cgrps_kfunc_map_value *v;
......
...@@ -281,7 +281,7 @@ int reject_kptr_get_bad_type_match(struct __sk_buff *ctx) ...@@ -281,7 +281,7 @@ int reject_kptr_get_bad_type_match(struct __sk_buff *ctx)
} }
SEC("?tc") SEC("?tc")
__failure __msg("R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_") __failure __msg("R1 type=rcu_ptr_or_null_ expected=percpu_ptr_")
int mark_ref_as_untrusted_or_null(struct __sk_buff *ctx) int mark_ref_as_untrusted_or_null(struct __sk_buff *ctx)
{ {
struct map_value *v; struct map_value *v;
...@@ -316,7 +316,7 @@ int reject_untrusted_store_to_ref(struct __sk_buff *ctx) ...@@ -316,7 +316,7 @@ int reject_untrusted_store_to_ref(struct __sk_buff *ctx)
} }
SEC("?tc") SEC("?tc")
__failure __msg("R2 type=untrusted_ptr_ expected=ptr_") __failure __msg("R2 must be referenced")
int reject_untrusted_xchg(struct __sk_buff *ctx) int reject_untrusted_xchg(struct __sk_buff *ctx)
{ {
struct prog_test_ref_kfunc *p; struct prog_test_ref_kfunc *p;
......
...@@ -243,7 +243,7 @@ ...@@ -243,7 +243,7 @@
}, },
.result_unpriv = REJECT, .result_unpriv = REJECT,
.result = REJECT, .result = REJECT,
.errstr = "R1 must be referenced", .errstr = "R1 must be",
}, },
{ {
"calls: valid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID", "calls: valid kfunc call: referenced arg needs refcounted PTR_TO_BTF_ID",
......
...@@ -336,7 +336,7 @@ ...@@ -336,7 +336,7 @@
.prog_type = BPF_PROG_TYPE_SCHED_CLS, .prog_type = BPF_PROG_TYPE_SCHED_CLS,
.fixup_map_kptr = { 1 }, .fixup_map_kptr = { 1 },
.result = REJECT, .result = REJECT,
.errstr = "R1 type=untrusted_ptr_or_null_ expected=percpu_ptr_", .errstr = "R1 type=rcu_ptr_or_null_ expected=percpu_ptr_",
}, },
{ {
"map_kptr: ref: reject off != 0", "map_kptr: ref: reject off != 0",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment