Commit d8eca5bb authored by Daniel Borkmann's avatar Daniel Borkmann Committed by Alexei Starovoitov

bpf: implement lookup-free direct value access for maps

This generic extension to BPF maps allows for directly loading
an address residing inside a BPF map value as a single BPF
ldimm64 instruction!

The idea is similar to what BPF_PSEUDO_MAP_FD does today, which
is a special src_reg flag for ldimm64 instruction that indicates
that inside the first part of the double insns's imm field is a
file descriptor which the verifier then replaces as a full 64bit
address of the map into both imm parts. For the newly added
BPF_PSEUDO_MAP_VALUE src_reg flag, the idea is the following:
the first part of the double insns's imm field is again a file
descriptor corresponding to the map, and the second part of the
imm field is an offset into the value. The verifier will then
replace both imm parts with an address that points into the BPF
map value at the given value offset for maps that support this
operation. Currently supported is array map with single entry.
It is possible to support more than just single map element by
reusing both 16bit off fields of the insns as a map index, so
full array map lookup could be expressed that way. It hasn't
been implemented here due to lack of concrete use case, but
could easily be done so in future in a compatible way, since
both off fields right now have to be 0 and would correctly
denote a map index 0.

The BPF_PSEUDO_MAP_VALUE is a distinct flag as otherwise with
BPF_PSEUDO_MAP_FD we could not differ offset 0 between load of
map pointer versus load of map's value at offset 0, and changing
BPF_PSEUDO_MAP_FD's encoding into off by one to differ between
regular map pointer and map value pointer would add unnecessary
complexity and increases barrier for debugability thus less
suitable. Using the second part of the imm field as an offset
into the value does /not/ come with limitations since maximum
possible value size is in u32 universe anyway.

This optimization allows for efficiently retrieving an address
to a map value memory area without having to issue a helper call
which needs to prepare registers according to calling convention,
etc, without needing the extra NULL test, and without having to
add the offset in an additional instruction to the value base
pointer. The verifier then treats the destination register as
PTR_TO_MAP_VALUE with constant reg->off from the user passed
offset from the second imm field, and guarantees that this is
within bounds of the map value. Any subsequent operations are
normally treated as typical map value handling without anything
extra needed from verification side.

The two map operations for direct value access have been added to
array map for now. In future other types could be supported as
well depending on the use case. The main use case for this commit
is to allow for BPF loader support for global variables that
reside in .data/.rodata/.bss sections such that we can directly
load the address of them with minimal additional infrastructure
required. Loader support has been added in subsequent commits for
libbpf library.
Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parent ff466b58
...@@ -57,6 +57,12 @@ struct bpf_map_ops { ...@@ -57,6 +57,12 @@ struct bpf_map_ops {
const struct btf *btf, const struct btf *btf,
const struct btf_type *key_type, const struct btf_type *key_type,
const struct btf_type *value_type); const struct btf_type *value_type);
/* Direct value access helpers. */
int (*map_direct_value_addr)(const struct bpf_map *map,
u64 *imm, u32 off);
int (*map_direct_value_meta)(const struct bpf_map *map,
u64 imm, u32 *off);
}; };
struct bpf_map { struct bpf_map {
......
...@@ -224,6 +224,10 @@ struct bpf_insn_aux_data { ...@@ -224,6 +224,10 @@ struct bpf_insn_aux_data {
unsigned long map_state; /* pointer/poison value for maps */ unsigned long map_state; /* pointer/poison value for maps */
s32 call_imm; /* saved imm field of call insn */ s32 call_imm; /* saved imm field of call insn */
u32 alu_limit; /* limit for add/sub register with pointer */ u32 alu_limit; /* limit for add/sub register with pointer */
struct {
u32 map_index; /* index into used_maps[] */
u32 map_off; /* offset from value base address */
};
}; };
int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
int sanitize_stack_off; /* stack slot to be cleared */ int sanitize_stack_off; /* stack slot to be cleared */
......
...@@ -255,8 +255,19 @@ enum bpf_attach_type { ...@@ -255,8 +255,19 @@ enum bpf_attach_type {
*/ */
#define BPF_F_ANY_ALIGNMENT (1U << 1) #define BPF_F_ANY_ALIGNMENT (1U << 1)
/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ /* When BPF ldimm64's insn[0].src_reg != 0 then this can have
* two extensions:
*
* insn[0].src_reg: BPF_PSEUDO_MAP_FD BPF_PSEUDO_MAP_VALUE
* insn[0].imm: map fd map fd
* insn[1].imm: 0 offset into value
* insn[0].off: 0 0
* insn[1].off: 0 0
* ldimm64 rewrite: address of map address of map[0]+offset
* verifier type: CONST_PTR_TO_MAP PTR_TO_MAP_VALUE
*/
#define BPF_PSEUDO_MAP_FD 1 #define BPF_PSEUDO_MAP_FD 1
#define BPF_PSEUDO_MAP_VALUE 2
/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
* offset to another bpf function * offset to another bpf function
......
...@@ -160,6 +160,36 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) ...@@ -160,6 +160,36 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
return array->value + array->elem_size * (index & array->index_mask); return array->value + array->elem_size * (index & array->index_mask);
} }
static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
u32 off)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
if (map->max_entries != 1)
return -ENOTSUPP;
if (off >= map->value_size)
return -EINVAL;
*imm = (unsigned long)array->value;
return 0;
}
static int array_map_direct_value_meta(const struct bpf_map *map, u64 imm,
u32 *off)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
u64 base = (unsigned long)array->value;
u64 range = array->elem_size;
if (map->max_entries != 1)
return -ENOTSUPP;
if (imm < base || imm >= base + range)
return -ENOENT;
*off = imm - base;
return 0;
}
/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */
static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{ {
...@@ -419,6 +449,8 @@ const struct bpf_map_ops array_map_ops = { ...@@ -419,6 +449,8 @@ const struct bpf_map_ops array_map_ops = {
.map_update_elem = array_map_update_elem, .map_update_elem = array_map_update_elem,
.map_delete_elem = array_map_delete_elem, .map_delete_elem = array_map_delete_elem,
.map_gen_lookup = array_map_gen_lookup, .map_gen_lookup = array_map_gen_lookup,
.map_direct_value_addr = array_map_direct_value_addr,
.map_direct_value_meta = array_map_direct_value_meta,
.map_seq_show_elem = array_map_seq_show_elem, .map_seq_show_elem = array_map_seq_show_elem,
.map_check_btf = array_map_check_btf, .map_check_btf = array_map_check_btf,
}; };
......
...@@ -292,7 +292,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) ...@@ -292,7 +292,8 @@ int bpf_prog_calc_tag(struct bpf_prog *fp)
dst[i] = fp->insnsi[i]; dst[i] = fp->insnsi[i];
if (!was_ld_map && if (!was_ld_map &&
dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) && dst[i].code == (BPF_LD | BPF_IMM | BPF_DW) &&
dst[i].src_reg == BPF_PSEUDO_MAP_FD) { (dst[i].src_reg == BPF_PSEUDO_MAP_FD ||
dst[i].src_reg == BPF_PSEUDO_MAP_VALUE)) {
was_ld_map = true; was_ld_map = true;
dst[i].imm = 0; dst[i].imm = 0;
} else if (was_ld_map && } else if (was_ld_map &&
......
...@@ -205,10 +205,11 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, ...@@ -205,10 +205,11 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
* part of the ldimm64 insn is accessible. * part of the ldimm64 insn is accessible.
*/ */
u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; bool is_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD ||
insn->src_reg == BPF_PSEUDO_MAP_VALUE;
char tmp[64]; char tmp[64];
if (map_ptr && !allow_ptr_leaks) if (is_ptr && !allow_ptr_leaks)
imm = 0; imm = 0;
verbose(cbs->private_data, "(%02x) r%d = %s\n", verbose(cbs->private_data, "(%02x) r%d = %s\n",
......
...@@ -2072,13 +2072,26 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) ...@@ -2072,13 +2072,26 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
} }
static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog,
unsigned long addr) unsigned long addr, u32 *off,
u32 *type)
{ {
const struct bpf_map *map;
int i; int i;
for (i = 0; i < prog->aux->used_map_cnt; i++) for (i = 0, *off = 0; i < prog->aux->used_map_cnt; i++) {
if (prog->aux->used_maps[i] == (void *)addr) map = prog->aux->used_maps[i];
return prog->aux->used_maps[i]; if (map == (void *)addr) {
*type = BPF_PSEUDO_MAP_FD;
return map;
}
if (!map->ops->map_direct_value_meta)
continue;
if (!map->ops->map_direct_value_meta(map, addr, off)) {
*type = BPF_PSEUDO_MAP_VALUE;
return map;
}
}
return NULL; return NULL;
} }
...@@ -2086,6 +2099,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) ...@@ -2086,6 +2099,7 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
{ {
const struct bpf_map *map; const struct bpf_map *map;
struct bpf_insn *insns; struct bpf_insn *insns;
u32 off, type;
u64 imm; u64 imm;
int i; int i;
...@@ -2113,11 +2127,11 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) ...@@ -2113,11 +2127,11 @@ static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog)
continue; continue;
imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm;
map = bpf_map_from_imm(prog, imm); map = bpf_map_from_imm(prog, imm, &off, &type);
if (map) { if (map) {
insns[i].src_reg = BPF_PSEUDO_MAP_FD; insns[i].src_reg = type;
insns[i].imm = map->id; insns[i].imm = map->id;
insns[i + 1].imm = 0; insns[i + 1].imm = off;
continue; continue;
} }
} }
......
...@@ -5056,18 +5056,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, ...@@ -5056,18 +5056,12 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
return 0; return 0;
} }
/* return the map pointer stored inside BPF_LD_IMM64 instruction */
static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
{
u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
return (struct bpf_map *) (unsigned long) imm64;
}
/* verify BPF_LD_IMM64 instruction */ /* verify BPF_LD_IMM64 instruction */
static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{ {
struct bpf_insn_aux_data *aux = cur_aux(env);
struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *regs = cur_regs(env);
struct bpf_map *map;
int err; int err;
if (BPF_SIZE(insn->code) != BPF_DW) { if (BPF_SIZE(insn->code) != BPF_DW) {
...@@ -5091,11 +5085,22 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) ...@@ -5091,11 +5085,22 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0; return 0;
} }
/* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */ map = env->used_maps[aux->map_index];
BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD); mark_reg_known_zero(env, regs, insn->dst_reg);
regs[insn->dst_reg].map_ptr = map;
if (insn->src_reg == BPF_PSEUDO_MAP_VALUE) {
regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
regs[insn->dst_reg].off = aux->map_off;
if (map_value_has_spin_lock(map))
regs[insn->dst_reg].id = ++env->id_gen;
} else if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
regs[insn->dst_reg].type = CONST_PTR_TO_MAP; regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn); } else {
verbose(env, "bpf verifier is misconfigured\n");
return -EINVAL;
}
return 0; return 0;
} }
...@@ -6803,8 +6808,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) ...@@ -6803,8 +6808,10 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
} }
if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
struct bpf_insn_aux_data *aux;
struct bpf_map *map; struct bpf_map *map;
struct fd f; struct fd f;
u64 addr;
if (i == insn_cnt - 1 || insn[1].code != 0 || if (i == insn_cnt - 1 || insn[1].code != 0 ||
insn[1].dst_reg != 0 || insn[1].src_reg != 0 || insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
...@@ -6813,13 +6820,19 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) ...@@ -6813,13 +6820,19 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
return -EINVAL; return -EINVAL;
} }
if (insn->src_reg == 0) if (insn[0].src_reg == 0)
/* valid generic load 64-bit imm */ /* valid generic load 64-bit imm */
goto next_insn; goto next_insn;
if (insn[0].src_reg != BPF_PSEUDO_MAP_FD || /* In final convert_pseudo_ld_imm64() step, this is
insn[1].imm != 0) { * converted into regular 64-bit imm load insn.
verbose(env, "unrecognized bpf_ld_imm64 insn\n"); */
if ((insn[0].src_reg != BPF_PSEUDO_MAP_FD &&
insn[0].src_reg != BPF_PSEUDO_MAP_VALUE) ||
(insn[0].src_reg == BPF_PSEUDO_MAP_FD &&
insn[1].imm != 0)) {
verbose(env,
"unrecognized bpf_ld_imm64 insn\n");
return -EINVAL; return -EINVAL;
} }
...@@ -6837,16 +6850,47 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) ...@@ -6837,16 +6850,47 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
return err; return err;
} }
/* store map pointer inside BPF_LD_IMM64 instruction */ aux = &env->insn_aux_data[i];
insn[0].imm = (u32) (unsigned long) map; if (insn->src_reg == BPF_PSEUDO_MAP_FD) {
insn[1].imm = ((u64) (unsigned long) map) >> 32; addr = (unsigned long)map;
} else {
u32 off = insn[1].imm;
if (off >= BPF_MAX_VAR_OFF) {
verbose(env, "direct value offset of %u is not allowed\n", off);
fdput(f);
return -EINVAL;
}
if (!map->ops->map_direct_value_addr) {
verbose(env, "no direct value access support for this map type\n");
fdput(f);
return -EINVAL;
}
err = map->ops->map_direct_value_addr(map, &addr, off);
if (err) {
verbose(env, "invalid access to map value pointer, value_size=%u off=%u\n",
map->value_size, off);
fdput(f);
return err;
}
aux->map_off = off;
addr += off;
}
insn[0].imm = (u32)addr;
insn[1].imm = addr >> 32;
/* check whether we recorded this map already */ /* check whether we recorded this map already */
for (j = 0; j < env->used_map_cnt; j++) for (j = 0; j < env->used_map_cnt; j++) {
if (env->used_maps[j] == map) { if (env->used_maps[j] == map) {
aux->map_index = j;
fdput(f); fdput(f);
goto next_insn; goto next_insn;
} }
}
if (env->used_map_cnt >= MAX_USED_MAPS) { if (env->used_map_cnt >= MAX_USED_MAPS) {
fdput(f); fdput(f);
...@@ -6863,6 +6907,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) ...@@ -6863,6 +6907,8 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
fdput(f); fdput(f);
return PTR_ERR(map); return PTR_ERR(map);
} }
aux->map_index = env->used_map_cnt;
env->used_maps[env->used_map_cnt++] = map; env->used_maps[env->used_map_cnt++] = map;
if (bpf_map_is_cgroup_storage(map) && if (bpf_map_is_cgroup_storage(map) &&
......
...@@ -195,6 +195,9 @@ static const char *print_imm(void *private_data, ...@@ -195,6 +195,9 @@ static const char *print_imm(void *private_data,
if (insn->src_reg == BPF_PSEUDO_MAP_FD) if (insn->src_reg == BPF_PSEUDO_MAP_FD)
snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
"map[id:%u]", insn->imm); "map[id:%u]", insn->imm);
else if (insn->src_reg == BPF_PSEUDO_MAP_VALUE)
snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
"map[id:%u][0]+%u", insn->imm, (insn + 1)->imm);
else else
snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), snprintf(dd->scratch_buff, sizeof(dd->scratch_buff),
"0x%llx", (unsigned long long)full_imm); "0x%llx", (unsigned long long)full_imm);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment