Commit 519e1de9 authored by Alexei Starovoitov

Merge branch 'add-internal-only-bpf-per-cpu-instruction'

Andrii Nakryiko says:

====================
Add internal-only BPF per-CPU instruction

Add a new BPF instruction for resolving per-CPU memory addresses.

The new instruction is a special form of BPF_ALU64 | BPF_MOV | BPF_X, with
insn->off set to BPF_ADDR_PERCPU (== -1). It resolves the provided per-CPU offset
to the absolute address at which the per-CPU data resides for "this" CPU.
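
For illustration only, a minimal sketch of how kernel-side code could construct
the new instruction via the BPF_MOV64_PERCPU_REG() macro added in this series
(assuming <linux/filter.h> from this series; the register choice is arbitrary):

    /* r2 is assumed to already hold a per-CPU offset (a per-CPU pointer value);
     * after executing this instruction r2 holds the absolute address of that
     * data on the CPU the program is currently running on
     */
    struct bpf_insn insn = BPF_MOV64_PERCPU_REG(BPF_REG_2, BPF_REG_2);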

This patch set implements support for it in x86-64 BPF JIT only.

Using the new instruction, we also implement inlining for three cases:
  - bpf_get_smp_processor_id(), which avoids an unnecessary trivial function
    call, saving a bit of performance and also not polluting LBR records with
    unnecessary function call/return records (see the instruction sketch after
    this list);
  - PERCPU_ARRAY's bpf_map_lookup_elem() is completely inlined, bringing its
    performance on par with implementing per-CPU data structures using global
    variables in BPF (which is an awesome improvement, see benchmarks below);
  - PERCPU_HASH's bpf_map_lookup_elem() is partially inlined, just as is done
    for the non-PERCPU HASH map; this still saves a bit of overhead.
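
For the first case, the inlined replacement boils down to the three instructions
emitted by the verifier fixup further below (shown with the registers the
verifier actually uses):

    /* r0 = (u32)(unsigned long)&pcpu_hot.cpu_number -- the field's per-CPU offset */
    insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number);
    /* r0 = &pcpu_hot.cpu_number resolved for the current CPU (new instruction) */
    insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
    /* r0 = *(u32 *)r0 -- the CPU number itself */
    insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0);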

To validate performance benefits, I hacked together a tiny benchmark doing
only bpf_map_lookup_elem() and incrementing the value by 1 for PERCPU_ARRAY
(arr-inc benchmark below) and PERCPU_HASH (hash-inc benchmark below) maps. To
establish a baseline, I also implemented logic similar to PERCPU_ARRAY based
on global variable array using bpf_get_smp_processor_id() to index array for
current CPU (glob-arr-inc benchmark below).
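
For reference, a rough sketch of what the BPF side of the glob-arr-inc and
arr-inc loops could look like; this is illustrative only (program names, the
attach point, and MAX_CPUS are invented here and are not taken from the actual
benchmark sources):

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>

    #define MAX_CPUS 256                 /* hypothetical upper bound */

    /* glob-arr-inc: global array indexed by CPU id, one padded slot per CPU */
    struct cpu_slot {
        __u64 cnt;
    } __attribute__((aligned(128)));     /* pad slots to avoid false sharing */

    static struct cpu_slot hits[MAX_CPUS];

    SEC("raw_tp/sys_enter")
    int glob_arr_inc(void *ctx)
    {
        __u32 cpu = bpf_get_smp_processor_id();

        if (cpu < MAX_CPUS)              /* bounds check keeps the verifier happy */
            hits[cpu].cnt += 1;
        return 0;
    }

    /* arr-inc: the same counter kept in a single-slot PERCPU_ARRAY map */
    struct {
        __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
        __uint(max_entries, 1);
        __type(key, __u32);
        __type(value, __u64);
    } percpu_arr SEC(".maps");

    SEC("raw_tp/sys_enter")
    int arr_inc(void *ctx)
    {
        __u32 key = 0;
        __u64 *val;

        val = bpf_map_lookup_elem(&percpu_arr, &key);
        if (val)
            *val += 1;
        return 0;
    }

    char LICENSE[] SEC("license") = "GPL";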

BEFORE
======
glob-arr-inc   :  163.685 ± 0.092M/s
arr-inc        :  138.096 ± 0.160M/s
hash-inc       :   66.855 ± 0.123M/s

AFTER
=====
glob-arr-inc   :  173.921 ± 0.039M/s (+6%)
arr-inc        :  170.729 ± 0.210M/s (+23.7%)
hash-inc       :   68.673 ± 0.070M/s (+2.7%)

As can be seen, PERCPU_HASH gets a modest +2.7% improvement, while the global
array-based variant gets a nice +6% thanks to the inlining of bpf_get_smp_processor_id().

But what's really important is that the arr-inc benchmark basically catches up
with glob-arr-inc, resulting in a +23.7% improvement. This means that in
practice it won't be necessary to avoid PERCPU_ARRAY anymore if performance is
critical (e.g., high-frequency stats collection, which is often a practical use
for PERCPU_ARRAY today).

v1->v2:
  - use BPF_ALU64 | BPF_MOV instruction instead of LDX (Alexei);
  - dropped the direct per-CPU memory read instruction, it can always be added
    back, if necessary;
  - guarded bpf_get_smp_processor_id() inlining behind an x86-64 check (Alexei);
  - switched all per-cpu addr casts to (unsigned long) to avoid sparse
    warnings.
====================

Link: https://lore.kernel.org/r/20240402021307.1012571-1-andrii@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 2e114248 0b56e637
arch/x86/net/bpf_jit_comp.c
@@ -1382,6 +1382,17 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image
			maybe_emit_mod(&prog, AUX_REG, dst_reg, true);
			EMIT3(0x0F, 0x44, add_2reg(0xC0, AUX_REG, dst_reg));
			break;
		} else if (insn_is_mov_percpu_addr(insn)) {
			u32 off = (u32)(unsigned long)&this_cpu_off;

			/* mov <dst>, <src> (if necessary) */
			EMIT_mov(dst_reg, src_reg);

			/* add <dst>, gs:[<off>] */
			EMIT2(0x65, add_1mod(0x48, dst_reg));
			EMIT3(0x03, add_1reg(0x04, dst_reg), 0x25);
			EMIT(off, 4);
			break;
		}
		fallthrough;
	case BPF_ALU | BPF_MOV | BPF_X:

@@ -3365,6 +3376,11 @@ bool bpf_jit_supports_subprog_tailcalls(void)
	return true;
}

bool bpf_jit_supports_percpu_insn(void)
{
	return true;
}

void bpf_jit_free(struct bpf_prog *prog)
{
	if (prog->jited) {
...
include/linux/filter.h
@@ -178,6 +178,25 @@ struct ctl_table_header;
	.off = 0, \
	.imm = 0 })

/* Special (internal-only) form of mov, used to resolve per-CPU addrs:
 * dst_reg = src_reg + <percpu_base_off>
 * BPF_ADDR_PERCPU is used as a special insn->off value.
 */
#define BPF_ADDR_PERCPU (-1)

#define BPF_MOV64_PERCPU_REG(DST, SRC) \
	((struct bpf_insn) { \
		.code = BPF_ALU64 | BPF_MOV | BPF_X, \
		.dst_reg = DST, \
		.src_reg = SRC, \
		.off = BPF_ADDR_PERCPU, \
		.imm = 0 })

static inline bool insn_is_mov_percpu_addr(const struct bpf_insn *insn)
{
	return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU;
}

/* Short form of mov, dst_reg = imm32 */
#define BPF_MOV64_IMM(DST, IMM) \

@@ -972,6 +991,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
void bpf_jit_compile(struct bpf_prog *prog);
bool bpf_jit_needs_zext(void);
bool bpf_jit_supports_subprog_tailcalls(void);
bool bpf_jit_supports_percpu_insn(void);
bool bpf_jit_supports_kfunc_call(void);
bool bpf_jit_supports_far_kfunc_call(void);
bool bpf_jit_supports_exceptions(void);
...
kernel/bpf/arraymap.c
@@ -246,6 +246,38 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
	return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}

/* emit BPF instructions equivalent to C code of percpu_array_map_lookup_elem() */
static int percpu_array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_insn *insn = insn_buf;

	if (!bpf_jit_supports_percpu_insn())
		return -EOPNOTSUPP;

	if (map->map_flags & BPF_F_INNER_MAP)
		return -EOPNOTSUPP;

	BUILD_BUG_ON(offsetof(struct bpf_array, map) != 0);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, offsetof(struct bpf_array, pptrs));
	*insn++ = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 0);
	if (!map->bypass_spec_v1) {
		*insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 6);
		*insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_0, array->index_mask);
	} else {
		*insn++ = BPF_JMP_IMM(BPF_JGE, BPF_REG_0, map->max_entries, 5);
	}
	*insn++ = BPF_ALU64_IMM(BPF_LSH, BPF_REG_0, 3);
	*insn++ = BPF_ALU64_REG(BPF_ADD, BPF_REG_0, BPF_REG_1);
	*insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
	*insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
	*insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1);
	*insn++ = BPF_MOV64_IMM(BPF_REG_0, 0);

	return insn - insn_buf;
}

static void *percpu_array_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

@@ -776,6 +808,7 @@ const struct bpf_map_ops percpu_array_map_ops = {
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = percpu_array_map_lookup_elem,
	.map_gen_lookup = percpu_array_map_gen_lookup,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
	.map_lookup_percpu_elem = percpu_array_map_lookup_percpu_elem,
...
kernel/bpf/core.c
@@ -2945,6 +2945,11 @@ bool __weak bpf_jit_supports_subprog_tailcalls(void)
	return false;
}

bool __weak bpf_jit_supports_percpu_insn(void)
{
	return false;
}

bool __weak bpf_jit_supports_kfunc_call(void)
{
	return false;
...
kernel/bpf/disasm.c
@@ -172,6 +172,17 @@ static bool is_addr_space_cast(const struct bpf_insn *insn)
	       insn->off == BPF_ADDR_SPACE_CAST;
}

/* Special (internal-only) form of mov, used to resolve per-CPU addrs:
 * dst_reg = src_reg + <percpu_base_off>
 * BPF_ADDR_PERCPU is used as a special insn->off value.
 */
#define BPF_ADDR_PERCPU (-1)

static inline bool is_mov_percpu_addr(const struct bpf_insn *insn)
{
	return insn->code == (BPF_ALU64 | BPF_MOV | BPF_X) && insn->off == BPF_ADDR_PERCPU;
}

void print_bpf_insn(const struct bpf_insn_cbs *cbs,
		    const struct bpf_insn *insn,
		    bool allow_ptr_leaks)

@@ -194,6 +205,9 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs,
		verbose(cbs->private_data, "(%02x) r%d = addr_space_cast(r%d, %d, %d)\n",
			insn->code, insn->dst_reg,
			insn->src_reg, ((u32)insn->imm) >> 16, (u16)insn->imm);
	} else if (is_mov_percpu_addr(insn)) {
		verbose(cbs->private_data, "(%02x) r%d = &(void __percpu *)(r%d)\n",
			insn->code, insn->dst_reg, insn->src_reg);
	} else if (BPF_SRC(insn->code) == BPF_X) {
		verbose(cbs->private_data, "(%02x) %c%d %s %s%c%d\n",
			insn->code, class == BPF_ALU ? 'w' : 'r',
...
kernel/bpf/hashtab.c
@@ -2308,6 +2308,26 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
	return NULL;
}

/* inline bpf_map_lookup_elem() call for per-CPU hashmap */
static int htab_percpu_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
	struct bpf_insn *insn = insn_buf;

	if (!bpf_jit_supports_percpu_insn())
		return -EOPNOTSUPP;

	BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
				  (void *(*)(struct bpf_map *map, void *key))NULL));
	*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
	*insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3);
	*insn++ = BPF_ALU64_IMM(BPF_ADD, BPF_REG_0,
				offsetof(struct htab_elem, key) + map->key_size);
	*insn++ = BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_0, 0);
	*insn++ = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);

	return insn - insn_buf;
}

static void *htab_percpu_map_lookup_percpu_elem(struct bpf_map *map, void *key, u32 cpu)
{
	struct htab_elem *l;

@@ -2436,6 +2456,7 @@ const struct bpf_map_ops htab_percpu_map_ops = {
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_gen_lookup = htab_percpu_map_gen_lookup,
	.map_lookup_and_delete_elem = htab_percpu_map_lookup_and_delete_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
...
kernel/bpf/verifier.c
@@ -20074,6 +20074,30 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
			goto next_insn;
		}

#ifdef CONFIG_X86_64
		/* Implement bpf_get_smp_processor_id() inline. */
		if (insn->imm == BPF_FUNC_get_smp_processor_id &&
		    prog->jit_requested && bpf_jit_supports_percpu_insn()) {
			/* BPF_FUNC_get_smp_processor_id inlining is an
			 * optimization, so if pcpu_hot.cpu_number is ever
			 * changed in some incompatible and hard to support
			 * way, it's fine to back out this inlining logic
			 */
			insn_buf[0] = BPF_MOV32_IMM(BPF_REG_0, (u32)(unsigned long)&pcpu_hot.cpu_number);
			insn_buf[1] = BPF_MOV64_PERCPU_REG(BPF_REG_0, BPF_REG_0);
			insn_buf[2] = BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 0);
			cnt = 3;

			new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
			if (!new_prog)
				return -ENOMEM;

			delta += cnt - 1;
			env->prog = prog = new_prog;
			insn = new_prog->insnsi + i + delta;
			goto next_insn;
		}
#endif
		/* Implement bpf_get_func_arg inline. */
		if (prog_type == BPF_PROG_TYPE_TRACING &&
		    insn->imm == BPF_FUNC_get_func_arg) {
...