Commit 520fad2e authored by Andrii Nakryiko, committed by Alexei Starovoitov

selftests/bpf: scale benchmark counting by using per-CPU counters

When benchmarking with multiple threads (-pN, where N>1), we start
contending on a single atomic counter that both BPF trigger benchmarks
and the user-space "baseline" tests (trig-base and trig-uprobe-base
benchmarks) use. As a result, we end up bottlenecking on something
completely irrelevant to the benchmark at hand.

Scale counting up by using per-CPU counters on the BPF side. On the
user-space side we do the next best thing: hash the thread ID to
approximate per-CPU behavior. It seems to work quite well in practice.
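
To illustrate the user-space side of this scheme, here is a standalone
sketch, not the patch itself: it mirrors what bench_trigger.c does, but
uses GCC atomic builtins and hypothetical names (N_BUCKETS, count_one,
drain) in place of the bench harness's atomic_inc()/atomic_swap()
helpers. The thread ID is spread across buckets with a multiplicative
(Fibonacci-style) hash, 2654435769 ~= 2^32/phi, keeping only the top
byte, and each bucket is padded to its own cache line so concurrent
increments rarely collide:

  #define _GNU_SOURCE
  #include <stdio.h>
  #include <unistd.h>

  #define N_BUCKETS 256	/* hypothetical name; the patch calls this MAX_BUCKETS */

  /* one counter per bucket, padded to its own cache line to avoid false sharing */
  struct bucket { long value; } __attribute__((aligned(128)));

  static struct bucket hits[N_BUCKETS];

  static void count_one(void)
  {
  	/* hash the TID once per thread; the top byte picks a bucket */
  	static __thread unsigned slot = -1U;

  	if (slot == -1U)
  		slot = (2654435769U * (unsigned)gettid()) >> 24; /* gettid() needs glibc >= 2.30 */
  	__atomic_fetch_add(&hits[slot].value, 1, __ATOMIC_RELAXED);
  }

  static long drain(void)
  {
  	long sum = 0;

  	/* sum and reset all buckets, so they behave like one logical counter */
  	for (int i = 0; i < N_BUCKETS; i++)
  		sum += __atomic_exchange_n(&hits[i].value, 0, __ATOMIC_RELAXED);
  	return sum;
  }

  int main(void)
  {
  	for (int i = 0; i < 1000; i++)
  		count_one();
  	printf("%ld\n", drain());	/* prints 1000 */
  	return 0;
  }

The reader-side swap-to-zero is what keeps the per-bucket counters
equivalent to a single counter for measurement purposes, while the
writers stay (mostly) uncontended.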

To demonstrate the difference, I ran three benchmarks with 1, 2, 4, 8,
16, and 32 threads:
  - trig-uprobe-base (no syscalls, pure tight counting loop in user-space);
  - trig-base (get_pgid() syscall, atomic counter in user-space);
  - trig-fentry (syscall to trigger fentry program, atomic uncontended per-CPU
    counter on BPF side).

Command used:

  for b in uprobe-base base fentry; do \
    for p in 1 2 4 8 16 32; do \
      printf "%-11s %2d: %s\n" $b $p \
        "$(sudo ./bench -w2 -d5 -a -p$p trig-$b | tail -n1 | cut -d'(' -f1 | cut -d' ' -f3-)"; \
    done; \
  done

Before these changes, aggregate throughput across all threads doesn't
scale well with the number of threads; for uprobe-base it actually falls
sharply due to very high contention:

  uprobe-base  1:  138.998 ± 0.650M/s
  uprobe-base  2:   70.526 ± 1.147M/s
  uprobe-base  4:   63.114 ± 0.302M/s
  uprobe-base  8:   54.177 ± 0.138M/s
  uprobe-base 16:   45.439 ± 0.057M/s
  uprobe-base 32:   37.163 ± 0.242M/s
  base         1:   16.940 ± 0.182M/s
  base         2:   19.231 ± 0.105M/s
  base         4:   21.479 ± 0.038M/s
  base         8:   23.030 ± 0.037M/s
  base        16:   22.034 ± 0.004M/s
  base        32:   18.152 ± 0.013M/s
  fentry       1:   14.794 ± 0.054M/s
  fentry       2:   17.341 ± 0.055M/s
  fentry       4:   23.792 ± 0.024M/s
  fentry       8:   21.557 ± 0.047M/s
  fentry      16:   21.121 ± 0.004M/s
  fentry      32:   17.067 ± 0.023M/s

After these changes, we see almost perfect linear scaling, as expected.
The sub-linear scaling when going from 8 to 16 threads is interesting
and consistent on my test machine, but I haven't investigated what is
causing this peculiar slowdown (it shows up across all benchmarks; it
could be due to hyperthreading effects, but I'm not sure).

  uprobe-base  1:  139.980 ± 0.648M/s
  uprobe-base  2:  270.244 ± 0.379M/s
  uprobe-base  4:  532.044 ± 1.519M/s
  uprobe-base  8: 1004.571 ± 3.174M/s
  uprobe-base 16: 1720.098 ± 0.744M/s
  uprobe-base 32: 3506.659 ± 8.549M/s
  base         1:   16.869 ± 0.071M/s
  base         2:   33.007 ± 0.092M/s
  base         4:   64.670 ± 0.203M/s
  base         8:  121.969 ± 0.210M/s
  base        16:  207.832 ± 0.112M/s
  base        32:  424.227 ± 1.477M/s
  fentry       1:   14.777 ± 0.087M/s
  fentry       2:   28.575 ± 0.146M/s
  fentry       4:   56.234 ± 0.176M/s
  fentry       8:  106.095 ± 0.385M/s
  fentry      16:  181.440 ± 0.032M/s
  fentry      32:  369.131 ± 0.693M/s

Signed-off-by: Andrii Nakryiko <andrii@kernel.org>
Message-ID: <20240315213329.1161589-1-andrii@kernel.org>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>

tools/testing/selftests/bpf/benchs/bench_trigger.c:

 // SPDX-License-Identifier: GPL-2.0
 /* Copyright (c) 2020 Facebook */
+#define _GNU_SOURCE
+#include <unistd.h>
 #include "bench.h"
 #include "trigger_bench.skel.h"
 #include "trace_helpers.h"

+/* adjust slot shift in inc_hits() if changing */
+#define MAX_BUCKETS 256
+
 /* BPF triggering benchmarks */
 static struct trigger_ctx {
 	struct trigger_bench *skel;
 } ctx;

-static struct counter base_hits;
+static struct counter base_hits[MAX_BUCKETS];
+
+static __always_inline void inc_counter(struct counter *counters)
+{
+	static __thread int tid = 0;
+	unsigned slot;
+
+	if (unlikely(tid == 0))
+		tid = gettid();
+
+	/* multiplicative hashing, it's fast */
+	slot = 2654435769U * tid;
+	slot >>= 24;
+
+	atomic_inc(&base_hits[slot].value); /* use highest byte as an index */
+}
+
+static long sum_and_reset_counters(struct counter *counters)
+{
+	int i;
+	long sum = 0;
+
+	for (i = 0; i < MAX_BUCKETS; i++)
+		sum += atomic_swap(&counters[i].value, 0);
+	return sum;
+}

 static void trigger_validate(void)
 {
@@ -23,14 +53,14 @@ static void *trigger_base_producer(void *input)
 {
 	while (true) {
 		(void)syscall(__NR_getpgid);
-		atomic_inc(&base_hits.value);
+		inc_counter(base_hits);
 	}
 	return NULL;
 }

 static void trigger_base_measure(struct bench_res *res)
 {
-	res->hits = atomic_swap(&base_hits.value, 0);
+	res->hits = sum_and_reset_counters(base_hits);
 }

 static void *trigger_producer(void *input)
@@ -42,7 +72,7 @@ static void *trigger_producer(void *input)
 static void trigger_measure(struct bench_res *res)
 {
-	res->hits = atomic_swap(&ctx.skel->bss->hits, 0);
+	res->hits = sum_and_reset_counters(ctx.skel->bss->hits);
 }

 static void setup_ctx(void)
@@ -164,7 +194,7 @@ static void *uprobe_base_producer(void *input)
 {
 	while (true) {
 		uprobe_target_nop();
-		atomic_inc(&base_hits.value);
+		inc_counter(base_hits);
 	}
 	return NULL;
 }

tools/testing/selftests/bpf/progs/trigger_bench.c:

@@ -9,12 +9,27 @@

 char _license[] SEC("license") = "GPL";

-long hits = 0;
+#define CPU_MASK 255
+#define MAX_CPUS (CPU_MASK + 1) /* should match MAX_BUCKETS in benchs/bench_trigger.c */
+
+/* matches struct counter in bench.h */
+struct counter {
+	long value;
+} __attribute__((aligned(128)));
+
+struct counter hits[MAX_CPUS];
+
+static __always_inline void inc_counter(void)
+{
+	int cpu = bpf_get_smp_processor_id();
+
+	__sync_add_and_fetch(&hits[cpu & CPU_MASK].value, 1);
+}

 SEC("tp/syscalls/sys_enter_getpgid")
 int bench_trigger_tp(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }
@@ -22,69 +37,69 @@ SEC("raw_tp/sys_enter")
 int BPF_PROG(bench_trigger_raw_tp, struct pt_regs *regs, long id)
 {
 	if (id == __NR_getpgid)
-		__sync_add_and_fetch(&hits, 1);
+		inc_counter();
 	return 0;
 }

 SEC("kprobe/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_kprobe(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }

 SEC("kretprobe/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_kretprobe(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }

 SEC("kprobe.multi/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_kprobe_multi(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }

 SEC("kretprobe.multi/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_kretprobe_multi(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }

 SEC("fentry/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fentry(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }

 SEC("fexit/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fexit(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }

 SEC("fentry.s/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fentry_sleep(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }

 SEC("fmod_ret/" SYS_PREFIX "sys_getpgid")
 int bench_trigger_fmodret(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return -22;
 }

 SEC("uprobe")
 int bench_trigger_uprobe(void *ctx)
 {
-	__sync_add_and_fetch(&hits, 1);
+	inc_counter();
 	return 0;
 }