Commit 8d275960 authored by Alexei Starovoitov

Merge branch 'bpf: Use bpf_mem_cache_alloc/free in bpf_local_storage'

Martin KaFai Lau says:

====================

From: Martin KaFai Lau <martin.lau@kernel.org>

This set is a continuation of the effort to use
bpf_mem_cache_alloc/free in bpf_local_storage [1].

The major change is using bpf_mem_alloc only for task and cgrp storage,
while sk and inode storage stay with kzalloc/kfree. The details are
in patch 2.

[1]: https://lore.kernel.org/bpf/20230308065936.1550103-1-martin.lau@linux.dev/

v3:
- Only use bpf_mem_alloc for task and cgrp storage.
- sk and inode storage stay with kzalloc/kfree.
- Check NULL and add comments in bpf_mem_cache_raw_free() in patch 1.
- Added test and benchmark for task storage.

v2:
- Added bpf_mem_cache_alloc_flags() and bpf_mem_cache_raw_free()
  to hide the internal data structure of the bpf allocator.
- Fixed a typo bug in bpf_selem_free()
- Simplified the test_local_storage test by directly using
  err returned from libbpf
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents e9936076 cbe9d93d
@@ -13,6 +13,7 @@
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/types.h>
#include <linux/bpf_mem_alloc.h>
#include <uapi/linux/btf.h>
#define BPF_LOCAL_STORAGE_CACHE_SIZE 16
@@ -55,6 +56,9 @@ struct bpf_local_storage_map {
u32 bucket_log;
u16 elem_size;
u16 cache_idx;
struct bpf_mem_alloc selem_ma;
struct bpf_mem_alloc storage_ma;
bool bpf_ma;
};
struct bpf_local_storage_data {
@@ -122,7 +126,8 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
struct bpf_local_storage_cache *cache);
struct bpf_local_storage_cache *cache,
bool bpf_ma);
struct bpf_local_storage_data *
bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
@@ -31,5 +31,7 @@ void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
/* kmem_cache_alloc/free equivalent: */
void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
void bpf_mem_cache_raw_free(void *ptr);
void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);
#endif /* _BPF_MEM_ALLOC_H */
@@ -149,7 +149,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
return bpf_local_storage_map_alloc(attr, &cgroup_cache);
return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);
}
static void cgroup_storage_map_free(struct bpf_map *map)
@@ -199,7 +199,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{
return bpf_local_storage_map_alloc(attr, &inode_cache);
return bpf_local_storage_map_alloc(attr, &inode_cache, false);
}
static void inode_storage_map_free(struct bpf_map *map)
(One file's diff is collapsed in the original view and is not shown here.)
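The collapsed diff appears to be the main bpf_local_storage.c change (patch 2 of the series), where the new selem_ma/storage_ma/bpf_ma fields and the helpers above are actually wired up. As a rough sketch of the idea from the cover letter — illustrative only, not the patch 2 code — selem allocation presumably branches on the per-map bpf_ma flag along these lines:

	/* illustrative sketch; the real logic is in the collapsed diff */
	if (smap->bpf_ma)
		selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
	else
		selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
					gfp_flags | __GFP_NOWARN);

Task and cgrp storage maps are created with bpf_ma == true (see the map_alloc calls above and below), so they take the bpf_mem_alloc path; sk and inode storage maps pass false and keep the kzalloc/kfree path.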
@@ -309,7 +309,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
return bpf_local_storage_map_alloc(attr, &task_cache);
return bpf_local_storage_map_alloc(attr, &task_cache, true);
}
static void task_storage_map_free(struct bpf_map *map)
@@ -121,15 +121,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head)
return entry;
}
static void *__alloc(struct bpf_mem_cache *c, int node)
static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{
/* Allocate, but don't deplete atomic reserves that typical
* GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
* will allocate from the current numa node which is what we
* want here.
*/
gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
if (c->percpu_size) {
void **obj = kmalloc_node(c->percpu_size, flags, node);
void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
@@ -185,7 +178,12 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
*/
obj = __llist_del_first(&c->free_by_rcu);
if (!obj) {
obj = __alloc(c, node);
/* Allocate, but don't deplete atomic reserves that typical
* GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
* will allocate from the current numa node which is what we
* want here.
*/
obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT);
if (!obj)
break;
}
@@ -676,3 +674,46 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
unit_free(this_cpu_ptr(ma->cache), ptr);
}
/* Directly does a kfree() without putting 'ptr' back to the free_llist
* for reuse and without waiting for a rcu_tasks_trace gp.
* The caller must first go through the rcu_tasks_trace gp for 'ptr'
* before calling bpf_mem_cache_raw_free().
* It could be used when the rcu_tasks_trace callback does not have
* a hold on the original bpf_mem_alloc object that allocated the
* 'ptr'. This should only be used in the uncommon code path.
* Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
* and may affect performance.
*/
void bpf_mem_cache_raw_free(void *ptr)
{
if (!ptr)
return;
kfree(ptr - LLIST_NODE_SZ);
}
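A minimal caller sketch (illustrative; the struct name and callback are not from this commit) of the pattern described in the comment above — freeing from an rcu_tasks_trace callback that no longer holds the originating bpf_mem_alloc:

	/* Hypothetical object with an embedded rcu_head, allocated earlier
	 * from a bpf_mem_cache and queued with
	 * call_rcu_tasks_trace(&obj->rcu, obj_free_rcu_trace).
	 */
	static void obj_free_rcu_trace(struct rcu_head *rcu)
	{
		struct my_obj *obj = container_of(rcu, struct my_obj, rcu);

		/* The rcu_tasks_trace grace period has elapsed, so the raw
		 * kfree() inside bpf_mem_cache_raw_free() is safe here.
		 */
		bpf_mem_cache_raw_free(obj);
	}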
/* When flags == GFP_KERNEL, it signals that the caller will not cause
* deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
* kmalloc if the free_llist is empty.
*/
void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
{
struct bpf_mem_cache *c;
void *ret;
c = this_cpu_ptr(ma->cache);
ret = unit_alloc(c);
if (!ret && flags == GFP_KERNEL) {
struct mem_cgroup *memcg, *old_memcg;
memcg = get_memcg(c);
old_memcg = set_active_memcg(memcg);
ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
}
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
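A hedged usage sketch (variable names are illustrative): a sleepable caller that cannot deadlock on kmalloc — for example a syscall-side update path — passes GFP_KERNEL and may fall back to kmalloc when its per-cpu free_llist is empty; any other flags value keeps the allocation cache-only:

	/* may sleep: falls back to a memcg-accounted kmalloc on a cache miss */
	obj = bpf_mem_cache_alloc_flags(&obj_ma, GFP_KERNEL);

	/* atomic/tracing context: per-cpu cache only, NULL when the cache is empty */
	obj = bpf_mem_cache_alloc_flags(&obj_ma, GFP_ATOMIC);

where obj_ma is a struct bpf_mem_alloc previously set up with bpf_mem_alloc_init().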
@@ -68,7 +68,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
{
return bpf_local_storage_map_alloc(attr, &sk_cache);
return bpf_local_storage_map_alloc(attr, &sk_cache, false);
}
static int notsupp_get_next_key(struct bpf_map *map, void *key,
@@ -278,6 +278,7 @@ extern struct argp bench_local_storage_argp;
extern struct argp bench_local_storage_rcu_tasks_trace_argp;
extern struct argp bench_strncmp_argp;
extern struct argp bench_hashmap_lookup_argp;
extern struct argp bench_local_storage_create_argp;
static const struct argp_child bench_parsers[] = {
{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
@@ -288,6 +289,7 @@ static const struct argp_child bench_parsers[] = {
{ &bench_local_storage_rcu_tasks_trace_argp, 0,
"local_storage RCU Tasks Trace slowdown benchmark", 0 },
{ &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 },
{ &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 },
{},
};
@@ -3,19 +3,71 @@
#include <sys/types.h>
#include <sys/socket.h>
#include <pthread.h>
#include <argp.h>
#include "bench.h"
#include "bench_local_storage_create.skel.h"
#define BATCH_SZ 32
struct thread {
int fds[BATCH_SZ];
int *fds;
pthread_t *pthds;
int *pthd_results;
};
static struct bench_local_storage_create *skel;
static struct thread *threads;
static long socket_errs;
static long create_owner_errs;
static int storage_type = BPF_MAP_TYPE_SK_STORAGE;
static int batch_sz = 32;
enum {
ARG_BATCH_SZ = 9000,
ARG_STORAGE_TYPE = 9001,
};
static const struct argp_option opts[] = {
{ "batch-size", ARG_BATCH_SZ, "BATCH_SIZE", 0,
"The number of storage creations in each batch" },
{ "storage-type", ARG_STORAGE_TYPE, "STORAGE_TYPE", 0,
"The type of local storage to test (socket or task)" },
{},
};
static error_t parse_arg(int key, char *arg, struct argp_state *state)
{
int ret;
switch (key) {
case ARG_BATCH_SZ:
ret = atoi(arg);
if (ret < 1) {
fprintf(stderr, "invalid batch-size\n");
argp_usage(state);
}
batch_sz = ret;
break;
case ARG_STORAGE_TYPE:
if (!strcmp(arg, "task")) {
storage_type = BPF_MAP_TYPE_TASK_STORAGE;
} else if (!strcmp(arg, "socket")) {
storage_type = BPF_MAP_TYPE_SK_STORAGE;
} else {
fprintf(stderr, "invalid storage-type (socket or task)\n");
argp_usage(state);
}
break;
default:
return ARGP_ERR_UNKNOWN;
}
return 0;
}
const struct argp bench_local_storage_create_argp = {
.options = opts,
.parser = parse_arg,
};
static void validate(void)
{
@@ -28,6 +80,8 @@ static void validate(void)
static void setup(void)
{
int i;
skel = bench_local_storage_create__open_and_load();
if (!skel) {
fprintf(stderr, "error loading skel\n");
@@ -35,10 +89,16 @@ static void setup(void)
}
skel->bss->bench_pid = getpid();
if (!bpf_program__attach(skel->progs.socket_post_create)) {
fprintf(stderr, "Error attaching bpf program\n");
exit(1);
if (storage_type == BPF_MAP_TYPE_SK_STORAGE) {
if (!bpf_program__attach(skel->progs.socket_post_create)) {
fprintf(stderr, "Error attaching bpf program\n");
exit(1);
}
} else {
if (!bpf_program__attach(skel->progs.fork)) {
fprintf(stderr, "Error attaching bpf program\n");
exit(1);
}
}
if (!bpf_program__attach(skel->progs.kmalloc)) {
@@ -52,6 +112,29 @@ static void setup(void)
fprintf(stderr, "cannot alloc thread_res\n");
exit(1);
}
for (i = 0; i < env.producer_cnt; i++) {
struct thread *t = &threads[i];
if (storage_type == BPF_MAP_TYPE_SK_STORAGE) {
t->fds = malloc(batch_sz * sizeof(*t->fds));
if (!t->fds) {
fprintf(stderr, "cannot alloc t->fds\n");
exit(1);
}
} else {
t->pthds = malloc(batch_sz * sizeof(*t->pthds));
if (!t->pthds) {
fprintf(stderr, "cannot alloc t->pthds\n");
exit(1);
}
t->pthd_results = malloc(batch_sz * sizeof(*t->pthd_results));
if (!t->pthd_results) {
fprintf(stderr, "cannot alloc t->pthd_results\n");
exit(1);
}
}
}
}
static void measure(struct bench_res *res)
@@ -65,20 +148,20 @@ static void *consumer(void *input)
return NULL;
}
static void *producer(void *input)
static void *sk_producer(void *input)
{
struct thread *t = &threads[(long)(input)];
int *fds = t->fds;
int i;
while (true) {
for (i = 0; i < BATCH_SZ; i++) {
for (i = 0; i < batch_sz; i++) {
fds[i] = socket(AF_INET6, SOCK_DGRAM, 0);
if (fds[i] == -1)
atomic_inc(&socket_errs);
atomic_inc(&create_owner_errs);
}
for (i = 0; i < BATCH_SZ; i++) {
for (i = 0; i < batch_sz; i++) {
if (fds[i] != -1)
close(fds[i]);
}
@@ -87,6 +170,42 @@ static void *producer(void *input)
return NULL;
}
static void *thread_func(void *arg)
{
return NULL;
}
static void *task_producer(void *input)
{
struct thread *t = &threads[(long)(input)];
pthread_t *pthds = t->pthds;
int *pthd_results = t->pthd_results;
int i;
while (true) {
for (i = 0; i < batch_sz; i++) {
pthd_results[i] = pthread_create(&pthds[i], NULL, thread_func, NULL);
if (pthd_results[i])
atomic_inc(&create_owner_errs);
}
for (i = 0; i < batch_sz; i++) {
if (!pthd_results[i])
pthread_join(pthds[i], NULL);
}
}
return NULL;
}
static void *producer(void *input)
{
if (storage_type == BPF_MAP_TYPE_SK_STORAGE)
return sk_producer(input);
else
return task_producer(input);
}
static void report_progress(int iter, struct bench_res *res, long delta_ns)
{
double creates_per_sec, kmallocs_per_create;
@@ -123,14 +242,18 @@ static void report_final(struct bench_res res[], int res_cnt)
printf("Summary: creates %8.3lf \u00B1 %5.3lfk/s (%7.3lfk/prod), ",
creates_mean, creates_stddev, creates_mean / env.producer_cnt);
printf("%4.2lf kmallocs/create\n", (double)total_kmallocs / total_creates);
if (socket_errs || skel->bss->create_errs)
printf("socket() errors %ld create_errs %ld\n", socket_errs,
if (create_owner_errs || skel->bss->create_errs)
printf("%s() errors %ld create_errs %ld\n",
storage_type == BPF_MAP_TYPE_SK_STORAGE ?
"socket" : "pthread_create",
create_owner_errs,
skel->bss->create_errs);
}
/* Benchmark performance of creating bpf local storage */
const struct bench bench_local_storage_create = {
.name = "local-storage-create",
.argp = &bench_local_storage_create_argp,
.validate = validate,
.setup = setup,
.producer_thread = producer,
@@ -23,7 +23,7 @@ struct storage {
/* Fork and exec the provided rm binary and return the exit code of the
* forked process and its pid.
*/
static int run_self_unlink(int *monitored_pid, const char *rm_path)
static int run_self_unlink(struct local_storage *skel, const char *rm_path)
{
int child_pid, child_status, ret;
int null_fd;
@@ -35,7 +35,7 @@ static int run_self_unlink(int *monitored_pid, const char *rm_path)
dup2(null_fd, STDERR_FILENO);
close(null_fd);
*monitored_pid = getpid();
skel->bss->monitored_pid = getpid();
/* Use the copied /usr/bin/rm to delete itself
* /tmp/copy_of_rm /tmp/copy_of_rm.
*/
@@ -44,6 +44,7 @@ static int run_self_unlink(int *monitored_pid, const char *rm_path)
exit(errno);
} else if (child_pid > 0) {
waitpid(child_pid, &child_status, 0);
ASSERT_EQ(skel->data->task_storage_result, 0, "task_storage_result");
return WEXITSTATUS(child_status);
}
@@ -133,7 +134,7 @@ void test_test_local_storage(void)
* unlink its executable. This operation should be denied by the loaded
* LSM program.
*/
err = run_self_unlink(&skel->bss->monitored_pid, tmp_exec_path);
err = run_self_unlink(skel, tmp_exec_path);
if (!ASSERT_EQ(err, EPERM, "run_self_unlink"))
goto close_prog_rmdir;
@@ -22,6 +22,13 @@ struct {
__type(value, struct storage);
} sk_storage_map SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct storage);
} task_storage_map SEC(".maps");
SEC("raw_tp/kmalloc")
int BPF_PROG(kmalloc, unsigned long call_site, const void *ptr,
size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags,
@@ -32,6 +39,24 @@ int BPF_PROG(kmalloc, unsigned long call_site, const void *ptr,
return 0;
}
SEC("tp_btf/sched_process_fork")
int BPF_PROG(fork, struct task_struct *parent, struct task_struct *child)
{
struct storage *stg;
if (parent->tgid != bench_pid)
return 0;
stg = bpf_task_storage_get(&task_storage_map, child, NULL,
BPF_LOCAL_STORAGE_GET_F_CREATE);
if (stg)
__sync_fetch_and_add(&create_cnts, 1);
else
__sync_fetch_and_add(&create_errs, 1);
return 0;
}
SEC("lsm.s/socket_post_create")
int BPF_PROG(socket_post_create, struct socket *sock, int family, int type,
int protocol, int kern)
@@ -16,6 +16,7 @@ char _license[] SEC("license") = "GPL";
int monitored_pid = 0;
int inode_storage_result = -1;
int sk_storage_result = -1;
int task_storage_result = -1;
struct local_storage {
struct inode *exec_inode;
@@ -50,26 +51,57 @@ struct {
__type(value, struct local_storage);
} task_storage_map SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct local_storage);
} task_storage_map2 SEC(".maps");
SEC("lsm/inode_unlink")
int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim)
{
__u32 pid = bpf_get_current_pid_tgid() >> 32;
struct bpf_local_storage *local_storage;
struct local_storage *storage;
struct task_struct *task;
bool is_self_unlink;
if (pid != monitored_pid)
return 0;
storage = bpf_task_storage_get(&task_storage_map,
bpf_get_current_task_btf(), 0, 0);
if (storage) {
/* Don't let an executable delete itself */
is_self_unlink = storage->exec_inode == victim->d_inode;
if (is_self_unlink)
return -EPERM;
}
task = bpf_get_current_task_btf();
if (!task)
return 0;
return 0;
task_storage_result = -1;
storage = bpf_task_storage_get(&task_storage_map, task, 0, 0);
if (!storage)
return 0;
/* Don't let an executable delete itself */
is_self_unlink = storage->exec_inode == victim->d_inode;
storage = bpf_task_storage_get(&task_storage_map2, task, 0,
BPF_LOCAL_STORAGE_GET_F_CREATE);
if (!storage || storage->value)
return 0;
if (bpf_task_storage_delete(&task_storage_map, task))
return 0;
/* Ensure that the task_storage_map is disconnected from the storage.
* The storage memory should not be freed back to the
* bpf_mem_alloc.
*/
local_storage = task->bpf_storage;
if (!local_storage || local_storage->smap)
return 0;
task_storage_result = 0;
return is_self_unlink ? -EPERM : 0;
}
SEC("lsm.s/inode_rename")
@@ -139,11 +171,7 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address,
if (bpf_sk_storage_delete(&sk_storage_map, sock->sk))
return 0;
/* Ensure that the sk_storage_map is disconnected from the storage.
* The storage memory should not be freed back to the
* bpf_mem_alloc of the sk_bpf_storage_map because
* sk_bpf_storage_map may have been gone.
*/
/* Ensure that the sk_storage_map is disconnected from the storage. */
if (!sock->sk->sk_bpf_storage || sock->sk->sk_bpf_storage->smap)
return 0;