Commit 8d275960 authored by Alexei Starovoitov's avatar Alexei Starovoitov

Merge branch 'bpf: Use bpf_mem_cache_alloc/free in bpf_local_storage'

Martin KaFai Lau says:

====================

From: Martin KaFai Lau <martin.lau@kernel.org>

This set is a continuation of the effort in using
bpf_mem_cache_alloc/free in bpf_local_storage [1]

Major change is only using bpf_mem_alloc for task and cgrp storage
while sk and inode stay with kzalloc/kfree. The details is
in patch 2.

[1]: https://lore.kernel.org/bpf/20230308065936.1550103-1-martin.lau@linux.dev/

v3:
- Only use bpf_mem_alloc for task and cgrp storage.
- sk and inode storage stay with kzalloc/kfree.
- Check NULL and add comments in bpf_mem_cache_raw_free() in patch 1.
- Added test and benchmark for task storage.

v2:
- Added bpf_mem_cache_alloc_flags() and bpf_mem_cache_raw_free()
  to hide the internal data structure of the bpf allocator.
- Fixed a typo bug in bpf_selem_free()
- Simplified the test_local_storage test by directly using
  err returned from libbpf
====================
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
parents e9936076 cbe9d93d
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <linux/list.h> #include <linux/list.h>
#include <linux/hash.h> #include <linux/hash.h>
#include <linux/types.h> #include <linux/types.h>
#include <linux/bpf_mem_alloc.h>
#include <uapi/linux/btf.h> #include <uapi/linux/btf.h>
#define BPF_LOCAL_STORAGE_CACHE_SIZE 16 #define BPF_LOCAL_STORAGE_CACHE_SIZE 16
...@@ -55,6 +56,9 @@ struct bpf_local_storage_map { ...@@ -55,6 +56,9 @@ struct bpf_local_storage_map {
u32 bucket_log; u32 bucket_log;
u16 elem_size; u16 elem_size;
u16 cache_idx; u16 cache_idx;
struct bpf_mem_alloc selem_ma;
struct bpf_mem_alloc storage_ma;
bool bpf_ma;
}; };
struct bpf_local_storage_data { struct bpf_local_storage_data {
...@@ -122,7 +126,8 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr); ...@@ -122,7 +126,8 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
struct bpf_map * struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr, bpf_local_storage_map_alloc(union bpf_attr *attr,
struct bpf_local_storage_cache *cache); struct bpf_local_storage_cache *cache,
bool bpf_ma);
struct bpf_local_storage_data * struct bpf_local_storage_data *
bpf_local_storage_lookup(struct bpf_local_storage *local_storage, bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
......
...@@ -31,5 +31,7 @@ void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr); ...@@ -31,5 +31,7 @@ void bpf_mem_free(struct bpf_mem_alloc *ma, void *ptr);
/* kmem_cache_alloc/free equivalent: */ /* kmem_cache_alloc/free equivalent: */
void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma); void *bpf_mem_cache_alloc(struct bpf_mem_alloc *ma);
void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr); void bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr);
void bpf_mem_cache_raw_free(void *ptr);
void *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags);
#endif /* _BPF_MEM_ALLOC_H */ #endif /* _BPF_MEM_ALLOC_H */
...@@ -149,7 +149,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) ...@@ -149,7 +149,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{ {
return bpf_local_storage_map_alloc(attr, &cgroup_cache); return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);
} }
static void cgroup_storage_map_free(struct bpf_map *map) static void cgroup_storage_map_free(struct bpf_map *map)
......
...@@ -199,7 +199,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, ...@@ -199,7 +199,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr) static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{ {
return bpf_local_storage_map_alloc(attr, &inode_cache); return bpf_local_storage_map_alloc(attr, &inode_cache, false);
} }
static void inode_storage_map_free(struct bpf_map *map) static void inode_storage_map_free(struct bpf_map *map)
......
...@@ -80,8 +80,24 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, ...@@ -80,8 +80,24 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
if (charge_mem && mem_charge(smap, owner, smap->elem_size)) if (charge_mem && mem_charge(smap, owner, smap->elem_size))
return NULL; return NULL;
if (smap->bpf_ma) {
migrate_disable();
selem = bpf_mem_cache_alloc_flags(&smap->selem_ma, gfp_flags);
migrate_enable();
if (selem)
/* Keep the original bpf_map_kzalloc behavior
* before started using the bpf_mem_cache_alloc.
*
* No need to use zero_map_value. The bpf_selem_free()
* only does bpf_mem_cache_free when there is
* no other bpf prog is using the selem.
*/
memset(SDATA(selem)->data, 0, smap->map.value_size);
} else {
selem = bpf_map_kzalloc(&smap->map, smap->elem_size, selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
gfp_flags | __GFP_NOWARN); gfp_flags | __GFP_NOWARN);
}
if (selem) { if (selem) {
if (value) if (value)
copy_map_value(&smap->map, SDATA(selem)->data, value); copy_map_value(&smap->map, SDATA(selem)->data, value);
...@@ -95,41 +111,104 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, ...@@ -95,41 +111,104 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL; return NULL;
} }
static void bpf_local_storage_free_rcu(struct rcu_head *rcu) /* rcu tasks trace callback for bpf_ma == false */
static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{ {
struct bpf_local_storage *local_storage; struct bpf_local_storage *local_storage;
/* If RCU Tasks Trace grace period implies RCU grace period, do
* kfree(), else do kfree_rcu().
*/
local_storage = container_of(rcu, struct bpf_local_storage, rcu); local_storage = container_of(rcu, struct bpf_local_storage, rcu);
if (rcu_trace_implies_rcu_gp())
kfree(local_storage); kfree(local_storage);
else
kfree_rcu(local_storage, rcu);
}
static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
local_storage = container_of(rcu, struct bpf_local_storage, rcu);
bpf_mem_cache_raw_free(local_storage);
} }
static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu) static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{ {
/* If RCU Tasks Trace grace period implies RCU grace period, do
* kfree(), else do kfree_rcu().
*/
if (rcu_trace_implies_rcu_gp()) if (rcu_trace_implies_rcu_gp())
bpf_local_storage_free_rcu(rcu); bpf_local_storage_free_rcu(rcu);
else else
call_rcu(rcu, bpf_local_storage_free_rcu); call_rcu(rcu, bpf_local_storage_free_rcu);
} }
/* Handle bpf_ma == false */
static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
bool vanilla_rcu)
{
if (vanilla_rcu)
kfree_rcu(local_storage, rcu);
else
call_rcu_tasks_trace(&local_storage->rcu,
__bpf_local_storage_free_trace_rcu);
}
static void bpf_local_storage_free(struct bpf_local_storage *local_storage, static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
bool reuse_now) struct bpf_local_storage_map *smap,
bool bpf_ma, bool reuse_now)
{ {
if (!reuse_now) if (!bpf_ma) {
__bpf_local_storage_free(local_storage, reuse_now);
return;
}
if (!reuse_now) {
call_rcu_tasks_trace(&local_storage->rcu, call_rcu_tasks_trace(&local_storage->rcu,
bpf_local_storage_free_trace_rcu); bpf_local_storage_free_trace_rcu);
else return;
}
if (smap) {
migrate_disable();
bpf_mem_cache_free(&smap->storage_ma, local_storage);
migrate_enable();
} else {
/* smap could be NULL if the selem that triggered
* this 'local_storage' creation had been long gone.
* In this case, directly do call_rcu().
*/
call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu); call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
}
} }
static void bpf_selem_free_rcu(struct rcu_head *rcu) /* rcu tasks trace callback for bpf_ma == false */
static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{ {
struct bpf_local_storage_elem *selem; struct bpf_local_storage_elem *selem;
selem = container_of(rcu, struct bpf_local_storage_elem, rcu); selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
if (rcu_trace_implies_rcu_gp())
kfree(selem); kfree(selem);
else
kfree_rcu(selem, rcu);
}
/* Handle bpf_ma == false */
static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
bool vanilla_rcu)
{
if (vanilla_rcu)
kfree_rcu(selem, rcu);
else
call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
}
static void bpf_selem_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
bpf_mem_cache_raw_free(selem);
} }
static void bpf_selem_free_trace_rcu(struct rcu_head *rcu) static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
...@@ -145,10 +224,23 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem, ...@@ -145,10 +224,23 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
bool reuse_now) bool reuse_now)
{ {
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data); bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
if (!reuse_now)
if (!smap->bpf_ma) {
__bpf_selem_free(selem, reuse_now);
return;
}
if (!reuse_now) {
call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu); call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_trace_rcu);
else } else {
call_rcu(&selem->rcu, bpf_selem_free_rcu); /* Instead of using the vanilla call_rcu(),
* bpf_mem_cache_free will be able to reuse selem
* immediately.
*/
migrate_disable();
bpf_mem_cache_free(&smap->selem_ma, selem);
migrate_enable();
}
} }
/* local_storage->lock must be held and selem->local_storage == local_storage. /* local_storage->lock must be held and selem->local_storage == local_storage.
...@@ -209,11 +301,47 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor ...@@ -209,11 +301,47 @@ static bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_stor
return free_local_storage; return free_local_storage;
} }
static bool check_storage_bpf_ma(struct bpf_local_storage *local_storage,
struct bpf_local_storage_map *storage_smap,
struct bpf_local_storage_elem *selem)
{
struct bpf_local_storage_map *selem_smap;
/* local_storage->smap may be NULL. If it is, get the bpf_ma
* from any selem in the local_storage->list. The bpf_ma of all
* local_storage and selem should have the same value
* for the same map type.
*
* If the local_storage->list is already empty, the caller will not
* care about the bpf_ma value also because the caller is not
* responsibile to free the local_storage.
*/
if (storage_smap)
return storage_smap->bpf_ma;
if (!selem) {
struct hlist_node *n;
n = rcu_dereference_check(hlist_first_rcu(&local_storage->list),
bpf_rcu_lock_held());
if (!n)
return false;
selem = hlist_entry(n, struct bpf_local_storage_elem, snode);
}
selem_smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
return selem_smap->bpf_ma;
}
static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
bool reuse_now) bool reuse_now)
{ {
struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage *local_storage; struct bpf_local_storage *local_storage;
bool free_local_storage = false; bool bpf_ma, free_local_storage = false;
unsigned long flags; unsigned long flags;
if (unlikely(!selem_linked_to_storage_lockless(selem))) if (unlikely(!selem_linked_to_storage_lockless(selem)))
...@@ -222,6 +350,10 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, ...@@ -222,6 +350,10 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
local_storage = rcu_dereference_check(selem->local_storage, local_storage = rcu_dereference_check(selem->local_storage,
bpf_rcu_lock_held()); bpf_rcu_lock_held());
storage_smap = rcu_dereference_check(local_storage->smap,
bpf_rcu_lock_held());
bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, selem);
raw_spin_lock_irqsave(&local_storage->lock, flags); raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem))) if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock( free_local_storage = bpf_selem_unlink_storage_nolock(
...@@ -229,7 +361,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem, ...@@ -229,7 +361,7 @@ static void bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem,
raw_spin_unlock_irqrestore(&local_storage->lock, flags); raw_spin_unlock_irqrestore(&local_storage->lock, flags);
if (free_local_storage) if (free_local_storage)
bpf_local_storage_free(local_storage, reuse_now); bpf_local_storage_free(local_storage, storage_smap, bpf_ma, reuse_now);
} }
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage, void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
...@@ -349,8 +481,15 @@ int bpf_local_storage_alloc(void *owner, ...@@ -349,8 +481,15 @@ int bpf_local_storage_alloc(void *owner,
if (err) if (err)
return err; return err;
if (smap->bpf_ma) {
migrate_disable();
storage = bpf_mem_cache_alloc_flags(&smap->storage_ma, gfp_flags);
migrate_enable();
} else {
storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
gfp_flags | __GFP_NOWARN); gfp_flags | __GFP_NOWARN);
}
if (!storage) { if (!storage) {
err = -ENOMEM; err = -ENOMEM;
goto uncharge; goto uncharge;
...@@ -396,7 +535,7 @@ int bpf_local_storage_alloc(void *owner, ...@@ -396,7 +535,7 @@ int bpf_local_storage_alloc(void *owner,
return 0; return 0;
uncharge: uncharge:
bpf_local_storage_free(storage, true); bpf_local_storage_free(storage, smap, smap->bpf_ma, true);
mem_uncharge(smap, owner, sizeof(*storage)); mem_uncharge(smap, owner, sizeof(*storage));
return err; return err;
} }
...@@ -609,11 +748,15 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map, ...@@ -609,11 +748,15 @@ int bpf_local_storage_map_check_btf(const struct bpf_map *map,
void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
{ {
struct bpf_local_storage_map *storage_smap;
struct bpf_local_storage_elem *selem; struct bpf_local_storage_elem *selem;
bool free_storage = false; bool bpf_ma, free_storage = false;
struct hlist_node *n; struct hlist_node *n;
unsigned long flags; unsigned long flags;
storage_smap = rcu_dereference_check(local_storage->smap, bpf_rcu_lock_held());
bpf_ma = check_storage_bpf_ma(local_storage, storage_smap, NULL);
/* Neither the bpf_prog nor the bpf_map's syscall /* Neither the bpf_prog nor the bpf_map's syscall
* could be modifying the local_storage->list now. * could be modifying the local_storage->list now.
* Thus, no elem can be added to or deleted from the * Thus, no elem can be added to or deleted from the
...@@ -641,7 +784,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage) ...@@ -641,7 +784,7 @@ void bpf_local_storage_destroy(struct bpf_local_storage *local_storage)
raw_spin_unlock_irqrestore(&local_storage->lock, flags); raw_spin_unlock_irqrestore(&local_storage->lock, flags);
if (free_storage) if (free_storage)
bpf_local_storage_free(local_storage, true); bpf_local_storage_free(local_storage, storage_smap, bpf_ma, true);
} }
u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
...@@ -654,13 +797,25 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map) ...@@ -654,13 +797,25 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
return usage; return usage;
} }
/* When bpf_ma == true, the bpf_mem_alloc is used to allocate and free memory.
* A deadlock free allocator is useful for storage that the bpf prog can easily
* get a hold of the owner PTR_TO_BTF_ID in any context. eg. bpf_get_current_task_btf.
* The task and cgroup storage fall into this case. The bpf_mem_alloc reuses
* memory immediately. To be reuse-immediate safe, the owner destruction
* code path needs to go through a rcu grace period before calling
* bpf_local_storage_destroy().
*
* When bpf_ma == false, the kmalloc and kfree are used.
*/
struct bpf_map * struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr, bpf_local_storage_map_alloc(union bpf_attr *attr,
struct bpf_local_storage_cache *cache) struct bpf_local_storage_cache *cache,
bool bpf_ma)
{ {
struct bpf_local_storage_map *smap; struct bpf_local_storage_map *smap;
unsigned int i; unsigned int i;
u32 nbuckets; u32 nbuckets;
int err;
smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE); smap = bpf_map_area_alloc(sizeof(*smap), NUMA_NO_NODE);
if (!smap) if (!smap)
...@@ -675,8 +830,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, ...@@ -675,8 +830,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets),
nbuckets, GFP_USER | __GFP_NOWARN); nbuckets, GFP_USER | __GFP_NOWARN);
if (!smap->buckets) { if (!smap->buckets) {
bpf_map_area_free(smap); err = -ENOMEM;
return ERR_PTR(-ENOMEM); goto free_smap;
} }
for (i = 0; i < nbuckets; i++) { for (i = 0; i < nbuckets; i++) {
...@@ -687,8 +842,26 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, ...@@ -687,8 +842,26 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
smap->elem_size = offsetof(struct bpf_local_storage_elem, smap->elem_size = offsetof(struct bpf_local_storage_elem,
sdata.data[attr->value_size]); sdata.data[attr->value_size]);
smap->bpf_ma = bpf_ma;
if (bpf_ma) {
err = bpf_mem_alloc_init(&smap->selem_ma, smap->elem_size, false);
if (err)
goto free_smap;
err = bpf_mem_alloc_init(&smap->storage_ma, sizeof(struct bpf_local_storage), false);
if (err) {
bpf_mem_alloc_destroy(&smap->selem_ma);
goto free_smap;
}
}
smap->cache_idx = bpf_local_storage_cache_idx_get(cache); smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
return &smap->map; return &smap->map;
free_smap:
kvfree(smap->buckets);
bpf_map_area_free(smap);
return ERR_PTR(err);
} }
void bpf_local_storage_map_free(struct bpf_map *map, void bpf_local_storage_map_free(struct bpf_map *map,
...@@ -754,6 +927,10 @@ void bpf_local_storage_map_free(struct bpf_map *map, ...@@ -754,6 +927,10 @@ void bpf_local_storage_map_free(struct bpf_map *map,
*/ */
synchronize_rcu(); synchronize_rcu();
if (smap->bpf_ma) {
bpf_mem_alloc_destroy(&smap->selem_ma);
bpf_mem_alloc_destroy(&smap->storage_ma);
}
kvfree(smap->buckets); kvfree(smap->buckets);
bpf_map_area_free(smap); bpf_map_area_free(smap);
} }
...@@ -309,7 +309,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) ...@@ -309,7 +309,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{ {
return bpf_local_storage_map_alloc(attr, &task_cache); return bpf_local_storage_map_alloc(attr, &task_cache, true);
} }
static void task_storage_map_free(struct bpf_map *map) static void task_storage_map_free(struct bpf_map *map)
......
...@@ -121,15 +121,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head) ...@@ -121,15 +121,8 @@ static struct llist_node notrace *__llist_del_first(struct llist_head *head)
return entry; return entry;
} }
static void *__alloc(struct bpf_mem_cache *c, int node) static void *__alloc(struct bpf_mem_cache *c, int node, gfp_t flags)
{ {
/* Allocate, but don't deplete atomic reserves that typical
* GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
* will allocate from the current numa node which is what we
* want here.
*/
gfp_t flags = GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT;
if (c->percpu_size) { if (c->percpu_size) {
void **obj = kmalloc_node(c->percpu_size, flags, node); void **obj = kmalloc_node(c->percpu_size, flags, node);
void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags); void *pptr = __alloc_percpu_gfp(c->unit_size, 8, flags);
...@@ -185,7 +178,12 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node) ...@@ -185,7 +178,12 @@ static void alloc_bulk(struct bpf_mem_cache *c, int cnt, int node)
*/ */
obj = __llist_del_first(&c->free_by_rcu); obj = __llist_del_first(&c->free_by_rcu);
if (!obj) { if (!obj) {
obj = __alloc(c, node); /* Allocate, but don't deplete atomic reserves that typical
* GFP_ATOMIC would do. irq_work runs on this cpu and kmalloc
* will allocate from the current numa node which is what we
* want here.
*/
obj = __alloc(c, node, GFP_NOWAIT | __GFP_NOWARN | __GFP_ACCOUNT);
if (!obj) if (!obj)
break; break;
} }
...@@ -676,3 +674,46 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr) ...@@ -676,3 +674,46 @@ void notrace bpf_mem_cache_free(struct bpf_mem_alloc *ma, void *ptr)
unit_free(this_cpu_ptr(ma->cache), ptr); unit_free(this_cpu_ptr(ma->cache), ptr);
} }
/* Directly does a kfree() without putting 'ptr' back to the free_llist
* for reuse and without waiting for a rcu_tasks_trace gp.
* The caller must first go through the rcu_tasks_trace gp for 'ptr'
* before calling bpf_mem_cache_raw_free().
* It could be used when the rcu_tasks_trace callback does not have
* a hold on the original bpf_mem_alloc object that allocated the
* 'ptr'. This should only be used in the uncommon code path.
* Otherwise, the bpf_mem_alloc's free_llist cannot be refilled
* and may affect performance.
*/
void bpf_mem_cache_raw_free(void *ptr)
{
if (!ptr)
return;
kfree(ptr - LLIST_NODE_SZ);
}
/* When flags == GFP_KERNEL, it signals that the caller will not cause
* deadlock when using kmalloc. bpf_mem_cache_alloc_flags() will use
* kmalloc if the free_llist is empty.
*/
void notrace *bpf_mem_cache_alloc_flags(struct bpf_mem_alloc *ma, gfp_t flags)
{
struct bpf_mem_cache *c;
void *ret;
c = this_cpu_ptr(ma->cache);
ret = unit_alloc(c);
if (!ret && flags == GFP_KERNEL) {
struct mem_cgroup *memcg, *old_memcg;
memcg = get_memcg(c);
old_memcg = set_active_memcg(memcg);
ret = __alloc(c, NUMA_NO_NODE, GFP_KERNEL | __GFP_NOWARN | __GFP_ACCOUNT);
set_active_memcg(old_memcg);
mem_cgroup_put(memcg);
}
return !ret ? NULL : ret + LLIST_NODE_SZ;
}
...@@ -68,7 +68,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) ...@@ -68,7 +68,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
{ {
return bpf_local_storage_map_alloc(attr, &sk_cache); return bpf_local_storage_map_alloc(attr, &sk_cache, false);
} }
static int notsupp_get_next_key(struct bpf_map *map, void *key, static int notsupp_get_next_key(struct bpf_map *map, void *key,
......
...@@ -278,6 +278,7 @@ extern struct argp bench_local_storage_argp; ...@@ -278,6 +278,7 @@ extern struct argp bench_local_storage_argp;
extern struct argp bench_local_storage_rcu_tasks_trace_argp; extern struct argp bench_local_storage_rcu_tasks_trace_argp;
extern struct argp bench_strncmp_argp; extern struct argp bench_strncmp_argp;
extern struct argp bench_hashmap_lookup_argp; extern struct argp bench_hashmap_lookup_argp;
extern struct argp bench_local_storage_create_argp;
static const struct argp_child bench_parsers[] = { static const struct argp_child bench_parsers[] = {
{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 }, { &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
...@@ -288,6 +289,7 @@ static const struct argp_child bench_parsers[] = { ...@@ -288,6 +289,7 @@ static const struct argp_child bench_parsers[] = {
{ &bench_local_storage_rcu_tasks_trace_argp, 0, { &bench_local_storage_rcu_tasks_trace_argp, 0,
"local_storage RCU Tasks Trace slowdown benchmark", 0 }, "local_storage RCU Tasks Trace slowdown benchmark", 0 },
{ &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 }, { &bench_hashmap_lookup_argp, 0, "Hashmap lookup benchmark", 0 },
{ &bench_local_storage_create_argp, 0, "local-storage-create benchmark", 0 },
{}, {},
}; };
......
...@@ -3,19 +3,71 @@ ...@@ -3,19 +3,71 @@
#include <sys/types.h> #include <sys/types.h>
#include <sys/socket.h> #include <sys/socket.h>
#include <pthread.h>
#include <argp.h>
#include "bench.h" #include "bench.h"
#include "bench_local_storage_create.skel.h" #include "bench_local_storage_create.skel.h"
#define BATCH_SZ 32
struct thread { struct thread {
int fds[BATCH_SZ]; int *fds;
pthread_t *pthds;
int *pthd_results;
}; };
static struct bench_local_storage_create *skel; static struct bench_local_storage_create *skel;
static struct thread *threads; static struct thread *threads;
static long socket_errs; static long create_owner_errs;
static int storage_type = BPF_MAP_TYPE_SK_STORAGE;
static int batch_sz = 32;
enum {
ARG_BATCH_SZ = 9000,
ARG_STORAGE_TYPE = 9001,
};
static const struct argp_option opts[] = {
{ "batch-size", ARG_BATCH_SZ, "BATCH_SIZE", 0,
"The number of storage creations in each batch" },
{ "storage-type", ARG_STORAGE_TYPE, "STORAGE_TYPE", 0,
"The type of local storage to test (socket or task)" },
{},
};
static error_t parse_arg(int key, char *arg, struct argp_state *state)
{
int ret;
switch (key) {
case ARG_BATCH_SZ:
ret = atoi(arg);
if (ret < 1) {
fprintf(stderr, "invalid batch-size\n");
argp_usage(state);
}
batch_sz = ret;
break;
case ARG_STORAGE_TYPE:
if (!strcmp(arg, "task")) {
storage_type = BPF_MAP_TYPE_TASK_STORAGE;
} else if (!strcmp(arg, "socket")) {
storage_type = BPF_MAP_TYPE_SK_STORAGE;
} else {
fprintf(stderr, "invalid storage-type (socket or task)\n");
argp_usage(state);
}
break;
default:
return ARGP_ERR_UNKNOWN;
}
return 0;
}
const struct argp bench_local_storage_create_argp = {
.options = opts,
.parser = parse_arg,
};
static void validate(void) static void validate(void)
{ {
...@@ -28,6 +80,8 @@ static void validate(void) ...@@ -28,6 +80,8 @@ static void validate(void)
static void setup(void) static void setup(void)
{ {
int i;
skel = bench_local_storage_create__open_and_load(); skel = bench_local_storage_create__open_and_load();
if (!skel) { if (!skel) {
fprintf(stderr, "error loading skel\n"); fprintf(stderr, "error loading skel\n");
...@@ -35,11 +89,17 @@ static void setup(void) ...@@ -35,11 +89,17 @@ static void setup(void)
} }
skel->bss->bench_pid = getpid(); skel->bss->bench_pid = getpid();
if (storage_type == BPF_MAP_TYPE_SK_STORAGE) {
if (!bpf_program__attach(skel->progs.socket_post_create)) { if (!bpf_program__attach(skel->progs.socket_post_create)) {
fprintf(stderr, "Error attaching bpf program\n"); fprintf(stderr, "Error attaching bpf program\n");
exit(1); exit(1);
} }
} else {
if (!bpf_program__attach(skel->progs.fork)) {
fprintf(stderr, "Error attaching bpf program\n");
exit(1);
}
}
if (!bpf_program__attach(skel->progs.kmalloc)) { if (!bpf_program__attach(skel->progs.kmalloc)) {
fprintf(stderr, "Error attaching bpf program\n"); fprintf(stderr, "Error attaching bpf program\n");
...@@ -52,6 +112,29 @@ static void setup(void) ...@@ -52,6 +112,29 @@ static void setup(void)
fprintf(stderr, "cannot alloc thread_res\n"); fprintf(stderr, "cannot alloc thread_res\n");
exit(1); exit(1);
} }
for (i = 0; i < env.producer_cnt; i++) {
struct thread *t = &threads[i];
if (storage_type == BPF_MAP_TYPE_SK_STORAGE) {
t->fds = malloc(batch_sz * sizeof(*t->fds));
if (!t->fds) {
fprintf(stderr, "cannot alloc t->fds\n");
exit(1);
}
} else {
t->pthds = malloc(batch_sz * sizeof(*t->pthds));
if (!t->pthds) {
fprintf(stderr, "cannot alloc t->pthds\n");
exit(1);
}
t->pthd_results = malloc(batch_sz * sizeof(*t->pthd_results));
if (!t->pthd_results) {
fprintf(stderr, "cannot alloc t->pthd_results\n");
exit(1);
}
}
}
} }
static void measure(struct bench_res *res) static void measure(struct bench_res *res)
...@@ -65,20 +148,20 @@ static void *consumer(void *input) ...@@ -65,20 +148,20 @@ static void *consumer(void *input)
return NULL; return NULL;
} }
static void *producer(void *input) static void *sk_producer(void *input)
{ {
struct thread *t = &threads[(long)(input)]; struct thread *t = &threads[(long)(input)];
int *fds = t->fds; int *fds = t->fds;
int i; int i;
while (true) { while (true) {
for (i = 0; i < BATCH_SZ; i++) { for (i = 0; i < batch_sz; i++) {
fds[i] = socket(AF_INET6, SOCK_DGRAM, 0); fds[i] = socket(AF_INET6, SOCK_DGRAM, 0);
if (fds[i] == -1) if (fds[i] == -1)
atomic_inc(&socket_errs); atomic_inc(&create_owner_errs);
} }
for (i = 0; i < BATCH_SZ; i++) { for (i = 0; i < batch_sz; i++) {
if (fds[i] != -1) if (fds[i] != -1)
close(fds[i]); close(fds[i]);
} }
...@@ -87,6 +170,42 @@ static void *producer(void *input) ...@@ -87,6 +170,42 @@ static void *producer(void *input)
return NULL; return NULL;
} }
static void *thread_func(void *arg)
{
return NULL;
}
static void *task_producer(void *input)
{
struct thread *t = &threads[(long)(input)];
pthread_t *pthds = t->pthds;
int *pthd_results = t->pthd_results;
int i;
while (true) {
for (i = 0; i < batch_sz; i++) {
pthd_results[i] = pthread_create(&pthds[i], NULL, thread_func, NULL);
if (pthd_results[i])
atomic_inc(&create_owner_errs);
}
for (i = 0; i < batch_sz; i++) {
if (!pthd_results[i])
pthread_join(pthds[i], NULL);;
}
}
return NULL;
}
static void *producer(void *input)
{
if (storage_type == BPF_MAP_TYPE_SK_STORAGE)
return sk_producer(input);
else
return task_producer(input);
}
static void report_progress(int iter, struct bench_res *res, long delta_ns) static void report_progress(int iter, struct bench_res *res, long delta_ns)
{ {
double creates_per_sec, kmallocs_per_create; double creates_per_sec, kmallocs_per_create;
...@@ -123,14 +242,18 @@ static void report_final(struct bench_res res[], int res_cnt) ...@@ -123,14 +242,18 @@ static void report_final(struct bench_res res[], int res_cnt)
printf("Summary: creates %8.3lf \u00B1 %5.3lfk/s (%7.3lfk/prod), ", printf("Summary: creates %8.3lf \u00B1 %5.3lfk/s (%7.3lfk/prod), ",
creates_mean, creates_stddev, creates_mean / env.producer_cnt); creates_mean, creates_stddev, creates_mean / env.producer_cnt);
printf("%4.2lf kmallocs/create\n", (double)total_kmallocs / total_creates); printf("%4.2lf kmallocs/create\n", (double)total_kmallocs / total_creates);
if (socket_errs || skel->bss->create_errs) if (create_owner_errs || skel->bss->create_errs)
printf("socket() errors %ld create_errs %ld\n", socket_errs, printf("%s() errors %ld create_errs %ld\n",
storage_type == BPF_MAP_TYPE_SK_STORAGE ?
"socket" : "pthread_create",
create_owner_errs,
skel->bss->create_errs); skel->bss->create_errs);
} }
/* Benchmark performance of creating bpf local storage */ /* Benchmark performance of creating bpf local storage */
const struct bench bench_local_storage_create = { const struct bench bench_local_storage_create = {
.name = "local-storage-create", .name = "local-storage-create",
.argp = &bench_local_storage_create_argp,
.validate = validate, .validate = validate,
.setup = setup, .setup = setup,
.producer_thread = producer, .producer_thread = producer,
......
...@@ -23,7 +23,7 @@ struct storage { ...@@ -23,7 +23,7 @@ struct storage {
/* Fork and exec the provided rm binary and return the exit code of the /* Fork and exec the provided rm binary and return the exit code of the
* forked process and its pid. * forked process and its pid.
*/ */
static int run_self_unlink(int *monitored_pid, const char *rm_path) static int run_self_unlink(struct local_storage *skel, const char *rm_path)
{ {
int child_pid, child_status, ret; int child_pid, child_status, ret;
int null_fd; int null_fd;
...@@ -35,7 +35,7 @@ static int run_self_unlink(int *monitored_pid, const char *rm_path) ...@@ -35,7 +35,7 @@ static int run_self_unlink(int *monitored_pid, const char *rm_path)
dup2(null_fd, STDERR_FILENO); dup2(null_fd, STDERR_FILENO);
close(null_fd); close(null_fd);
*monitored_pid = getpid(); skel->bss->monitored_pid = getpid();
/* Use the copied /usr/bin/rm to delete itself /* Use the copied /usr/bin/rm to delete itself
* /tmp/copy_of_rm /tmp/copy_of_rm. * /tmp/copy_of_rm /tmp/copy_of_rm.
*/ */
...@@ -44,6 +44,7 @@ static int run_self_unlink(int *monitored_pid, const char *rm_path) ...@@ -44,6 +44,7 @@ static int run_self_unlink(int *monitored_pid, const char *rm_path)
exit(errno); exit(errno);
} else if (child_pid > 0) { } else if (child_pid > 0) {
waitpid(child_pid, &child_status, 0); waitpid(child_pid, &child_status, 0);
ASSERT_EQ(skel->data->task_storage_result, 0, "task_storage_result");
return WEXITSTATUS(child_status); return WEXITSTATUS(child_status);
} }
...@@ -133,7 +134,7 @@ void test_test_local_storage(void) ...@@ -133,7 +134,7 @@ void test_test_local_storage(void)
* unlink its executable. This operation should be denied by the loaded * unlink its executable. This operation should be denied by the loaded
* LSM program. * LSM program.
*/ */
err = run_self_unlink(&skel->bss->monitored_pid, tmp_exec_path); err = run_self_unlink(skel, tmp_exec_path);
if (!ASSERT_EQ(err, EPERM, "run_self_unlink")) if (!ASSERT_EQ(err, EPERM, "run_self_unlink"))
goto close_prog_rmdir; goto close_prog_rmdir;
......
...@@ -22,6 +22,13 @@ struct { ...@@ -22,6 +22,13 @@ struct {
__type(value, struct storage); __type(value, struct storage);
} sk_storage_map SEC(".maps"); } sk_storage_map SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct storage);
} task_storage_map SEC(".maps");
SEC("raw_tp/kmalloc") SEC("raw_tp/kmalloc")
int BPF_PROG(kmalloc, unsigned long call_site, const void *ptr, int BPF_PROG(kmalloc, unsigned long call_site, const void *ptr,
size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags, size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags,
...@@ -32,6 +39,24 @@ int BPF_PROG(kmalloc, unsigned long call_site, const void *ptr, ...@@ -32,6 +39,24 @@ int BPF_PROG(kmalloc, unsigned long call_site, const void *ptr,
return 0; return 0;
} }
SEC("tp_btf/sched_process_fork")
int BPF_PROG(fork, struct task_struct *parent, struct task_struct *child)
{
struct storage *stg;
if (parent->tgid != bench_pid)
return 0;
stg = bpf_task_storage_get(&task_storage_map, child, NULL,
BPF_LOCAL_STORAGE_GET_F_CREATE);
if (stg)
__sync_fetch_and_add(&create_cnts, 1);
else
__sync_fetch_and_add(&create_errs, 1);
return 0;
}
SEC("lsm.s/socket_post_create") SEC("lsm.s/socket_post_create")
int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, int BPF_PROG(socket_post_create, struct socket *sock, int family, int type,
int protocol, int kern) int protocol, int kern)
......
...@@ -16,6 +16,7 @@ char _license[] SEC("license") = "GPL"; ...@@ -16,6 +16,7 @@ char _license[] SEC("license") = "GPL";
int monitored_pid = 0; int monitored_pid = 0;
int inode_storage_result = -1; int inode_storage_result = -1;
int sk_storage_result = -1; int sk_storage_result = -1;
int task_storage_result = -1;
struct local_storage { struct local_storage {
struct inode *exec_inode; struct inode *exec_inode;
...@@ -50,26 +51,57 @@ struct { ...@@ -50,26 +51,57 @@ struct {
__type(value, struct local_storage); __type(value, struct local_storage);
} task_storage_map SEC(".maps"); } task_storage_map SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct local_storage);
} task_storage_map2 SEC(".maps");
SEC("lsm/inode_unlink") SEC("lsm/inode_unlink")
int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim) int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim)
{ {
__u32 pid = bpf_get_current_pid_tgid() >> 32; __u32 pid = bpf_get_current_pid_tgid() >> 32;
struct bpf_local_storage *local_storage;
struct local_storage *storage; struct local_storage *storage;
struct task_struct *task;
bool is_self_unlink; bool is_self_unlink;
if (pid != monitored_pid) if (pid != monitored_pid)
return 0; return 0;
storage = bpf_task_storage_get(&task_storage_map, task = bpf_get_current_task_btf();
bpf_get_current_task_btf(), 0, 0); if (!task)
if (storage) { return 0;
task_storage_result = -1;
storage = bpf_task_storage_get(&task_storage_map, task, 0, 0);
if (!storage)
return 0;
/* Don't let an executable delete itself */ /* Don't let an executable delete itself */
is_self_unlink = storage->exec_inode == victim->d_inode; is_self_unlink = storage->exec_inode == victim->d_inode;
if (is_self_unlink)
return -EPERM;
}
storage = bpf_task_storage_get(&task_storage_map2, task, 0,
BPF_LOCAL_STORAGE_GET_F_CREATE);
if (!storage || storage->value)
return 0;
if (bpf_task_storage_delete(&task_storage_map, task))
return 0;
/* Ensure that the task_storage_map is disconnected from the storage.
* The storage memory should not be freed back to the
* bpf_mem_alloc.
*/
local_storage = task->bpf_storage;
if (!local_storage || local_storage->smap)
return 0; return 0;
task_storage_result = 0;
return is_self_unlink ? -EPERM : 0;
} }
SEC("lsm.s/inode_rename") SEC("lsm.s/inode_rename")
...@@ -139,11 +171,7 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address, ...@@ -139,11 +171,7 @@ int BPF_PROG(socket_bind, struct socket *sock, struct sockaddr *address,
if (bpf_sk_storage_delete(&sk_storage_map, sock->sk)) if (bpf_sk_storage_delete(&sk_storage_map, sock->sk))
return 0; return 0;
/* Ensure that the sk_storage_map is disconnected from the storage. /* Ensure that the sk_storage_map is disconnected from the storage. */
* The storage memory should not be freed back to the
* bpf_mem_alloc of the sk_bpf_storage_map because
* sk_bpf_storage_map may have been gone.
*/
if (!sock->sk->sk_bpf_storage || sock->sk->sk_bpf_storage->smap) if (!sock->sk->sk_bpf_storage || sock->sk->sk_bpf_storage->smap)
return 0; return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment