Commit b741f163 authored by Roman Gushchin's avatar Roman Gushchin Committed by Daniel Borkmann

bpf: introduce per-cpu cgroup local storage

This commit introduced per-cpu cgroup local storage.

Per-cpu cgroup local storage is very similar to simple cgroup storage
(let's call it shared), except all the data is per-cpu.

The main goal of per-cpu variant is to implement super fast
counters (e.g. packet counters), which don't require neither
lookups, neither atomic operations.

>From userspace's point of view, accessing a per-cpu cgroup storage
is similar to other per-cpu map types (e.g. per-cpu hashmaps and
arrays).

Writing to a per-cpu cgroup storage is not atomic, but is performed
by copying longs, so some minimal atomicity is here, exactly
as with other per-cpu maps.
Signed-off-by: default avatarRoman Gushchin <guro@fb.com>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Alexei Starovoitov <ast@kernel.org>
Acked-by: default avatarSong Liu <songliubraving@fb.com>
Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
parent f294b37e
...@@ -37,7 +37,10 @@ struct bpf_storage_buffer { ...@@ -37,7 +37,10 @@ struct bpf_storage_buffer {
}; };
struct bpf_cgroup_storage { struct bpf_cgroup_storage {
union {
struct bpf_storage_buffer *buf; struct bpf_storage_buffer *buf;
void __percpu *percpu_buf;
};
struct bpf_cgroup_storage_map *map; struct bpf_cgroup_storage_map *map;
struct bpf_cgroup_storage_key key; struct bpf_cgroup_storage_key key;
struct list_head list; struct list_head list;
...@@ -109,6 +112,9 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, ...@@ -109,6 +112,9 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
static inline enum bpf_cgroup_storage_type cgroup_storage_type( static inline enum bpf_cgroup_storage_type cgroup_storage_type(
struct bpf_map *map) struct bpf_map *map)
{ {
if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
return BPF_CGROUP_STORAGE_PERCPU;
return BPF_CGROUP_STORAGE_SHARED; return BPF_CGROUP_STORAGE_SHARED;
} }
...@@ -131,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); ...@@ -131,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map); int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map); void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
void *value, u64 flags);
/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \
({ \ ({ \
...@@ -285,6 +295,14 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( ...@@ -285,6 +295,14 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; } struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
static inline void bpf_cgroup_storage_free( static inline void bpf_cgroup_storage_free(
struct bpf_cgroup_storage *storage) {} struct bpf_cgroup_storage *storage) {}
static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key,
void *value) {
return 0;
}
static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
void *key, void *value, u64 flags) {
return 0;
}
#define cgroup_bpf_enabled (0) #define cgroup_bpf_enabled (0)
#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
......
...@@ -274,6 +274,7 @@ struct bpf_prog_offload { ...@@ -274,6 +274,7 @@ struct bpf_prog_offload {
enum bpf_cgroup_storage_type { enum bpf_cgroup_storage_type {
BPF_CGROUP_STORAGE_SHARED, BPF_CGROUP_STORAGE_SHARED,
BPF_CGROUP_STORAGE_PERCPU,
__BPF_CGROUP_STORAGE_MAX __BPF_CGROUP_STORAGE_MAX
}; };
......
...@@ -43,6 +43,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) ...@@ -43,6 +43,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
#endif #endif
#ifdef CONFIG_CGROUP_BPF #ifdef CONFIG_CGROUP_BPF
BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops)
#endif #endif
BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
......
...@@ -127,6 +127,7 @@ enum bpf_map_type { ...@@ -127,6 +127,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_SOCKHASH, BPF_MAP_TYPE_SOCKHASH,
BPF_MAP_TYPE_CGROUP_STORAGE, BPF_MAP_TYPE_CGROUP_STORAGE,
BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
}; };
enum bpf_prog_type { enum bpf_prog_type {
......
...@@ -206,10 +206,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) ...@@ -206,10 +206,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
*/ */
enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
struct bpf_cgroup_storage *storage; struct bpf_cgroup_storage *storage;
void *ptr;
storage = this_cpu_read(bpf_cgroup_storage[stype]); storage = this_cpu_read(bpf_cgroup_storage[stype]);
return (unsigned long)&READ_ONCE(storage->buf)->data[0]; if (stype == BPF_CGROUP_STORAGE_SHARED)
ptr = &READ_ONCE(storage->buf)->data[0];
else
ptr = this_cpu_ptr(storage->percpu_buf);
return (unsigned long)ptr;
} }
const struct bpf_func_proto bpf_get_local_storage_proto = { const struct bpf_func_proto bpf_get_local_storage_proto = {
......
...@@ -152,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, ...@@ -152,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
return 0; return 0;
} }
int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
void *value)
{
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
struct bpf_cgroup_storage_key *key = _key;
struct bpf_cgroup_storage *storage;
int cpu, off = 0;
u32 size;
rcu_read_lock();
storage = cgroup_storage_lookup(map, key, false);
if (!storage) {
rcu_read_unlock();
return -ENOENT;
}
/* per_cpu areas are zero-filled and bpf programs can only
* access 'value_size' of them, so copying rounded areas
* will not leak any kernel data
*/
size = round_up(_map->value_size, 8);
for_each_possible_cpu(cpu) {
bpf_long_memcpy(value + off,
per_cpu_ptr(storage->percpu_buf, cpu), size);
off += size;
}
rcu_read_unlock();
return 0;
}
int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
void *value, u64 map_flags)
{
struct bpf_cgroup_storage_map *map = map_to_storage(_map);
struct bpf_cgroup_storage_key *key = _key;
struct bpf_cgroup_storage *storage;
int cpu, off = 0;
u32 size;
if (map_flags != BPF_ANY && map_flags != BPF_EXIST)
return -EINVAL;
rcu_read_lock();
storage = cgroup_storage_lookup(map, key, false);
if (!storage) {
rcu_read_unlock();
return -ENOENT;
}
/* the user space will provide round_up(value_size, 8) bytes that
* will be copied into per-cpu area. bpf programs can only access
* value_size of it. During lookup the same extra bytes will be
* returned or zeros which were zero-filled by percpu_alloc,
* so no kernel data leaks possible
*/
size = round_up(_map->value_size, 8);
for_each_possible_cpu(cpu) {
bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
value + off, size);
off += size;
}
rcu_read_unlock();
return 0;
}
static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
void *_next_key) void *_next_key)
{ {
...@@ -287,60 +352,105 @@ void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) ...@@ -287,60 +352,105 @@ void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map)
spin_unlock_bh(&map->lock); spin_unlock_bh(&map->lock);
} }
static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages)
{
size_t size;
if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) {
size = sizeof(struct bpf_storage_buffer) + map->value_size;
*pages = round_up(sizeof(struct bpf_cgroup_storage) + size,
PAGE_SIZE) >> PAGE_SHIFT;
} else {
size = map->value_size;
*pages = round_up(round_up(size, 8) * num_possible_cpus(),
PAGE_SIZE) >> PAGE_SHIFT;
}
return size;
}
struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
enum bpf_cgroup_storage_type stype) enum bpf_cgroup_storage_type stype)
{ {
struct bpf_cgroup_storage *storage; struct bpf_cgroup_storage *storage;
struct bpf_map *map; struct bpf_map *map;
gfp_t flags;
size_t size;
u32 pages; u32 pages;
map = prog->aux->cgroup_storage[stype]; map = prog->aux->cgroup_storage[stype];
if (!map) if (!map)
return NULL; return NULL;
pages = round_up(sizeof(struct bpf_cgroup_storage) + size = bpf_cgroup_storage_calculate_size(map, &pages);
sizeof(struct bpf_storage_buffer) +
map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
if (bpf_map_charge_memlock(map, pages)) if (bpf_map_charge_memlock(map, pages))
return ERR_PTR(-EPERM); return ERR_PTR(-EPERM);
storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
__GFP_ZERO | GFP_USER, map->numa_node); __GFP_ZERO | GFP_USER, map->numa_node);
if (!storage) { if (!storage)
bpf_map_uncharge_memlock(map, pages); goto enomem;
return ERR_PTR(-ENOMEM);
flags = __GFP_ZERO | GFP_USER;
if (stype == BPF_CGROUP_STORAGE_SHARED) {
storage->buf = kmalloc_node(size, flags, map->numa_node);
if (!storage->buf)
goto enomem;
} else {
storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
if (!storage->percpu_buf)
goto enomem;
} }
storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) + storage->map = (struct bpf_cgroup_storage_map *)map;
map->value_size, __GFP_ZERO | GFP_USER,
map->numa_node); return storage;
if (!storage->buf) {
enomem:
bpf_map_uncharge_memlock(map, pages); bpf_map_uncharge_memlock(map, pages);
kfree(storage); kfree(storage);
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
} }
storage->map = (struct bpf_cgroup_storage_map *)map; static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu)
{
struct bpf_cgroup_storage *storage =
container_of(rcu, struct bpf_cgroup_storage, rcu);
return storage; kfree(storage->buf);
kfree(storage);
}
static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu)
{
struct bpf_cgroup_storage *storage =
container_of(rcu, struct bpf_cgroup_storage, rcu);
free_percpu(storage->percpu_buf);
kfree(storage);
} }
void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
{ {
u32 pages; enum bpf_cgroup_storage_type stype;
struct bpf_map *map; struct bpf_map *map;
u32 pages;
if (!storage) if (!storage)
return; return;
map = &storage->map->map; map = &storage->map->map;
pages = round_up(sizeof(struct bpf_cgroup_storage) +
sizeof(struct bpf_storage_buffer) + bpf_cgroup_storage_calculate_size(map, &pages);
map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
bpf_map_uncharge_memlock(map, pages); bpf_map_uncharge_memlock(map, pages);
kfree_rcu(storage->buf, rcu); stype = cgroup_storage_type(map);
kfree_rcu(storage, rcu); if (stype == BPF_CGROUP_STORAGE_SHARED)
call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu);
else
call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu);
} }
void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
......
...@@ -686,7 +686,8 @@ static int map_lookup_elem(union bpf_attr *attr) ...@@ -686,7 +686,8 @@ static int map_lookup_elem(union bpf_attr *attr)
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
value_size = round_up(map->value_size, 8) * num_possible_cpus(); value_size = round_up(map->value_size, 8) * num_possible_cpus();
else if (IS_FD_MAP(map)) else if (IS_FD_MAP(map))
value_size = sizeof(u32); value_size = sizeof(u32);
...@@ -705,6 +706,8 @@ static int map_lookup_elem(union bpf_attr *attr) ...@@ -705,6 +706,8 @@ static int map_lookup_elem(union bpf_attr *attr)
err = bpf_percpu_hash_copy(map, key, value); err = bpf_percpu_hash_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value); err = bpf_percpu_array_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
err = bpf_percpu_cgroup_storage_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_copy(map, key, value); err = bpf_stackmap_copy(map, key, value);
} else if (IS_FD_ARRAY(map)) { } else if (IS_FD_ARRAY(map)) {
...@@ -774,7 +777,8 @@ static int map_update_elem(union bpf_attr *attr) ...@@ -774,7 +777,8 @@ static int map_update_elem(union bpf_attr *attr)
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
value_size = round_up(map->value_size, 8) * num_possible_cpus(); value_size = round_up(map->value_size, 8) * num_possible_cpus();
else else
value_size = map->value_size; value_size = map->value_size;
...@@ -809,6 +813,9 @@ static int map_update_elem(union bpf_attr *attr) ...@@ -809,6 +813,9 @@ static int map_update_elem(union bpf_attr *attr)
err = bpf_percpu_hash_update(map, key, value, attr->flags); err = bpf_percpu_hash_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_update(map, key, value, attr->flags); err = bpf_percpu_array_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
err = bpf_percpu_cgroup_storage_update(map, key, value,
attr->flags);
} else if (IS_FD_ARRAY(map)) { } else if (IS_FD_ARRAY(map)) {
rcu_read_lock(); rcu_read_lock();
err = bpf_fd_array_map_update_elem(map, f.file, key, value, err = bpf_fd_array_map_update_elem(map, f.file, key, value,
......
...@@ -2074,6 +2074,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, ...@@ -2074,6 +2074,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
goto error; goto error;
break; break;
case BPF_MAP_TYPE_CGROUP_STORAGE: case BPF_MAP_TYPE_CGROUP_STORAGE:
case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
if (func_id != BPF_FUNC_get_local_storage) if (func_id != BPF_FUNC_get_local_storage)
goto error; goto error;
break; break;
...@@ -2164,7 +2165,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, ...@@ -2164,7 +2165,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
goto error; goto error;
break; break;
case BPF_FUNC_get_local_storage: case BPF_FUNC_get_local_storage:
if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
goto error; goto error;
break; break;
case BPF_FUNC_sk_select_reuseport: case BPF_FUNC_sk_select_reuseport:
...@@ -5049,6 +5051,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, ...@@ -5049,6 +5051,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return 0; return 0;
} }
static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
{
return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
}
/* look for pseudo eBPF instructions that access map FDs and /* look for pseudo eBPF instructions that access map FDs and
* replace them with actual map pointers * replace them with actual map pointers
*/ */
...@@ -5139,10 +5147,9 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) ...@@ -5139,10 +5147,9 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
} }
env->used_maps[env->used_map_cnt++] = map; env->used_maps[env->used_map_cnt++] = map;
if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE && if (bpf_map_is_cgroup_storage(map) &&
bpf_cgroup_storage_assign(env->prog, map)) { bpf_cgroup_storage_assign(env->prog, map)) {
verbose(env, verbose(env, "only one cgroup storage of each type is allowed\n");
"only one cgroup storage is allowed\n");
fdput(f); fdput(f);
return -EBUSY; return -EBUSY;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment