Commit f14b488d authored by David S. Miller

Merge branch 'bpf-map-prealloc'

Alexei Starovoitov says:

====================
bpf: map pre-alloc

v1->v2:
. fixed a few issues spotted by Daniel
. converted stackmap to pre-allocation as well
. added a workaround for a lockdep false positive
. added pcpu_freelist_populate to be used by hashmap and stackmap

This patch set switches the bpf hash map to use pre-allocation by default
and introduces the BPF_F_NO_PREALLOC flag to keep the old behavior for cases
where full map pre-allocation is too memory expensive.

Some time back Daniel Wagner reported crashes when a bpf hash map is
used to compute time intervals between preempt_disable->preempt_enable,
and recently Tom Zanussi reported a deadlock in the iovisor/bcc/funccount
tool if it's used to count the number of invocations of kernel
'*spin*' functions. Both problems are due to the recursive use of
slub and can only be solved by pre-allocating all map elements.

A lot of different solutions were considered. Many were implemented,
but in the end pre-allocation seems to be the only feasible answer.
As far as pre-allocation goes, it was also implemented in 4 different ways:
- simple free-list with single lock
- percpu_ida with optimizations
- blk-mq-tag variant customized for bpf use case
- percpu_freelist
For the bpf style of alloc/free patterns percpu_freelist is the best
and is implemented in this patch set.
Detailed performance numbers in patch 3.
Patch 2 introduces percpu_freelist
Patch 1 fixes simple deadlocks due to missing recursion checks
Patch 5: converts stackmap to pre-allocation
Patches 6-9: prepare test infra
Patch 10: stress test for hash map infra. It attaches to spin_lock
functions and bpf_map_update/delete are called from different contexts
Patch 11: stress for bpf_get_stackid
Patch 12: map performance test
Reported-by: Daniel Wagner <daniel.wagner@bmw-carit.de>
Reported-by: Tom Zanussi <tom.zanussi@linux.intel.com>
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 8aba8b83 c3f85cff
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include <uapi/linux/bpf.h> #include <uapi/linux/bpf.h>
#include <linux/workqueue.h> #include <linux/workqueue.h>
#include <linux/file.h> #include <linux/file.h>
#include <linux/percpu.h>
struct bpf_map; struct bpf_map;
...@@ -36,6 +37,7 @@ struct bpf_map { ...@@ -36,6 +37,7 @@ struct bpf_map {
u32 key_size; u32 key_size;
u32 value_size; u32 value_size;
u32 max_entries; u32 max_entries;
u32 map_flags;
u32 pages; u32 pages;
struct user_struct *user; struct user_struct *user;
const struct bpf_map_ops *ops; const struct bpf_map_ops *ops;
...@@ -163,6 +165,8 @@ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *f ...@@ -163,6 +165,8 @@ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *f
const struct bpf_func_proto *bpf_get_trace_printk_proto(void); const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
#ifdef CONFIG_BPF_SYSCALL #ifdef CONFIG_BPF_SYSCALL
DECLARE_PER_CPU(int, bpf_prog_active);
void bpf_register_prog_type(struct bpf_prog_type_list *tl); void bpf_register_prog_type(struct bpf_prog_type_list *tl);
void bpf_register_map_type(struct bpf_map_type_list *tl); void bpf_register_map_type(struct bpf_map_type_list *tl);
...@@ -175,6 +179,7 @@ struct bpf_map *__bpf_map_get(struct fd f); ...@@ -175,6 +179,7 @@ struct bpf_map *__bpf_map_get(struct fd f);
void bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_inc(struct bpf_map *map, bool uref);
void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put_with_uref(struct bpf_map *map);
void bpf_map_put(struct bpf_map *map); void bpf_map_put(struct bpf_map *map);
int bpf_map_precharge_memlock(u32 pages);
extern int sysctl_unprivileged_bpf_disabled; extern int sysctl_unprivileged_bpf_disabled;
...@@ -190,6 +195,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, ...@@ -190,6 +195,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
u64 flags); u64 flags);
int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
u64 flags); u64 flags);
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value);
/* memcpy that is used with 8-byte aligned pointers, power-of-8 size and /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
* forced to use 'long' read/writes to try to atomically copy long counters. * forced to use 'long' read/writes to try to atomically copy long counters.
......
...@@ -101,12 +101,15 @@ enum bpf_prog_type { ...@@ -101,12 +101,15 @@ enum bpf_prog_type {
#define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_NOEXIST 1 /* create new element if it didn't exist */
#define BPF_EXIST 2 /* update existing element */ #define BPF_EXIST 2 /* update existing element */
#define BPF_F_NO_PREALLOC (1U << 0)
union bpf_attr { union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */ struct { /* anonymous struct used by BPF_MAP_CREATE command */
__u32 map_type; /* one of enum bpf_map_type */ __u32 map_type; /* one of enum bpf_map_type */
__u32 key_size; /* size of key in bytes */ __u32 key_size; /* size of key in bytes */
__u32 value_size; /* size of value in bytes */ __u32 value_size; /* size of value in bytes */
__u32 max_entries; /* max number of entries in a map */ __u32 max_entries; /* max number of entries in a map */
__u32 map_flags; /* prealloc or not */
}; };
struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
......
obj-y := core.o obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o
ifeq ($(CONFIG_PERF_EVENTS),y) ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif endif
...@@ -53,7 +53,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) ...@@ -53,7 +53,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
/* check sanity of attributes */ /* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 || if (attr->max_entries == 0 || attr->key_size != 4 ||
attr->value_size == 0) attr->value_size == 0 || attr->map_flags)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1)) if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
......
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
* *
* This program is free software; you can redistribute it and/or * This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public * modify it under the terms of version 2 of the GNU General Public
...@@ -13,6 +14,7 @@ ...@@ -13,6 +14,7 @@
#include <linux/jhash.h> #include <linux/jhash.h>
#include <linux/filter.h> #include <linux/filter.h>
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include "percpu_freelist.h"
struct bucket { struct bucket {
struct hlist_head head; struct hlist_head head;
...@@ -22,6 +24,8 @@ struct bucket { ...@@ -22,6 +24,8 @@ struct bucket {
struct bpf_htab { struct bpf_htab {
struct bpf_map map; struct bpf_map map;
struct bucket *buckets; struct bucket *buckets;
void *elems;
struct pcpu_freelist freelist;
atomic_t count; /* number of elements in this hashtable */ atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */ u32 n_buckets; /* number of hash buckets */
u32 elem_size; /* size of each element in bytes */ u32 elem_size; /* size of each element in bytes */
...@@ -29,15 +33,86 @@ struct bpf_htab { ...@@ -29,15 +33,86 @@ struct bpf_htab {
/* each htab element is struct htab_elem + key + value */ /* each htab element is struct htab_elem + key + value */
struct htab_elem { struct htab_elem {
union {
struct hlist_node hash_node; struct hlist_node hash_node;
struct bpf_htab *htab;
struct pcpu_freelist_node fnode;
};
struct rcu_head rcu; struct rcu_head rcu;
union {
u32 hash; u32 hash;
u32 key_size;
};
char key[0] __aligned(8); char key[0] __aligned(8);
}; };
/* Store the per-cpu value pointer @pptr in element @l.
 *
 * The pointer lives in the element's trailing storage, @key_size bytes
 * past the start of the key.
 */
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	void __percpu **slot = (void __percpu **)(l->key + key_size);

	*slot = pptr;
}
/* Read back the per-cpu value pointer kept in element @l, located
 * @key_size bytes past the start of the key (see htab_elem_set_ptr()).
 */
static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	void __percpu **slot = (void __percpu **)(l->key + key_size);

	return *slot;
}
static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
return (struct htab_elem *) (htab->elems + i * htab->elem_size);
}
/* Release the pre-allocated element area of @htab.
 *
 * For BPF_MAP_TYPE_PERCPU_HASH maps the per-cpu value area referenced by
 * each of the max_entries elements is freed first; for every map type the
 * element area itself is then released with vfree().
 */
static void htab_free_elems(struct bpf_htab *htab)
{
	int idx;

	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) {
		for (idx = 0; idx < htab->map.max_entries; idx++) {
			struct htab_elem *elem = get_htab_elem(htab, idx);

			free_percpu(htab_elem_get_ptr(elem,
						      htab->map.key_size));
		}
	}

	vfree(htab->elems);
}
static int prealloc_elems_and_freelist(struct bpf_htab *htab)
{
int err = -ENOMEM, i;
htab->elems = vzalloc(htab->elem_size * htab->map.max_entries);
if (!htab->elems)
return -ENOMEM;
if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH)
goto skip_percpu_elems;
for (i = 0; i < htab->map.max_entries; i++) {
u32 size = round_up(htab->map.value_size, 8);
void __percpu *pptr;
pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
if (!pptr)
goto free_elems;
htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
pptr);
}
skip_percpu_elems:
err = pcpu_freelist_init(&htab->freelist);
if (err)
goto free_elems;
pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size,
htab->map.max_entries);
return 0;
free_elems:
htab_free_elems(htab);
return err;
}
/* Called from syscall */ /* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr) static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{ {
...@@ -46,6 +121,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) ...@@ -46,6 +121,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
int err, i; int err, i;
u64 cost; u64 cost;
if (attr->map_flags & ~BPF_F_NO_PREALLOC)
/* reserved bits should not be used */
return ERR_PTR(-EINVAL);
htab = kzalloc(sizeof(*htab), GFP_USER); htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab) if (!htab)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
...@@ -55,6 +134,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) ...@@ -55,6 +134,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->map.key_size = attr->key_size; htab->map.key_size = attr->key_size;
htab->map.value_size = attr->value_size; htab->map.value_size = attr->value_size;
htab->map.max_entries = attr->max_entries; htab->map.max_entries = attr->max_entries;
htab->map.map_flags = attr->map_flags;
/* check sanity of attributes. /* check sanity of attributes.
* value_size == 0 may be allowed in the future to use map as a set * value_size == 0 may be allowed in the future to use map as a set
...@@ -92,7 +172,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) ...@@ -92,7 +172,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
if (percpu) if (percpu)
htab->elem_size += sizeof(void *); htab->elem_size += sizeof(void *);
else else
htab->elem_size += htab->map.value_size; htab->elem_size += round_up(htab->map.value_size, 8);
/* prevent zero size kmalloc and check for u32 overflow */ /* prevent zero size kmalloc and check for u32 overflow */
if (htab->n_buckets == 0 || if (htab->n_buckets == 0 ||
...@@ -112,6 +192,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) ...@@ -112,6 +192,11 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
/* if map size is larger than memlock limit, reject it early */
err = bpf_map_precharge_memlock(htab->map.pages);
if (err)
goto free_htab;
err = -ENOMEM; err = -ENOMEM;
htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
GFP_USER | __GFP_NOWARN); GFP_USER | __GFP_NOWARN);
...@@ -127,10 +212,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) ...@@ -127,10 +212,16 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
raw_spin_lock_init(&htab->buckets[i].lock); raw_spin_lock_init(&htab->buckets[i].lock);
} }
atomic_set(&htab->count, 0); if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
err = prealloc_elems_and_freelist(htab);
if (err)
goto free_buckets;
}
return &htab->map; return &htab->map;
free_buckets:
kvfree(htab->buckets);
free_htab: free_htab:
kfree(htab); kfree(htab);
return ERR_PTR(err); return ERR_PTR(err);
...@@ -249,42 +340,42 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) ...@@ -249,42 +340,42 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
} }
} }
/* itereated over all buckets and all elements */ /* iterated over all buckets and all elements */
return -ENOENT; return -ENOENT;
} }
static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr)
{
*(void __percpu **)(l->key + key_size) = pptr;
}
static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{ {
return *(void __percpu **)(l->key + key_size); if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
} free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
static void htab_percpu_elem_free(struct htab_elem *l)
{
free_percpu(htab_elem_get_ptr(l, l->key_size));
kfree(l); kfree(l);
} }
static void htab_percpu_elem_free_rcu(struct rcu_head *head) static void htab_elem_free_rcu(struct rcu_head *head)
{ {
struct htab_elem *l = container_of(head, struct htab_elem, rcu); struct htab_elem *l = container_of(head, struct htab_elem, rcu);
struct bpf_htab *htab = l->htab;
htab_percpu_elem_free(l); /* must increment bpf_prog_active to avoid kprobe+bpf triggering while
* we're calling kfree, otherwise deadlock is possible if kprobes
* are placed somewhere inside of slub
*/
preempt_disable();
__this_cpu_inc(bpf_prog_active);
htab_elem_free(htab, l);
__this_cpu_dec(bpf_prog_active);
preempt_enable();
} }
static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{ {
if (percpu) { if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
l->key_size = key_size; pcpu_freelist_push(&htab->freelist, &l->fnode);
call_rcu(&l->rcu, htab_percpu_elem_free_rcu);
} else { } else {
kfree_rcu(l, rcu); atomic_dec(&htab->count);
l->htab = htab;
call_rcu(&l->rcu, htab_elem_free_rcu);
} }
} }
...@@ -293,23 +384,39 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, ...@@ -293,23 +384,39 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
bool percpu, bool onallcpus) bool percpu, bool onallcpus)
{ {
u32 size = htab->map.value_size; u32 size = htab->map.value_size;
bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
struct htab_elem *l_new; struct htab_elem *l_new;
void __percpu *pptr; void __percpu *pptr;
if (prealloc) {
l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
if (!l_new)
return ERR_PTR(-E2BIG);
} else {
if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
atomic_dec(&htab->count);
return ERR_PTR(-E2BIG);
}
l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
if (!l_new) if (!l_new)
return NULL; return ERR_PTR(-ENOMEM);
}
memcpy(l_new->key, key, key_size); memcpy(l_new->key, key, key_size);
if (percpu) { if (percpu) {
/* round up value_size to 8 bytes */ /* round up value_size to 8 bytes */
size = round_up(size, 8); size = round_up(size, 8);
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
/* alloc_percpu zero-fills */ /* alloc_percpu zero-fills */
pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN); pptr = __alloc_percpu_gfp(size, 8,
GFP_ATOMIC | __GFP_NOWARN);
if (!pptr) { if (!pptr) {
kfree(l_new); kfree(l_new);
return NULL; return ERR_PTR(-ENOMEM);
}
} }
if (!onallcpus) { if (!onallcpus) {
...@@ -324,6 +431,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, ...@@ -324,6 +431,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
off += size; off += size;
} }
} }
if (!prealloc)
htab_elem_set_ptr(l_new, key_size, pptr); htab_elem_set_ptr(l_new, key_size, pptr);
} else { } else {
memcpy(l_new->key + round_up(key_size, 8), value, size); memcpy(l_new->key + round_up(key_size, 8), value, size);
...@@ -336,12 +444,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, ...@@ -336,12 +444,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
u64 map_flags) u64 map_flags)
{ {
if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries))
/* if elem with this 'key' doesn't exist and we've reached
* max_entries limit, fail insertion of new elem
*/
return -E2BIG;
if (l_old && map_flags == BPF_NOEXIST) if (l_old && map_flags == BPF_NOEXIST)
/* elem already exists */ /* elem already exists */
return -EEXIST; return -EEXIST;
...@@ -375,13 +477,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, ...@@ -375,13 +477,6 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
hash = htab_map_hash(key, key_size); hash = htab_map_hash(key, key_size);
/* allocate new element outside of the lock, since
* we're most likley going to insert it
*/
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
if (!l_new)
return -ENOMEM;
b = __select_bucket(htab, hash); b = __select_bucket(htab, hash);
head = &b->head; head = &b->head;
...@@ -394,21 +489,24 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, ...@@ -394,21 +489,24 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
if (ret) if (ret)
goto err; goto err;
l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
if (IS_ERR(l_new)) {
/* all pre-allocated elements are in use or memory exhausted */
ret = PTR_ERR(l_new);
goto err;
}
/* add new element to the head of the list, so that /* add new element to the head of the list, so that
* concurrent search will find it before old elem * concurrent search will find it before old elem
*/ */
hlist_add_head_rcu(&l_new->hash_node, head); hlist_add_head_rcu(&l_new->hash_node, head);
if (l_old) { if (l_old) {
hlist_del_rcu(&l_old->hash_node); hlist_del_rcu(&l_old->hash_node);
kfree_rcu(l_old, rcu); free_htab_elem(htab, l_old);
} else {
atomic_inc(&htab->count);
} }
raw_spin_unlock_irqrestore(&b->lock, flags); ret = 0;
return 0;
err: err:
raw_spin_unlock_irqrestore(&b->lock, flags); raw_spin_unlock_irqrestore(&b->lock, flags);
kfree(l_new);
return ret; return ret;
} }
...@@ -466,12 +564,11 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, ...@@ -466,12 +564,11 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
} else { } else {
l_new = alloc_htab_elem(htab, key, value, key_size, l_new = alloc_htab_elem(htab, key, value, key_size,
hash, true, onallcpus); hash, true, onallcpus);
if (!l_new) { if (IS_ERR(l_new)) {
ret = -ENOMEM; ret = PTR_ERR(l_new);
goto err; goto err;
} }
hlist_add_head_rcu(&l_new->hash_node, head); hlist_add_head_rcu(&l_new->hash_node, head);
atomic_inc(&htab->count);
} }
ret = 0; ret = 0;
err: err:
...@@ -489,7 +586,6 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, ...@@ -489,7 +586,6 @@ static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
static int htab_map_delete_elem(struct bpf_map *map, void *key) static int htab_map_delete_elem(struct bpf_map *map, void *key)
{ {
struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH;
struct hlist_head *head; struct hlist_head *head;
struct bucket *b; struct bucket *b;
struct htab_elem *l; struct htab_elem *l;
...@@ -511,8 +607,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) ...@@ -511,8 +607,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
if (l) { if (l) {
hlist_del_rcu(&l->hash_node); hlist_del_rcu(&l->hash_node);
atomic_dec(&htab->count); free_htab_elem(htab, l);
free_htab_elem(l, percpu, key_size);
ret = 0; ret = 0;
} }
...@@ -531,17 +626,10 @@ static void delete_all_elements(struct bpf_htab *htab) ...@@ -531,17 +626,10 @@ static void delete_all_elements(struct bpf_htab *htab)
hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_for_each_entry_safe(l, n, head, hash_node) {
hlist_del_rcu(&l->hash_node); hlist_del_rcu(&l->hash_node);
atomic_dec(&htab->count); htab_elem_free(htab, l);
if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) {
l->key_size = htab->map.key_size;
htab_percpu_elem_free(l);
} else {
kfree(l);
}
} }
} }
} }
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ /* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map) static void htab_map_free(struct bpf_map *map)
{ {
...@@ -554,10 +642,16 @@ static void htab_map_free(struct bpf_map *map) ...@@ -554,10 +642,16 @@ static void htab_map_free(struct bpf_map *map)
*/ */
synchronize_rcu(); synchronize_rcu();
/* some of kfree_rcu() callbacks for elements of this map may not have /* some of free_htab_elem() callbacks for elements of this map may
* executed. It's ok. Proceed to free residual elements and map itself * not have executed. Wait for them.
*/ */
rcu_barrier();
if (htab->map.map_flags & BPF_F_NO_PREALLOC) {
delete_all_elements(htab); delete_all_elements(htab);
} else {
htab_free_elems(htab);
pcpu_freelist_destroy(&htab->freelist);
}
kvfree(htab->buckets); kvfree(htab->buckets);
kfree(htab); kfree(htab);
} }
......
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include "percpu_freelist.h"
int pcpu_freelist_init(struct pcpu_freelist *s)
{
int cpu;
s->freelist = alloc_percpu(struct pcpu_freelist_head);
if (!s->freelist)
return -ENOMEM;
for_each_possible_cpu(cpu) {
struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu);
raw_spin_lock_init(&head->lock);
head->first = NULL;
}
return 0;
}
void pcpu_freelist_destroy(struct pcpu_freelist *s)
{
free_percpu(s->freelist);
}
static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
struct pcpu_freelist_node *node)
{
raw_spin_lock(&head->lock);
node->next = head->first;
head->first = node;
raw_spin_unlock(&head->lock);
}
/* Return @node to the freelist @s by pushing it onto the list that
 * belongs to the CPU this code is currently running on.
 */
void pcpu_freelist_push(struct pcpu_freelist *s,
			struct pcpu_freelist_node *node)
{
	__pcpu_freelist_push(this_cpu_ptr(s->freelist), node);
}
/* Seed the freelist @s with @nr_elems nodes carved out of @buf.
 *
 * @buf is treated as an array of @nr_elems objects that are @elem_size
 * bytes apart; the start of each object is used as its freelist node.
 * Objects are handed out in runs of (nr_elems / num_possible_cpus() + 1)
 * per possible CPU, in CPU iteration order, until all of them are placed.
 *
 * An empty request (@nr_elems == 0) is a no-op, so @buf is never touched
 * when there is nothing to distribute.
 */
void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
			    u32 nr_elems)
{
	struct pcpu_freelist_head *head;
	unsigned long flags;
	u32 pcpu_entries, placed = 0, n;
	int cpu;

	if (!nr_elems)
		return;

	pcpu_entries = nr_elems / num_possible_cpus() + 1;

	/* disable irq to workaround lockdep false positive
	 * in bpf usage pcpu_freelist_populate() will never race
	 * with pcpu_freelist_push()
	 */
	local_irq_save(flags);
	for_each_possible_cpu(cpu) {
		head = per_cpu_ptr(s->freelist, cpu);
		/* give this CPU its share, but never walk past the buffer */
		for (n = 0; n < pcpu_entries && placed < nr_elems;
		     n++, placed++) {
			__pcpu_freelist_push(head, buf);
			buf += elem_size;
		}
		if (placed == nr_elems)
			break;
	}
	local_irq_restore(flags);
}
/* Take one node off the freelist @s.
 *
 * The search starts with the list of the CPU reported by
 * raw_smp_processor_id() and then moves through the remaining possible
 * CPUs, wrapping around to CPU 0 after the last one.  Each list is
 * examined under its own raw spinlock.
 *
 * Returns the removed node, or NULL once every list has been found empty.
 */
struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
{
	const int start = raw_smp_processor_id();
	int cur = start;

	do {
		struct pcpu_freelist_head *h = per_cpu_ptr(s->freelist, cur);
		struct pcpu_freelist_node *found;

		raw_spin_lock(&h->lock);
		found = h->first;
		if (found)
			h->first = found->next;
		raw_spin_unlock(&h->lock);

		if (found)
			return found;

		/* nothing here, try the next possible CPU's list */
		cur = cpumask_next(cur, cpu_possible_mask);
		if (cur >= nr_cpu_ids)
			cur = 0;
	} while (cur != start);

	return NULL;
}
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#ifndef __PERCPU_FREELIST_H__
#define __PERCPU_FREELIST_H__
#include <linux/spinlock.h>
#include <linux/percpu.h>
/* Head of one singly linked list of free nodes; the freelist keeps one of
 * these per CPU.
 */
struct pcpu_freelist_head {
/* first free node on this list, NULL when the list is empty */
struct pcpu_freelist_node *first;
/* taken around every read or update of 'first' by push and pop */
raw_spinlock_t lock;
};
/* A freelist made of per-cpu lists: push adds to the current CPU's list,
 * pop starts at the current CPU's list and falls back to the other CPUs.
 */
struct pcpu_freelist {
/* per-cpu list heads, allocated by pcpu_freelist_init() */
struct pcpu_freelist_head __percpu *freelist;
};
/* Link embedded in every object that can be kept on a pcpu_freelist. */
struct pcpu_freelist_node {
/* next free node on the same list, NULL at the end of the list */
struct pcpu_freelist_node *next;
};
void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *);
void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
u32 nr_elems);
int pcpu_freelist_init(struct pcpu_freelist *);
void pcpu_freelist_destroy(struct pcpu_freelist *s);
#endif
...@@ -10,9 +10,10 @@ ...@@ -10,9 +10,10 @@
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include <linux/stacktrace.h> #include <linux/stacktrace.h>
#include <linux/perf_event.h> #include <linux/perf_event.h>
#include "percpu_freelist.h"
struct stack_map_bucket { struct stack_map_bucket {
struct rcu_head rcu; struct pcpu_freelist_node fnode;
u32 hash; u32 hash;
u32 nr; u32 nr;
u64 ip[]; u64 ip[];
...@@ -20,10 +21,34 @@ struct stack_map_bucket { ...@@ -20,10 +21,34 @@ struct stack_map_bucket {
struct bpf_stack_map { struct bpf_stack_map {
struct bpf_map map; struct bpf_map map;
void *elems;
struct pcpu_freelist freelist;
u32 n_buckets; u32 n_buckets;
struct stack_map_bucket __rcu *buckets[]; struct stack_map_bucket *buckets[];
}; };
static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
{
u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
int err;
smap->elems = vzalloc(elem_size * smap->map.max_entries);
if (!smap->elems)
return -ENOMEM;
err = pcpu_freelist_init(&smap->freelist);
if (err)
goto free_elems;
pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size,
smap->map.max_entries);
return 0;
free_elems:
vfree(smap->elems);
return err;
}
/* Called from syscall */ /* Called from syscall */
static struct bpf_map *stack_map_alloc(union bpf_attr *attr) static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
{ {
...@@ -35,6 +60,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) ...@@ -35,6 +60,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (!capable(CAP_SYS_ADMIN)) if (!capable(CAP_SYS_ADMIN))
return ERR_PTR(-EPERM); return ERR_PTR(-EPERM);
if (attr->map_flags)
return ERR_PTR(-EINVAL);
/* check sanity of attributes */ /* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 || if (attr->max_entries == 0 || attr->key_size != 4 ||
value_size < 8 || value_size % 8 || value_size < 8 || value_size % 8 ||
...@@ -67,12 +95,22 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) ...@@ -67,12 +95,22 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
smap->n_buckets = n_buckets; smap->n_buckets = n_buckets;
smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
err = bpf_map_precharge_memlock(smap->map.pages);
if (err)
goto free_smap;
err = get_callchain_buffers(); err = get_callchain_buffers();
if (err) if (err)
goto free_smap; goto free_smap;
err = prealloc_elems_and_freelist(smap);
if (err)
goto put_buffers;
return &smap->map; return &smap->map;
put_buffers:
put_callchain_buffers();
free_smap: free_smap:
kvfree(smap); kvfree(smap);
return ERR_PTR(err); return ERR_PTR(err);
...@@ -118,7 +156,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) ...@@ -118,7 +156,7 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
ips = trace->ip + skip + init_nr; ips = trace->ip + skip + init_nr;
hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0);
id = hash & (smap->n_buckets - 1); id = hash & (smap->n_buckets - 1);
bucket = rcu_dereference(smap->buckets[id]); bucket = READ_ONCE(smap->buckets[id]);
if (bucket && bucket->hash == hash) { if (bucket && bucket->hash == hash) {
if (flags & BPF_F_FAST_STACK_CMP) if (flags & BPF_F_FAST_STACK_CMP)
...@@ -132,19 +170,18 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) ...@@ -132,19 +170,18 @@ static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5)
if (bucket && !(flags & BPF_F_REUSE_STACKID)) if (bucket && !(flags & BPF_F_REUSE_STACKID))
return -EEXIST; return -EEXIST;
new_bucket = kmalloc(sizeof(struct stack_map_bucket) + map->value_size, new_bucket = (struct stack_map_bucket *)
GFP_ATOMIC | __GFP_NOWARN); pcpu_freelist_pop(&smap->freelist);
if (unlikely(!new_bucket)) if (unlikely(!new_bucket))
return -ENOMEM; return -ENOMEM;
memcpy(new_bucket->ip, ips, trace_len); memcpy(new_bucket->ip, ips, trace_len);
memset(new_bucket->ip + trace_len / 8, 0, map->value_size - trace_len);
new_bucket->hash = hash; new_bucket->hash = hash;
new_bucket->nr = trace_nr; new_bucket->nr = trace_nr;
old_bucket = xchg(&smap->buckets[id], new_bucket); old_bucket = xchg(&smap->buckets[id], new_bucket);
if (old_bucket) if (old_bucket)
kfree_rcu(old_bucket, rcu); pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
return id; return id;
} }
...@@ -157,17 +194,34 @@ const struct bpf_func_proto bpf_get_stackid_proto = { ...@@ -157,17 +194,34 @@ const struct bpf_func_proto bpf_get_stackid_proto = {
.arg3_type = ARG_ANYTHING, .arg3_type = ARG_ANYTHING,
}; };
/* Called from syscall or from eBPF program */ /* Called from eBPF program */
static void *stack_map_lookup_elem(struct bpf_map *map, void *key) static void *stack_map_lookup_elem(struct bpf_map *map, void *key)
{
return NULL;
}
/* Called from syscall */
int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{ {
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
struct stack_map_bucket *bucket; struct stack_map_bucket *bucket, *old_bucket;
u32 id = *(u32 *)key; u32 id = *(u32 *)key, trace_len;
if (unlikely(id >= smap->n_buckets)) if (unlikely(id >= smap->n_buckets))
return NULL; return -ENOENT;
bucket = rcu_dereference(smap->buckets[id]);
return bucket ? bucket->ip : NULL; bucket = xchg(&smap->buckets[id], NULL);
if (!bucket)
return -ENOENT;
trace_len = bucket->nr * sizeof(u64);
memcpy(value, bucket->ip, trace_len);
memset(value + trace_len, 0, map->value_size - trace_len);
old_bucket = xchg(&smap->buckets[id], bucket);
if (old_bucket)
pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
return 0;
} }
static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
...@@ -193,7 +247,7 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key) ...@@ -193,7 +247,7 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key)
old_bucket = xchg(&smap->buckets[id], NULL); old_bucket = xchg(&smap->buckets[id], NULL);
if (old_bucket) { if (old_bucket) {
kfree_rcu(old_bucket, rcu); pcpu_freelist_push(&smap->freelist, &old_bucket->fnode);
return 0; return 0;
} else { } else {
return -ENOENT; return -ENOENT;
...@@ -204,13 +258,12 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key) ...@@ -204,13 +258,12 @@ static int stack_map_delete_elem(struct bpf_map *map, void *key)
static void stack_map_free(struct bpf_map *map) static void stack_map_free(struct bpf_map *map)
{ {
struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
int i;
/* wait for bpf programs to complete before freeing stack map */
synchronize_rcu(); synchronize_rcu();
for (i = 0; i < smap->n_buckets; i++) vfree(smap->elems);
if (smap->buckets[i]) pcpu_freelist_destroy(&smap->freelist);
kfree_rcu(smap->buckets[i], rcu);
kvfree(smap); kvfree(smap);
put_callchain_buffers(); put_callchain_buffers();
} }
......
...@@ -18,6 +18,8 @@ ...@@ -18,6 +18,8 @@
#include <linux/filter.h> #include <linux/filter.h>
#include <linux/version.h> #include <linux/version.h>
DEFINE_PER_CPU(int, bpf_prog_active);
int sysctl_unprivileged_bpf_disabled __read_mostly; int sysctl_unprivileged_bpf_disabled __read_mostly;
static LIST_HEAD(bpf_map_types); static LIST_HEAD(bpf_map_types);
...@@ -46,6 +48,19 @@ void bpf_register_map_type(struct bpf_map_type_list *tl) ...@@ -46,6 +48,19 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
list_add(&tl->list_node, &bpf_map_types); list_add(&tl->list_node, &bpf_map_types);
} }
/*
 * Check whether the current user could lock @pages additional pages without
 * exceeding RLIMIT_MEMLOCK. Nothing is charged here; the user's locked_vm
 * counter is only read.
 *
 * Returns 0 if the pages would fit, -EPERM otherwise.
 */
int bpf_map_precharge_memlock(u32 pages)
{
	struct user_struct *user = get_current_user();
	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	unsigned long locked = atomic_long_read(&user->locked_vm);

	/* the reference was only needed to sample locked_vm */
	free_uid(user);

	return (locked + pages > limit) ? -EPERM : 0;
}
static int bpf_map_charge_memlock(struct bpf_map *map) static int bpf_map_charge_memlock(struct bpf_map *map)
{ {
struct user_struct *user = get_current_user(); struct user_struct *user = get_current_user();
...@@ -151,7 +166,7 @@ int bpf_map_new_fd(struct bpf_map *map) ...@@ -151,7 +166,7 @@ int bpf_map_new_fd(struct bpf_map *map)
offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
sizeof(attr->CMD##_LAST_FIELD)) != NULL sizeof(attr->CMD##_LAST_FIELD)) != NULL
#define BPF_MAP_CREATE_LAST_FIELD max_entries #define BPF_MAP_CREATE_LAST_FIELD map_flags
/* called via syscall */ /* called via syscall */
static int map_create(union bpf_attr *attr) static int map_create(union bpf_attr *attr)
{ {
...@@ -275,6 +290,8 @@ static int map_lookup_elem(union bpf_attr *attr) ...@@ -275,6 +290,8 @@ static int map_lookup_elem(union bpf_attr *attr)
err = bpf_percpu_hash_copy(map, key, value); err = bpf_percpu_hash_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value); err = bpf_percpu_array_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_copy(map, key, value);
} else { } else {
rcu_read_lock(); rcu_read_lock();
ptr = map->ops->map_lookup_elem(map, key); ptr = map->ops->map_lookup_elem(map, key);
...@@ -347,6 +364,11 @@ static int map_update_elem(union bpf_attr *attr) ...@@ -347,6 +364,11 @@ static int map_update_elem(union bpf_attr *attr)
if (copy_from_user(value, uvalue, value_size) != 0) if (copy_from_user(value, uvalue, value_size) != 0)
goto free_value; goto free_value;
/* must increment bpf_prog_active to avoid kprobe+bpf triggering from
* inside bpf map update or delete otherwise deadlocks are possible
*/
preempt_disable();
__this_cpu_inc(bpf_prog_active);
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) {
err = bpf_percpu_hash_update(map, key, value, attr->flags); err = bpf_percpu_hash_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
...@@ -356,6 +378,8 @@ static int map_update_elem(union bpf_attr *attr) ...@@ -356,6 +378,8 @@ static int map_update_elem(union bpf_attr *attr)
err = map->ops->map_update_elem(map, key, value, attr->flags); err = map->ops->map_update_elem(map, key, value, attr->flags);
rcu_read_unlock(); rcu_read_unlock();
} }
__this_cpu_dec(bpf_prog_active);
preempt_enable();
free_value: free_value:
kfree(value); kfree(value);
...@@ -394,9 +418,13 @@ static int map_delete_elem(union bpf_attr *attr) ...@@ -394,9 +418,13 @@ static int map_delete_elem(union bpf_attr *attr)
if (copy_from_user(key, ukey, map->key_size) != 0) if (copy_from_user(key, ukey, map->key_size) != 0)
goto free_key; goto free_key;
preempt_disable();
__this_cpu_inc(bpf_prog_active);
rcu_read_lock(); rcu_read_lock();
err = map->ops->map_delete_elem(map, key); err = map->ops->map_delete_elem(map, key);
rcu_read_unlock(); rcu_read_unlock();
__this_cpu_dec(bpf_prog_active);
preempt_enable();
free_key: free_key:
kfree(key); kfree(key);
......
...@@ -13,8 +13,6 @@ ...@@ -13,8 +13,6 @@
#include <linux/ctype.h> #include <linux/ctype.h>
#include "trace.h" #include "trace.h"
static DEFINE_PER_CPU(int, bpf_prog_active);
/** /**
* trace_call_bpf - invoke BPF program * trace_call_bpf - invoke BPF program
* @prog: BPF program * @prog: BPF program
......
...@@ -61,6 +61,7 @@ struct bpf_map_def { ...@@ -61,6 +61,7 @@ struct bpf_map_def {
unsigned int key_size; unsigned int key_size;
unsigned int value_size; unsigned int value_size;
unsigned int max_entries; unsigned int max_entries;
unsigned int map_flags;
}; };
static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) = static int (*bpf_skb_store_bytes)(void *ctx, int off, void *from, int len, int flags) =
......
...@@ -157,9 +157,13 @@ static int load_maps(struct bpf_map_def *maps, int len) ...@@ -157,9 +157,13 @@ static int load_maps(struct bpf_map_def *maps, int len)
map_fd[i] = bpf_create_map(maps[i].type, map_fd[i] = bpf_create_map(maps[i].type,
maps[i].key_size, maps[i].key_size,
maps[i].value_size, maps[i].value_size,
maps[i].max_entries); maps[i].max_entries,
if (map_fd[i] < 0) maps[i].map_flags);
if (map_fd[i] < 0) {
printf("failed to create a map: %d %s\n",
errno, strerror(errno));
return 1; return 1;
}
if (maps[i].type == BPF_MAP_TYPE_PROG_ARRAY) if (maps[i].type == BPF_MAP_TYPE_PROG_ARRAY)
prog_array_fd = map_fd[i]; prog_array_fd = map_fd[i];
...@@ -343,3 +347,65 @@ void read_trace_pipe(void) ...@@ -343,3 +347,65 @@ void read_trace_pipe(void)
} }
} }
} }
/* capacity of the syms[] table */
#define MAX_SYMS 300000
/* kernel symbols read from /proc/kallsyms, sorted by address in load_kallsyms() */
static struct ksym syms[MAX_SYMS];
/* number of entries of syms[] filled in by load_kallsyms() */
static int sym_cnt;
static int ksym_cmp(const void *p1, const void *p2)
{
return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
}
/*
 * Load the kernel symbol table from /proc/kallsyms into syms[].
 *
 * Each line is parsed as "<address> <type> <name>"; entries whose address
 * is zero are skipped. Reading stops at end of file, at the first line that
 * does not parse, or once syms[] is full. The entries loaded so far are
 * sorted by address and sym_cnt is set to their number.
 *
 * Returns 0 on success, -ENOENT if /proc/kallsyms cannot be opened, or
 * -ENOMEM if a symbol name cannot be duplicated.
 */
int load_kallsyms(void)
{
	FILE *f = fopen("/proc/kallsyms", "r");
	const int max_syms = sizeof(syms) / sizeof(syms[0]);
	char func[256], buf[256];
	char symbol;
	void *addr;
	int err = 0;
	int i = 0;

	if (!f)
		return -ENOENT;

	/* never write past the end of the fixed-size table */
	while (i < max_syms) {
		if (!fgets(buf, sizeof(buf), f))
			break;
		/* the field width keeps the name within func[] */
		if (sscanf(buf, "%p %c %255s", &addr, &symbol, func) != 3)
			break;
		if (!addr)
			continue;
		syms[i].name = strdup(func);
		if (!syms[i].name) {
			err = -ENOMEM;
			break;
		}
		syms[i].addr = (long) addr;
		i++;
	}
	fclose(f);

	/* keep the table consistent even when loading stopped early */
	sym_cnt = i;
	qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
	return err;
}
/*
 * Look up the symbol that contains address @key.
 *
 * Binary-searches syms[], which load_kallsyms() sorted by address, and
 * returns the entry with the greatest address not above @key. An address
 * beyond the last loaded symbol is attributed to that last symbol, since
 * there is no following entry to bound it. An address below the first
 * symbol (or an empty table) yields &syms[0].
 *
 * Addresses are compared as longs; storing their difference in an int would
 * truncate it and could send the search in the wrong direction.
 */
struct ksym *ksym_search(long key)
{
	int start = 0, end = sym_cnt;

	while (start < end) {
		int mid = start + (end - start) / 2;
		long addr = syms[mid].addr;

		if (key < addr)
			end = mid;
		else if (key > addr)
			start = mid + 1;
		else
			return &syms[mid];
	}

	/*
	 * start is now the index of the first symbol above key. Only look
	 * at syms[start] when it is a loaded entry.
	 */
	if (start >= 1 && syms[start - 1].addr < key &&
	    (start == sym_cnt || key < syms[start].addr))
		/* valid ksym */
		return &syms[start - 1];

	/* out of range. return _stext */
	return &syms[0];
}
...@@ -23,5 +23,11 @@ extern int event_fd[MAX_PROGS]; ...@@ -23,5 +23,11 @@ extern int event_fd[MAX_PROGS];
int load_bpf_file(char *path); int load_bpf_file(char *path);
void read_trace_pipe(void); void read_trace_pipe(void);
/* one kernel symbol as loaded from /proc/kallsyms */
struct ksym {
/* address of the symbol */
long addr;
/* symbol name, a strdup() copy made by load_kallsyms() */
char *name;
};
/*
 * Read /proc/kallsyms into an address-sorted symbol table.
 * Returns 0 on success or a negative errno value on failure.
 */
int load_kallsyms(void);
/*
 * Find the symbol covering address key in the table built by
 * load_kallsyms(); falls back to the first table entry when key is out of
 * range.
 */
struct ksym *ksym_search(long key);
#endif #endif
...@@ -44,7 +44,7 @@ static void usage(void) ...@@ -44,7 +44,7 @@ static void usage(void)
static int bpf_map_create(void) static int bpf_map_create(void)
{ {
return bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(uint32_t), return bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(uint32_t),
sizeof(uint32_t), 1024); sizeof(uint32_t), 1024, 0);
} }
static int bpf_prog_create(const char *object) static int bpf_prog_create(const char *object)
......
...@@ -19,13 +19,14 @@ static __u64 ptr_to_u64(void *ptr) ...@@ -19,13 +19,14 @@ static __u64 ptr_to_u64(void *ptr)
} }
int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
int max_entries) int max_entries, int map_flags)
{ {
union bpf_attr attr = { union bpf_attr attr = {
.map_type = map_type, .map_type = map_type,
.key_size = key_size, .key_size = key_size,
.value_size = value_size, .value_size = value_size,
.max_entries = max_entries .max_entries = max_entries,
.map_flags = map_flags,
}; };
return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
struct bpf_insn; struct bpf_insn;
int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size, int bpf_create_map(enum bpf_map_type map_type, int key_size, int value_size,
int max_entries); int max_entries, int map_flags);
int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags); int bpf_update_elem(int fd, void *key, void *value, unsigned long long flags);
int bpf_lookup_elem(int fd, void *key, void *value); int bpf_lookup_elem(int fd, void *key, void *value);
int bpf_delete_elem(int fd, void *key); int bpf_delete_elem(int fd, void *key);
......
...@@ -18,80 +18,15 @@ ...@@ -18,80 +18,15 @@
#include "libbpf.h" #include "libbpf.h"
#include "bpf_load.h" #include "bpf_load.h"
#define MAX_SYMS 300000
#define PRINT_RAW_ADDR 0 #define PRINT_RAW_ADDR 0
/*
 * One kernel symbol (address plus strdup() copy of its name) and the table
 * of symbols that load_kallsyms() fills from /proc/kallsyms and sorts by
 * address.
 */
static struct ksym {
long addr;
char *name;
} syms[MAX_SYMS];
/* number of entries of syms[] filled in by load_kallsyms() */
static int sym_cnt;
static int ksym_cmp(const void *p1, const void *p2)
{
return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
}
/*
 * Load the kernel symbol table from /proc/kallsyms into syms[].
 *
 * Each line is parsed as "<address> <type> <name>"; entries whose address
 * is zero are skipped. Reading stops at end of file, at the first line that
 * does not parse, or once syms[] is full. The entries loaded so far are
 * sorted by address and sym_cnt is set to their number.
 *
 * Returns 0 on success, -ENOENT if /proc/kallsyms cannot be opened, or
 * -ENOMEM if a symbol name cannot be duplicated.
 */
static int load_kallsyms(void)
{
	FILE *f = fopen("/proc/kallsyms", "r");
	const int max_syms = sizeof(syms) / sizeof(syms[0]);
	char func[256], buf[256];
	char symbol;
	void *addr;
	int err = 0;
	int i = 0;

	if (!f)
		return -ENOENT;

	/* never write past the end of the fixed-size table */
	while (i < max_syms) {
		if (!fgets(buf, sizeof(buf), f))
			break;
		/* the field width keeps the name within func[] */
		if (sscanf(buf, "%p %c %255s", &addr, &symbol, func) != 3)
			break;
		if (!addr)
			continue;
		syms[i].name = strdup(func);
		if (!syms[i].name) {
			err = -ENOMEM;
			break;
		}
		syms[i].addr = (long) addr;
		i++;
	}
	fclose(f);

	/* keep the table consistent even when loading stopped early */
	sym_cnt = i;
	qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
	return err;
}
/*
 * Look up the symbol that contains address @key.
 *
 * Binary-searches syms[], which load_kallsyms() sorted by address, and
 * returns a pointer to the struct ksym with the greatest address not above
 * @key. An address beyond the last loaded symbol is attributed to that last
 * symbol, since there is no following entry to bound it. An address below
 * the first symbol (or an empty table) yields &syms[0].
 *
 * Addresses are compared as longs; storing their difference in an int would
 * truncate it and could send the search in the wrong direction.
 */
static void *search(long key)
{
	int start = 0, end = sym_cnt;

	while (start < end) {
		int mid = start + (end - start) / 2;
		long addr = syms[mid].addr;

		if (key < addr)
			end = mid;
		else if (key > addr)
			start = mid + 1;
		else
			return &syms[mid];
	}

	/*
	 * start is now the index of the first symbol above key. Only look
	 * at syms[start] when it is a loaded entry.
	 */
	if (start >= 1 && syms[start - 1].addr < key &&
	    (start == sym_cnt || key < syms[start].addr))
		/* valid ksym */
		return &syms[start - 1];

	/* out of range. return _stext */
	return &syms[0];
}
static void print_ksym(__u64 addr) static void print_ksym(__u64 addr)
{ {
struct ksym *sym; struct ksym *sym;
if (!addr) if (!addr)
return; return;
sym = search(addr); sym = ksym_search(addr);
if (PRINT_RAW_ADDR) if (PRINT_RAW_ADDR)
printf("%s/%llx;", sym->name, addr); printf("%s/%llx;", sym->name, addr);
else else
......
...@@ -34,7 +34,7 @@ static int test_sock(void) ...@@ -34,7 +34,7 @@ static int test_sock(void)
long long value = 0, tcp_cnt, udp_cnt, icmp_cnt; long long value = 0, tcp_cnt, udp_cnt, icmp_cnt;
map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value),
256); 256, 0);
if (map_fd < 0) { if (map_fd < 0) {
printf("failed to create map '%s'\n", strerror(errno)); printf("failed to create map '%s'\n", strerror(errno));
goto cleanup; goto cleanup;
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
* Testsuite for eBPF maps * Testsuite for eBPF maps
* *
* Copyright (c) 2014 PLUMgrid, http://plumgrid.com * Copyright (c) 2014 PLUMgrid, http://plumgrid.com
* Copyright (c) 2016 Facebook
* *
* This program is free software; you can redistribute it and/or * This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public * modify it under the terms of version 2 of the GNU General Public
...@@ -17,13 +18,16 @@ ...@@ -17,13 +18,16 @@
#include <stdlib.h> #include <stdlib.h>
#include "libbpf.h" #include "libbpf.h"
static int map_flags;
/* sanity tests for map API */ /* sanity tests for map API */
static void test_hashmap_sanity(int i, void *data) static void test_hashmap_sanity(int i, void *data)
{ {
long long key, next_key, value; long long key, next_key, value;
int map_fd; int map_fd;
map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), 2); map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
2, map_flags);
if (map_fd < 0) { if (map_fd < 0) {
printf("failed to create hashmap '%s'\n", strerror(errno)); printf("failed to create hashmap '%s'\n", strerror(errno));
exit(1); exit(1);
...@@ -99,7 +103,7 @@ static void test_percpu_hashmap_sanity(int task, void *data) ...@@ -99,7 +103,7 @@ static void test_percpu_hashmap_sanity(int task, void *data)
int map_fd, i; int map_fd, i;
map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_HASH, sizeof(key), map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_HASH, sizeof(key),
sizeof(value[0]), 2); sizeof(value[0]), 2, map_flags);
if (map_fd < 0) { if (map_fd < 0) {
printf("failed to create hashmap '%s'\n", strerror(errno)); printf("failed to create hashmap '%s'\n", strerror(errno));
exit(1); exit(1);
...@@ -188,7 +192,8 @@ static void test_arraymap_sanity(int i, void *data) ...@@ -188,7 +192,8 @@ static void test_arraymap_sanity(int i, void *data)
int key, next_key, map_fd; int key, next_key, map_fd;
long long value; long long value;
map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), 2); map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value),
2, 0);
if (map_fd < 0) { if (map_fd < 0) {
printf("failed to create arraymap '%s'\n", strerror(errno)); printf("failed to create arraymap '%s'\n", strerror(errno));
exit(1); exit(1);
...@@ -244,7 +249,7 @@ static void test_percpu_arraymap_many_keys(void) ...@@ -244,7 +249,7 @@ static void test_percpu_arraymap_many_keys(void)
int key, map_fd, i; int key, map_fd, i;
map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY, sizeof(key), map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY, sizeof(key),
sizeof(values[0]), nr_keys); sizeof(values[0]), nr_keys, 0);
if (map_fd < 0) { if (map_fd < 0) {
printf("failed to create per-cpu arraymap '%s'\n", printf("failed to create per-cpu arraymap '%s'\n",
strerror(errno)); strerror(errno));
...@@ -275,7 +280,7 @@ static void test_percpu_arraymap_sanity(int i, void *data) ...@@ -275,7 +280,7 @@ static void test_percpu_arraymap_sanity(int i, void *data)
int key, next_key, map_fd; int key, next_key, map_fd;
map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY, sizeof(key), map_fd = bpf_create_map(BPF_MAP_TYPE_PERCPU_ARRAY, sizeof(key),
sizeof(values[0]), 2); sizeof(values[0]), 2, 0);
if (map_fd < 0) { if (map_fd < 0) {
printf("failed to create arraymap '%s'\n", strerror(errno)); printf("failed to create arraymap '%s'\n", strerror(errno));
exit(1); exit(1);
...@@ -336,7 +341,7 @@ static void test_map_large(void) ...@@ -336,7 +341,7 @@ static void test_map_large(void)
/* allocate 4Mbyte of memory */ /* allocate 4Mbyte of memory */
map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
MAP_SIZE); MAP_SIZE, map_flags);
if (map_fd < 0) { if (map_fd < 0) {
printf("failed to create large map '%s'\n", strerror(errno)); printf("failed to create large map '%s'\n", strerror(errno));
exit(1); exit(1);
...@@ -421,7 +426,7 @@ static void test_map_parallel(void) ...@@ -421,7 +426,7 @@ static void test_map_parallel(void)
int data[2]; int data[2];
map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value), map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(key), sizeof(value),
MAP_SIZE); MAP_SIZE, map_flags);
if (map_fd < 0) { if (map_fd < 0) {
printf("failed to create map for parallel test '%s'\n", printf("failed to create map for parallel test '%s'\n",
strerror(errno)); strerror(errno));
...@@ -463,7 +468,7 @@ static void test_map_parallel(void) ...@@ -463,7 +468,7 @@ static void test_map_parallel(void)
assert(bpf_get_next_key(map_fd, &key, &key) == -1 && errno == ENOENT); assert(bpf_get_next_key(map_fd, &key, &key) == -1 && errno == ENOENT);
} }
int main(void) static void run_all_tests(void)
{ {
test_hashmap_sanity(0, NULL); test_hashmap_sanity(0, NULL);
test_percpu_hashmap_sanity(0, NULL); test_percpu_hashmap_sanity(0, NULL);
...@@ -474,6 +479,14 @@ int main(void) ...@@ -474,6 +479,14 @@ int main(void)
test_map_large(); test_map_large();
test_map_parallel(); test_map_parallel();
test_map_stress(); test_map_stress();
}
int main(void)
{
map_flags = 0;
run_all_tests();
map_flags = BPF_F_NO_PREALLOC;
run_all_tests();
printf("test_maps: OK\n"); printf("test_maps: OK\n");
return 0; return 0;
} }
...@@ -1198,7 +1198,7 @@ static int create_map(void) ...@@ -1198,7 +1198,7 @@ static int create_map(void)
int map_fd; int map_fd;
map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, map_fd = bpf_create_map(BPF_MAP_TYPE_HASH,
sizeof(long long), sizeof(long long), 1024); sizeof(long long), sizeof(long long), 1024, 0);
if (map_fd < 0) if (map_fd < 0)
printf("failed to create map '%s'\n", strerror(errno)); printf("failed to create map '%s'\n", strerror(errno));
...@@ -1210,7 +1210,7 @@ static int create_prog_array(void) ...@@ -1210,7 +1210,7 @@ static int create_prog_array(void)
int map_fd; int map_fd;
map_fd = bpf_create_map(BPF_MAP_TYPE_PROG_ARRAY, map_fd = bpf_create_map(BPF_MAP_TYPE_PROG_ARRAY,
sizeof(int), sizeof(int), 4); sizeof(int), sizeof(int), 4, 0);
if (map_fd < 0) if (map_fd < 0)
printf("failed to create prog_array '%s'\n", strerror(errno)); printf("failed to create prog_array '%s'\n", strerror(errno));
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment