Commit 8c290e60 authored by Alexei Starovoitov, committed by David S. Miller

bpf: fix hashmap extra_elems logic

In both kmalloc and prealloc modes, bpf_map_update_elem() uses
per-cpu extra_elems to do an atomic update when the map is full.
There are two issues with this. First, the logic can be misused, since it allows
max_entries+num_cpus elements to be present in the map. Second, alloc_extra_elems()
at map creation time can fail the percpu allocation for large map values with a warning:
WARNING: CPU: 3 PID: 2752 at ../mm/percpu.c:892 pcpu_alloc+0x119/0xa60
illegal size (32824) or align (8) for percpu allocation

The fixes for these two issues differ between kmalloc and prealloc modes.
For prealloc mode, allocate num_possible_cpus extra elements and store
their pointers into the extra_elems array instead of the actual elements.
Hence these hidden (spare) elements can be used not only when the map is full,
but also during a bpf_map_update_elem() that replaces an existing element
(see the sketch below).
That also improves performance, since pcpu_freelist_pop/push is avoided.
Unfortunately this approach cannot be used for kmalloc mode, which needs
to kfree elements after an RCU grace period. Therefore switch it back to a normal
kmalloc even when the map is full and an old element exists, as it was prior to
commit 6c905981 ("bpf: pre-allocate hash map elements").
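
To make the prealloc-mode change concrete, here is a minimal userspace sketch of
the per-cpu pointer-swap idea. This is illustrative only, not the kernel code: the
names elem, NCPUS, pool and update_existing are invented for the example, and
locking, hashing and the freelist are left out. Each CPU owns one preallocated
spare element; an update that replaces an existing element hands out the spare and
parks the replaced element as the new spare, so no pcpu_freelist_pop/push is needed
and the map never holds more than max_entries live elements.

/* Sketch only: userspace illustration of the extra_elems pointer swap.
 * All names here are made up for the example.
 */
#include <stdio.h>

#define NCPUS 4

struct elem { int key; int value; };

static struct elem pool[NCPUS];          /* preallocated spare elements */
static struct elem *extra_elems[NCPUS];  /* per-"cpu" spare pointer, as in the patch */

/* Replacing an existing element: take this cpu's spare and park the old
 * element as the new spare. No freelist pop/push, element count stays bounded.
 */
static struct elem *update_existing(int cpu, struct elem *old)
{
	struct elem *spare = extra_elems[cpu];

	extra_elems[cpu] = old;
	return spare;   /* caller fills in key/value and links it into the table */
}

int main(void)
{
	struct elem live = { .key = 1, .value = 10 };
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		extra_elems[cpu] = &pool[cpu];

	/* "update" key 1 on cpu 0 */
	struct elem *new_elem = update_existing(0, &live);

	new_elem->key = 1;
	new_elem->value = 20;
	printf("new value=%d, spare now holds old value=%d\n",
	       new_elem->value, extra_elems[0]->value);
	return 0;
}

In the actual patch the same swap happens in alloc_htab_elem() via
this_cpu_ptr(htab->extra_elems), while inserts of new keys still go through
pcpu_freelist_pop().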

Add tests that check updating beyond max_entries and creating maps with large values.
Reported-by: Dave Jones <davej@codemonkey.org.uk>
Fixes: 6c905981 ("bpf: pre-allocate hash map elements")
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Acked-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent dd1ef791
kernel/bpf/hashtab.c
@@ -30,18 +30,12 @@ struct bpf_htab {
 		struct pcpu_freelist freelist;
 		struct bpf_lru lru;
 	};
-	void __percpu *extra_elems;
+	struct htab_elem *__percpu *extra_elems;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
 };
 
-enum extra_elem_state {
-	HTAB_NOT_AN_EXTRA_ELEM = 0,
-	HTAB_EXTRA_ELEM_FREE,
-	HTAB_EXTRA_ELEM_USED
-};
-
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
 	union {
@@ -56,7 +50,6 @@ struct htab_elem {
 	};
 	union {
 		struct rcu_head rcu;
-		enum extra_elem_state state;
 		struct bpf_lru_node lru_node;
 	};
 	u32 hash;
@@ -77,6 +70,11 @@ static bool htab_is_percpu(const struct bpf_htab *htab)
 		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
 }
 
+static bool htab_is_prealloc(const struct bpf_htab *htab)
+{
+	return !(htab->map.map_flags & BPF_F_NO_PREALLOC);
+}
+
 static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
 				     void __percpu *pptr)
 {
@@ -128,17 +126,20 @@ static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 
 static int prealloc_init(struct bpf_htab *htab)
 {
+	u32 num_entries = htab->map.max_entries;
 	int err = -ENOMEM, i;
 
-	htab->elems = bpf_map_area_alloc(htab->elem_size *
-					 htab->map.max_entries);
+	if (!htab_is_percpu(htab) && !htab_is_lru(htab))
+		num_entries += num_possible_cpus();
+
+	htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries);
 	if (!htab->elems)
 		return -ENOMEM;
 
 	if (!htab_is_percpu(htab))
 		goto skip_percpu_elems;
 
-	for (i = 0; i < htab->map.max_entries; i++) {
+	for (i = 0; i < num_entries; i++) {
 		u32 size = round_up(htab->map.value_size, 8);
 		void __percpu *pptr;
@@ -166,11 +167,11 @@ static int prealloc_init(struct bpf_htab *htab)
 	if (htab_is_lru(htab))
 		bpf_lru_populate(&htab->lru, htab->elems,
 				 offsetof(struct htab_elem, lru_node),
-				 htab->elem_size, htab->map.max_entries);
+				 htab->elem_size, num_entries);
 	else
 		pcpu_freelist_populate(&htab->freelist,
 				       htab->elems + offsetof(struct htab_elem, fnode),
-				       htab->elem_size, htab->map.max_entries);
+				       htab->elem_size, num_entries);
 
 	return 0;
@@ -191,16 +192,22 @@ static void prealloc_destroy(struct bpf_htab *htab)
 
 static int alloc_extra_elems(struct bpf_htab *htab)
 {
-	void __percpu *pptr;
+	struct htab_elem *__percpu *pptr, *l_new;
+	struct pcpu_freelist_node *l;
 	int cpu;
 
-	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+	pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8,
+				  GFP_USER | __GFP_NOWARN);
 	if (!pptr)
 		return -ENOMEM;
 
 	for_each_possible_cpu(cpu) {
-		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
-			HTAB_EXTRA_ELEM_FREE;
+		l = pcpu_freelist_pop(&htab->freelist);
+		/* pop will succeed, since prealloc_init()
+		 * preallocated extra num_possible_cpus elements
+		 */
+		l_new = container_of(l, struct htab_elem, fnode);
+		*per_cpu_ptr(pptr, cpu) = l_new;
 	}
 	htab->extra_elems = pptr;
 	return 0;
@@ -342,25 +349,25 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
-	if (!percpu && !lru) {
-		/* lru itself can remove the least used element, so
-		 * there is no need for an extra elem during map_update.
-		 */
-		err = alloc_extra_elems(htab);
-		if (err)
-			goto free_buckets;
-	}
-
 	if (prealloc) {
 		err = prealloc_init(htab);
 		if (err)
-			goto free_extra_elems;
+			goto free_buckets;
+
+		if (!percpu && !lru) {
+			/* lru itself can remove the least used element, so
+			 * there is no need for an extra elem during map_update.
+			 */
+			err = alloc_extra_elems(htab);
+			if (err)
+				goto free_prealloc;
+		}
 	}
 
 	return &htab->map;
 
-free_extra_elems:
-	free_percpu(htab->extra_elems);
+free_prealloc:
+	prealloc_destroy(htab);
 free_buckets:
 	bpf_map_area_free(htab->buckets);
 free_htab:
@@ -575,12 +582,7 @@ static void htab_elem_free_rcu(struct rcu_head *head)
 
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
-	if (l->state == HTAB_EXTRA_ELEM_USED) {
-		l->state = HTAB_EXTRA_ELEM_FREE;
-		return;
-	}
-
-	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
+	if (htab_is_prealloc(htab)) {
 		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
 		atomic_dec(&htab->count);
@@ -610,48 +612,44 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
 					 void *value, u32 key_size, u32 hash,
 					 bool percpu, bool onallcpus,
-					 bool old_elem_exists)
+					 struct htab_elem *old_elem)
 {
 	u32 size = htab->map.value_size;
-	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
-	struct htab_elem *l_new;
+	bool prealloc = htab_is_prealloc(htab);
+	struct htab_elem *l_new, **pl_new;
 	void __percpu *pptr;
-	int err = 0;
 
 	if (prealloc) {
-		struct pcpu_freelist_node *l;
+		if (old_elem) {
+			/* if we're updating the existing element,
+			 * use per-cpu extra elems to avoid freelist_pop/push
+			 */
+			pl_new = this_cpu_ptr(htab->extra_elems);
+			l_new = *pl_new;
+			*pl_new = old_elem;
+		} else {
+			struct pcpu_freelist_node *l;
 
-		l = pcpu_freelist_pop(&htab->freelist);
-		if (!l)
-			err = -E2BIG;
-		else
-			l_new = container_of(l, struct htab_elem, fnode);
+			l = pcpu_freelist_pop(&htab->freelist);
+			if (!l)
+				return ERR_PTR(-E2BIG);
+			l_new = container_of(l, struct htab_elem, fnode);
+		}
 	} else {
-		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
-			atomic_dec(&htab->count);
-			err = -E2BIG;
-		} else {
-			l_new = kmalloc(htab->elem_size,
-					GFP_ATOMIC | __GFP_NOWARN);
-			if (!l_new)
-				return ERR_PTR(-ENOMEM);
-		}
-	}
-
-	if (err) {
-		if (!old_elem_exists)
-			return ERR_PTR(err);
-
-		/* if we're updating the existing element and the hash table
-		 * is full, use per-cpu extra elems
-		 */
-		l_new = this_cpu_ptr(htab->extra_elems);
-		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
-			return ERR_PTR(-E2BIG);
-		l_new->state = HTAB_EXTRA_ELEM_USED;
-	} else {
-		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
+		if (atomic_inc_return(&htab->count) > htab->map.max_entries)
+			if (!old_elem) {
+				/* when map is full and update() is replacing
				 * old element, it's ok to allocate, since
				 * old element will be freed immediately.
				 * Otherwise return an error
				 */
+				atomic_dec(&htab->count);
+				return ERR_PTR(-E2BIG);
+			}
+		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
+		if (!l_new)
+			return ERR_PTR(-ENOMEM);
 	}
 
 	memcpy(l_new->key, key, key_size);
 	if (percpu) {
@@ -731,7 +729,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 		goto err;
 
 	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
-				!!l_old);
+				l_old);
 	if (IS_ERR(l_new)) {
 		/* all pre-allocated elements are in use or memory exhausted */
 		ret = PTR_ERR(l_new);
@@ -744,6 +742,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	hlist_nulls_add_head_rcu(&l_new->hash_node, head);
 	if (l_old) {
 		hlist_nulls_del_rcu(&l_old->hash_node);
+		if (!htab_is_prealloc(htab))
 		free_htab_elem(htab, l_old);
 	}
 	ret = 0;
@@ -856,7 +855,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 					value, onallcpus);
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
-					hash, true, onallcpus, false);
+					hash, true, onallcpus, NULL);
 		if (IS_ERR(l_new)) {
 			ret = PTR_ERR(l_new);
 			goto err;
@@ -1024,7 +1023,6 @@ static void delete_all_elements(struct bpf_htab *htab)
 		hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
 			hlist_nulls_del_rcu(&l->hash_node);
-			if (l->state != HTAB_EXTRA_ELEM_USED)
-				htab_elem_free(htab, l);
+			htab_elem_free(htab, l);
 		}
 	}
@@ -1045,7 +1043,7 @@ static void htab_map_free(struct bpf_map *map)
 	 * not have executed. Wait for them.
 	 */
 	rcu_barrier();
-	if (htab->map.map_flags & BPF_F_NO_PREALLOC)
+	if (!htab_is_prealloc(htab))
 		delete_all_elements(htab);
 	else
 		prealloc_destroy(htab);
tools/testing/selftests/bpf/test_maps.c
@@ -80,8 +80,9 @@ static void test_hashmap(int task, void *data)
 	assert(bpf_map_update_elem(fd, &key, &value, BPF_EXIST) == 0);
 	key = 2;
 	assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0);
-	key = 1;
-	assert(bpf_map_update_elem(fd, &key, &value, BPF_ANY) == 0);
+	key = 3;
+	assert(bpf_map_update_elem(fd, &key, &value, BPF_NOEXIST) == -1 &&
+	       errno == E2BIG);
 
 	/* Check that key = 0 doesn't exist. */
 	key = 0;
@@ -110,6 +111,24 @@ static void test_hashmap(int task, void *data)
 	close(fd);
 }
 
+static void test_hashmap_sizes(int task, void *data)
+{
+	int fd, i, j;
+
+	for (i = 1; i <= 512; i <<= 1)
+		for (j = 1; j <= 1 << 18; j <<= 1) {
+			fd = bpf_create_map(BPF_MAP_TYPE_HASH, i, j,
+					    2, map_flags);
+			if (fd < 0) {
+				printf("Failed to create hashmap key=%d value=%d '%s'\n",
+				       i, j, strerror(errno));
+				exit(1);
+			}
+			close(fd);
+			usleep(10); /* give kernel time to destroy */
+		}
+}
+
 static void test_hashmap_percpu(int task, void *data)
 {
 	unsigned int nr_cpus = bpf_num_possible_cpus();
@@ -317,7 +336,10 @@ static void test_arraymap_percpu(int task, void *data)
 static void test_arraymap_percpu_many_keys(void)
 {
 	unsigned int nr_cpus = bpf_num_possible_cpus();
-	unsigned int nr_keys = 20000;
+	/* nr_keys is not too large otherwise the test stresses percpu
+	 * allocator more than anything else
+	 */
+	unsigned int nr_keys = 2000;
 	long values[nr_cpus];
 	int key, fd, i;
@@ -419,6 +441,7 @@ static void test_map_stress(void)
 {
 	run_parallel(100, test_hashmap, NULL);
 	run_parallel(100, test_hashmap_percpu, NULL);
+	run_parallel(100, test_hashmap_sizes, NULL);
 	run_parallel(100, test_arraymap, NULL);
 	run_parallel(100, test_arraymap_percpu, NULL);