Commit e6ca4f16 authored by David S. Miller

Merge branch 'bpf-lru'

Martin KaFai Lau says:

====================
bpf: LRU map

This patch set adds LRU map implementation to the existing BPF map
family.

The first few patches introduce the basic BPF LRU list
implementation.

The later patches introduce the LRU versions of the
existing BPF_MAP_TYPE_[PERCPU_]HASH maps, named
BPF_MAP_TYPE_LRU_[PERCPU_]HASH, by leveraging
the BPF LRU list.

v2:
- Added a percpu LRU list option which can be specified as
  a map attribute.

  [Note: percpu LRU list has nothing to do with the map's value]

- Removed the cpu variable from the struct bpf_lru_locallist
  since it is not needed.

- Changed the __bpf_lru_node_move_out to __bpf_lru_node_move_to_free in
  patch 1 to prepare the percpu LRU list in patch 2.

- Moved the test_lru_map under selftests

- Refactored a few things in the test codes
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents bb598c1b 5db58faf
...@@ -85,6 +85,8 @@ enum bpf_map_type { ...@@ -85,6 +85,8 @@ enum bpf_map_type {
BPF_MAP_TYPE_PERCPU_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY,
BPF_MAP_TYPE_STACK_TRACE, BPF_MAP_TYPE_STACK_TRACE,
BPF_MAP_TYPE_CGROUP_ARRAY, BPF_MAP_TYPE_CGROUP_ARRAY,
BPF_MAP_TYPE_LRU_HASH,
BPF_MAP_TYPE_LRU_PERCPU_HASH,
}; };
enum bpf_prog_type { enum bpf_prog_type {
...@@ -106,6 +108,13 @@ enum bpf_prog_type { ...@@ -106,6 +108,13 @@ enum bpf_prog_type {
#define BPF_EXIST 2 /* update existing element */ #define BPF_EXIST 2 /* update existing element */
#define BPF_F_NO_PREALLOC (1U << 0) #define BPF_F_NO_PREALLOC (1U << 0)
/* Instead of having one common LRU list in the
* BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list
* which can scale and perform better.
* Note, the LRU nodes (including free nodes) cannot be moved
* across different LRU lists.
*/
#define BPF_F_NO_COMMON_LRU (1U << 1)
union bpf_attr { union bpf_attr {
struct { /* anonymous struct used by BPF_MAP_CREATE command */ struct { /* anonymous struct used by BPF_MAP_CREATE command */
......
obj-y := core.o obj-y := core.o
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o
ifeq ($(CONFIG_PERF_EVENTS),y) ifeq ($(CONFIG_PERF_EVENTS),y)
obj-$(CONFIG_BPF_SYSCALL) += stackmap.o obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
endif endif
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <linux/cpumask.h>
#include <linux/spinlock.h>
#include <linux/percpu.h>
#include "bpf_lru_list.h"
/* Number of free nodes bpf_lru_list_pop_free_to_local() tries to hand
 * over to a CPU's local free list in one refill.
 */
#define LOCAL_FREE_TARGET (128)
/* Value stored in lru->nr_scans when the LRU runs in common mode */
#define LOCAL_NR_SCANS LOCAL_FREE_TARGET
/* Shrink target bpf_percpu_lru_pop_free() passes to
 * __bpf_lru_list_shrink() when a percpu free list is empty.
 */
#define PERCPU_FREE_TARGET (16)
/* Value stored in lru->nr_scans when the LRU runs in percpu mode */
#define PERCPU_NR_SCANS PERCPU_FREE_TARGET
/* Helpers to get the local list index */
/* Convert a local list type into its index in bpf_lru_locallist::lists[] */
#define LOCAL_LIST_IDX(t) ((t) - BPF_LOCAL_LIST_T_OFFSET)
#define LOCAL_FREE_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_FREE)
#define LOCAL_PENDING_LIST_IDX LOCAL_LIST_IDX(BPF_LRU_LOCAL_LIST_T_PENDING)
/* True when @t is at or above BPF_LOCAL_LIST_T_OFFSET, i.e. a local list type */
#define IS_LOCAL_LIST_TYPE(t) ((t) >= BPF_LOCAL_LIST_T_OFFSET)
/* Return the possible CPU that follows @cpu in cpu_possible_mask,
 * wrapping around to the first possible CPU once the end is passed.
 */
static int get_next_cpu(int cpu)
{
	int next = cpumask_next(cpu, cpu_possible_mask);

	return next < nr_cpu_ids ? next : cpumask_first(cpu_possible_mask);
}
/* Local list helpers */
/* Return the head of the free list kept inside the local list @loc_l. */
static struct list_head *local_free_list(struct bpf_lru_locallist *loc_l)
{
	struct list_head *heads = loc_l->lists;

	return heads + LOCAL_FREE_LIST_IDX;
}
/* Return the head of the pending list kept inside the local list @loc_l. */
static struct list_head *local_pending_list(struct bpf_lru_locallist *loc_l)
{
	struct list_head *heads = loc_l->lists;

	return heads + LOCAL_PENDING_LIST_IDX;
}
/* bpf_lru_node helpers */
/* Tell whether the reference bit of @node is currently set. */
static bool bpf_lru_node_is_ref(const struct bpf_lru_node *node)
{
	return node->ref != 0;
}
/* Bump the per-type node counter of @l.  Only list types below
 * NR_BPF_LRU_LIST_COUNT are counted; other types are ignored.
 */
static void bpf_lru_list_count_inc(struct bpf_lru_list *l,
				   enum bpf_lru_list_type type)
{
	if (type >= NR_BPF_LRU_LIST_COUNT)
		return;

	l->counts[type] += 1;
}
/* Drop the per-type node counter of @l by one.  Only list types below
 * NR_BPF_LRU_LIST_COUNT are counted; other types are ignored.
 */
static void bpf_lru_list_count_dec(struct bpf_lru_list *l,
				   enum bpf_lru_list_type type)
{
	if (type >= NR_BPF_LRU_LIST_COUNT)
		return;

	l->counts[type] -= 1;
}
static void __bpf_lru_node_move_to_free(struct bpf_lru_list *l,
struct bpf_lru_node *node,
struct list_head *free_list,
enum bpf_lru_list_type tgt_free_type)
{
if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
return;
/* If the removing node is the next_inactive_rotation candidate,
* move the next_inactive_rotation pointer also.
*/
if (&node->list == l->next_inactive_rotation)
l->next_inactive_rotation = l->next_inactive_rotation->prev;
bpf_lru_list_count_dec(l, node->type);
node->type = tgt_free_type;
list_move(&node->list, free_list);
}
/* Bring a node that sits on a local list into the LRU list @l, at the
 * head of the @tgt_type list, with its ref bit cleared.  Warns once and
 * does nothing if @node is not on a local list or if @tgt_type is
 * itself a local list type.
 */
static void __bpf_lru_node_move_in(struct bpf_lru_list *l,
				   struct bpf_lru_node *node,
				   enum bpf_lru_list_type tgt_type)
{
	if (WARN_ON_ONCE(!IS_LOCAL_LIST_TYPE(node->type)))
		return;
	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
		return;

	bpf_lru_list_count_inc(l, tgt_type);

	node->type = tgt_type;
	node->ref = 0;
	list_move(&node->list, &l->lists[tgt_type]);
}
/* Move nodes between or within active and inactive list (like
* active to inactive, inactive to active or tail of active back to
* the head of active).
*/
static void __bpf_lru_node_move(struct bpf_lru_list *l,
struct bpf_lru_node *node,
enum bpf_lru_list_type tgt_type)
{
if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)) ||
WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(tgt_type)))
return;
if (node->type != tgt_type) {
bpf_lru_list_count_dec(l, node->type);
bpf_lru_list_count_inc(l, tgt_type);
node->type = tgt_type;
}
node->ref = 0;
/* If the moving node is the next_inactive_rotation candidate,
* move the next_inactive_rotation pointer also.
*/
if (&node->list == l->next_inactive_rotation)
l->next_inactive_rotation = l->next_inactive_rotation->prev;
list_move(&node->list, &l->lists[tgt_type]);
}
/* True when the inactive list of @l holds fewer nodes than its active
 * list.
 */
static bool bpf_lru_list_inactive_low(const struct bpf_lru_list *l)
{
	unsigned int nr_inactive = l->counts[BPF_LRU_LIST_T_INACTIVE];
	unsigned int nr_active = l->counts[BPF_LRU_LIST_T_ACTIVE];

	return nr_inactive < nr_active;
}
/* Rotate the active list of @l, walking it from the tail:
 * - a node with the ref bit set goes back to the head of the active
 *   list (its ref bit is cleared by the move), i.e. it gets one more
 *   chance to stay active;
 * - a node without the ref bit is demoted to the head of the inactive
 *   list.
 * The walk stops after lru->nr_scans nodes, or once the node that was
 * at the head of the active list on entry has been handled.
 */
static void __bpf_lru_list_rotate_active(struct bpf_lru *lru,
					 struct bpf_lru_list *l)
{
	struct list_head *active = &l->lists[BPF_LRU_LIST_T_ACTIVE];
	struct bpf_lru_node *cur, *nxt, *stop_at;
	unsigned int scanned = 0;

	stop_at = list_first_entry(active, struct bpf_lru_node, list);

	list_for_each_entry_safe_reverse(cur, nxt, active, list) {
		enum bpf_lru_list_type dst;

		dst = bpf_lru_node_is_ref(cur) ? BPF_LRU_LIST_T_ACTIVE :
						 BPF_LRU_LIST_T_INACTIVE;
		__bpf_lru_node_move(l, cur, dst);

		scanned++;
		if (scanned == lru->nr_scans)
			break;
		if (cur == stop_at)
			break;
	}
}
/* Rotate the inactive list.  It starts from the next_inactive_rotation
 * 1. If the node has ref bit set, it will be moved to the head
 *    of active list with the ref bit cleared.
 * 2. If the node does not have ref bit set, it will leave it
 *    at its current location (i.e. do nothing) so that it can
 *    be considered during the next inactive_shrink.
 * 3. It will at most scan nr_scans nodes
 *
 * On return l->next_inactive_rotation points at the entry the next
 * rotation should start from.
 */
static void __bpf_lru_list_rotate_inactive(struct bpf_lru *lru,
					   struct bpf_lru_list *l)
{
	struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
	/* @next is stored into l->next_inactive_rotation unconditionally
	 * below.  Initialise it so that it is never read uninitialised
	 * when the scan loop does not execute (lru->nr_scans == 0); the
	 * list head is the same value bpf_lru_list_init() starts with.
	 */
	struct list_head *cur, *last, *next = inactive;
	struct bpf_lru_node *node;
	unsigned int i = 0;

	if (list_empty(inactive))
		return;

	/* @last marks the end of one full lap: the entry right after the
	 * starting point, skipping over the list head itself.
	 */
	last = l->next_inactive_rotation->next;
	if (last == inactive)
		last = last->next;

	cur = l->next_inactive_rotation;
	while (i < lru->nr_scans) {
		/* The list head is not a node: step over it */
		if (cur == inactive) {
			cur = cur->prev;
			continue;
		}

		node = list_entry(cur, struct bpf_lru_node, list);
		/* Remember the predecessor before @node may be moved away */
		next = cur->prev;
		if (bpf_lru_node_is_ref(node))
			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
		if (cur == last)
			break;
		cur = next;
		i++;
	}

	l->next_inactive_rotation = next;
}
/* Shrink the inactive list.  It starts from the tail of the
 * inactive list and only move the nodes without the ref bit
 * set to the designated free list.
 *
 * Nodes that have the ref bit set are promoted to the active list
 * instead.  A node without the ref bit is only freed if
 * lru->del_from_htab() reports that it was removed from the hash table.
 *
 * At most lru->nr_scans nodes are examined and at most @tgt_nshrink
 * nodes are freed.  Returns the number of nodes moved to @free_list.
 */
static unsigned int
__bpf_lru_list_shrink_inactive(struct bpf_lru *lru,
			       struct bpf_lru_list *l,
			       unsigned int tgt_nshrink,
			       struct list_head *free_list,
			       enum bpf_lru_list_type tgt_free_type)
{
	struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
	struct bpf_lru_node *node, *tmp_node;
	unsigned int nshrinked = 0;
	unsigned int i = 0;

	list_for_each_entry_safe_reverse(node, tmp_node, inactive, list) {
		if (bpf_lru_node_is_ref(node)) {
			__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_ACTIVE);
		} else if (lru->del_from_htab(lru->del_arg, node)) {
			__bpf_lru_node_move_to_free(l, node, free_list,
						    tgt_free_type);
			if (++nshrinked == tgt_nshrink)
				break;
		}

		if (++i == lru->nr_scans)
			break;
	}

	return nshrinked;
}
/* 1. Rotate the active list (if needed)
* 2. Always rotate the inactive list
*/
static void __bpf_lru_list_rotate(struct bpf_lru *lru, struct bpf_lru_list *l)
{
if (bpf_lru_list_inactive_low(l))
__bpf_lru_list_rotate_active(lru, l);
__bpf_lru_list_rotate_inactive(lru, l);
}
/* Try to free up to @tgt_nshrink nodes of @l into @free_list.
 *
 * First ask __bpf_lru_list_shrink_inactive() to free nodes whose ref
 * bit is clear.  If that frees nothing, fall back to a forced shrink
 * that ignores the ref bit: walk the inactive list (or the active list
 * when the inactive one is empty) from the tail and free the first
 * node that can be deleted from the hash table.
 *
 * Returns the number of nodes moved to @free_list (0 if none).
 */
static unsigned int __bpf_lru_list_shrink(struct bpf_lru *lru,
					  struct bpf_lru_list *l,
					  unsigned int tgt_nshrink,
					  struct list_head *free_list,
					  enum bpf_lru_list_type tgt_free_type)
{
	struct list_head *inactive = &l->lists[BPF_LRU_LIST_T_INACTIVE];
	struct bpf_lru_node *cand, *next_cand;
	struct list_head *victims;
	unsigned int freed;

	freed = __bpf_lru_list_shrink_inactive(lru, l, tgt_nshrink,
					       free_list, tgt_free_type);
	if (freed)
		return freed;

	/* Forced shrink: the ref bit is not honoured any more */
	victims = list_empty(inactive) ? &l->lists[BPF_LRU_LIST_T_ACTIVE] :
					 inactive;

	list_for_each_entry_safe_reverse(cand, next_cand, victims, list) {
		if (!lru->del_from_htab(lru->del_arg, cand))
			continue;

		__bpf_lru_node_move_to_free(l, cand, free_list,
					    tgt_free_type);
		return 1;
	}

	return 0;
}
/* Drain the pending list of @loc_l into the LRU list @l, oldest entry
 * first.  A node whose ref bit is set goes to the active list, any
 * other node goes to the inactive list.
 */
static void __local_list_flush(struct bpf_lru_list *l,
			       struct bpf_lru_locallist *loc_l)
{
	struct bpf_lru_node *cur, *nxt;

	list_for_each_entry_safe_reverse(cur, nxt,
					 local_pending_list(loc_l), list) {
		enum bpf_lru_list_type dst;

		dst = bpf_lru_node_is_ref(cur) ? BPF_LRU_LIST_T_ACTIVE :
						 BPF_LRU_LIST_T_INACTIVE;
		__bpf_lru_node_move_in(l, cur, dst);
	}
}
/* Return @node to the free list of @l, taking l->lock with interrupts
 * disabled.  A node tagged with a local list type is refused with a
 * one-time warning.
 */
static void bpf_lru_list_push_free(struct bpf_lru_list *l,
				   struct bpf_lru_node *node)
{
	unsigned long irq_flags;

	if (WARN_ON_ONCE(IS_LOCAL_LIST_TYPE(node->type)))
		return;

	raw_spin_lock_irqsave(&l->lock, irq_flags);
	__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_FREE);
	raw_spin_unlock_irqrestore(&l->lock, irq_flags);
}
/* Refill the local free list of @loc_l from the common LRU list.
 *
 * Under the common list's lock: flush @loc_l's pending nodes into the
 * LRU list, rotate the LRU list, then move up to LOCAL_FREE_TARGET
 * nodes from the common free list to the local free list.  If fewer
 * than LOCAL_FREE_TARGET nodes were available, shrink the LRU list to
 * try to make up the difference.
 */
static void bpf_lru_list_pop_free_to_local(struct bpf_lru *lru,
					   struct bpf_lru_locallist *loc_l)
{
	struct bpf_lru_list *l = &lru->common_lru.lru_list;
	struct list_head *dst = local_free_list(loc_l);
	struct bpf_lru_node *cur, *nxt;
	unsigned int moved = 0;

	raw_spin_lock(&l->lock);

	__local_list_flush(l, loc_l);
	__bpf_lru_list_rotate(lru, l);

	list_for_each_entry_safe(cur, nxt, &l->lists[BPF_LRU_LIST_T_FREE],
				 list) {
		__bpf_lru_node_move_to_free(l, cur, dst,
					    BPF_LRU_LOCAL_LIST_T_FREE);
		moved++;
		if (moved == LOCAL_FREE_TARGET)
			break;
	}

	if (moved < LOCAL_FREE_TARGET)
		__bpf_lru_list_shrink(lru, l, LOCAL_FREE_TARGET - moved,
				      dst, BPF_LRU_LOCAL_LIST_T_FREE);

	raw_spin_unlock(&l->lock);
}
/* Hand a node out: record @hash in the element that embeds @node (at
 * lru->hash_offset bytes from the node), tag the node as pending on
 * @cpu with a cleared ref bit, and add it to the head of @loc_l's
 * pending list.
 */
static void __local_list_add_pending(struct bpf_lru *lru,
				     struct bpf_lru_locallist *loc_l,
				     int cpu,
				     struct bpf_lru_node *node,
				     u32 hash)
{
	u32 *hash_slot = (u32 *)((void *)node + lru->hash_offset);

	*hash_slot = hash;

	node->cpu = cpu;
	node->type = BPF_LRU_LOCAL_LIST_T_PENDING;
	node->ref = 0;

	list_add(&node->list, local_pending_list(loc_l));
}
/* Unlink and return the first node of @loc_l's free list, or NULL when
 * that list is empty.
 *
 * File-local helper: it has no prototype in bpf_lru_list.h, so it is
 * given internal linkage.
 */
static struct bpf_lru_node *
__local_list_pop_free(struct bpf_lru_locallist *loc_l)
{
	struct bpf_lru_node *node;

	node = list_first_entry_or_null(local_free_list(loc_l),
					struct bpf_lru_node,
					list);
	if (node)
		list_del(&node->list);

	return node;
}
/* Reclaim one node from @loc_l's pending list.
 *
 * First pass: walk from the tail and take the first node whose ref bit
 * is clear and that lru->del_from_htab() manages to remove from the
 * hash table.  If no such node exists, a second pass ignores the ref
 * bit.  The chosen node is unlinked from the pending list and returned;
 * NULL is returned when nothing could be reclaimed.
 *
 * File-local helper: it has no prototype in bpf_lru_list.h, so it is
 * given internal linkage.
 */
static struct bpf_lru_node *
__local_list_pop_pending(struct bpf_lru *lru,
			 struct bpf_lru_locallist *loc_l)
{
	struct bpf_lru_node *node;
	bool force = false;

ignore_ref:
	/* Get from the tail (i.e. older element) of the pending list. */
	list_for_each_entry_reverse(node, local_pending_list(loc_l),
				    list) {
		if ((!bpf_lru_node_is_ref(node) || force) &&
		    lru->del_from_htab(lru->del_arg, node)) {
			list_del(&node->list);
			return node;
		}
	}

	if (!force) {
		force = true;
		goto ignore_ref;
	}

	return NULL;
}
/* Percpu-mode allocation: get a free node from the LRU list of the CPU
 * this runs on.
 *
 * Under that list's lock the list is rotated; if its free list is
 * empty, a shrink of up to PERCPU_FREE_TARGET nodes is attempted.  The
 * first free node (if any) gets @hash recorded in its element, has its
 * ref bit cleared and is moved to the inactive list.  Returns the node,
 * or NULL when no free node could be obtained.
 */
static struct bpf_lru_node *bpf_percpu_lru_pop_free(struct bpf_lru *lru,
						    u32 hash)
{
	int cpu = raw_smp_processor_id();
	struct bpf_lru_list *l = per_cpu_ptr(lru->percpu_lru, cpu);
	struct list_head *free_list = &l->lists[BPF_LRU_LIST_T_FREE];
	struct bpf_lru_node *node;
	unsigned long irq_flags;

	raw_spin_lock_irqsave(&l->lock, irq_flags);

	__bpf_lru_list_rotate(lru, l);

	if (list_empty(free_list))
		__bpf_lru_list_shrink(lru, l, PERCPU_FREE_TARGET, free_list,
				      BPF_LRU_LIST_T_FREE);

	node = list_first_entry_or_null(free_list, struct bpf_lru_node, list);
	if (node) {
		*(u32 *)((void *)node + lru->hash_offset) = hash;
		node->ref = 0;
		__bpf_lru_node_move(l, node, BPF_LRU_LIST_T_INACTIVE);
	}

	raw_spin_unlock_irqrestore(&l->lock, irq_flags);

	return node;
}
/* Common-mode allocation of a free node for an element hashing to
 * @hash.
 *
 * 1. Try the local free list of the current CPU; if it is empty,
 *    refill it from the common LRU list and try again.
 * 2. If that still yields nothing, steal: visit the local lists of the
 *    possible CPUs in round-robin order, starting at
 *    loc_l->next_steal, taking a free node or, failing that, a
 *    reclaimable pending node.
 *
 * A node that is found is put on the current CPU's pending list before
 * being returned.  Returns NULL when no node could be obtained.
 */
static struct bpf_lru_node *bpf_common_lru_pop_free(struct bpf_lru *lru,
						    u32 hash)
{
	struct bpf_common_lru *clru = &lru->common_lru;
	int cpu = raw_smp_processor_id();
	struct bpf_lru_locallist *loc_l = per_cpu_ptr(clru->local_list, cpu);
	struct bpf_lru_locallist *victim;
	struct bpf_lru_node *node;
	int start_cpu, steal_cpu;
	unsigned long irq_flags;

	raw_spin_lock_irqsave(&loc_l->lock, irq_flags);

	node = __local_list_pop_free(loc_l);
	if (!node) {
		bpf_lru_list_pop_free_to_local(lru, loc_l);
		node = __local_list_pop_free(loc_l);
	}
	if (node)
		__local_list_add_pending(lru, loc_l, cpu, node, hash);

	raw_spin_unlock_irqrestore(&loc_l->lock, irq_flags);

	if (node)
		return node;

	/* Neither the local free list nor the common LRU list could
	 * provide a node: go round the CPUs and steal one.
	 */
	start_cpu = loc_l->next_steal;
	steal_cpu = start_cpu;
	for (;;) {
		victim = per_cpu_ptr(clru->local_list, steal_cpu);

		raw_spin_lock_irqsave(&victim->lock, irq_flags);
		node = __local_list_pop_free(victim);
		if (!node)
			node = __local_list_pop_pending(lru, victim);
		raw_spin_unlock_irqrestore(&victim->lock, irq_flags);

		steal_cpu = get_next_cpu(steal_cpu);
		if (node || steal_cpu == start_cpu)
			break;
	}

	/* Next steal attempt resumes after the last CPU visited */
	loc_l->next_steal = steal_cpu;

	if (!node)
		return NULL;

	raw_spin_lock_irqsave(&loc_l->lock, irq_flags);
	__local_list_add_pending(lru, loc_l, cpu, node, hash);
	raw_spin_unlock_irqrestore(&loc_l->lock, irq_flags);

	return node;
}
/* Get a free node for an element whose hash is @hash, dispatching to
 * the percpu or the common implementation according to lru->percpu.
 * Returns NULL when no node could be obtained.
 */
struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash)
{
	if (!lru->percpu)
		return bpf_common_lru_pop_free(lru, hash);

	return bpf_percpu_lru_pop_free(lru, hash);
}
/* Common-mode release of @node.
 *
 * A node that is already tagged free (common or local) triggers a
 * one-time warning and is left alone.  A node still pending on a local
 * list is moved to that local list's free list, under the local lock of
 * the CPU recorded in node->cpu.  The type is re-checked once the lock
 * is held; if it is no longer pending, or if the node was not pending
 * to begin with, it is pushed to the free list of the common LRU list.
 */
static void bpf_common_lru_push_free(struct bpf_lru *lru,
				     struct bpf_lru_node *node)
{
	struct bpf_lru_locallist *loc_l;
	unsigned long irq_flags;

	if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE))
		return;
	if (WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE))
		return;

	if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) {
		loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu);

		raw_spin_lock_irqsave(&loc_l->lock, irq_flags);
		if (likely(node->type == BPF_LRU_LOCAL_LIST_T_PENDING)) {
			node->type = BPF_LRU_LOCAL_LIST_T_FREE;
			node->ref = 0;
			list_move(&node->list, local_free_list(loc_l));
			raw_spin_unlock_irqrestore(&loc_l->lock, irq_flags);
			return;
		}
		raw_spin_unlock_irqrestore(&loc_l->lock, irq_flags);
	}

	bpf_lru_list_push_free(&lru->common_lru.lru_list, node);
}
/* Percpu-mode release of @node: move it to the free list of the LRU
 * list that belongs to the CPU recorded in node->cpu, under that
 * list's lock.
 */
static void bpf_percpu_lru_push_free(struct bpf_lru *lru,
				     struct bpf_lru_node *node)
{
	struct bpf_lru_list *owner = per_cpu_ptr(lru->percpu_lru, node->cpu);
	unsigned long irq_flags;

	raw_spin_lock_irqsave(&owner->lock, irq_flags);
	__bpf_lru_node_move(owner, node, BPF_LRU_LIST_T_FREE);
	raw_spin_unlock_irqrestore(&owner->lock, irq_flags);
}
/* Give @node back to @lru, dispatching to the percpu or the common
 * implementation according to lru->percpu.
 */
void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node)
{
	if (!lru->percpu) {
		bpf_common_lru_push_free(lru, node);
		return;
	}

	bpf_percpu_lru_push_free(lru, node);
}
/* Common-mode population: put the @nr_elems nodes embedded in the
 * element array @buf on the free list of the common LRU list.
 *
 * @buf:         start of the element array
 * @node_offset: byte offset of the struct bpf_lru_node inside an element
 * @elem_size:   size in bytes of one element
 * @nr_elems:    number of elements in @buf
 *
 * No lock is taken here.  The function is only reached through
 * bpf_lru_populate() and has no prototype in bpf_lru_list.h, hence the
 * internal linkage.
 */
static void bpf_common_lru_populate(struct bpf_lru *lru, void *buf,
				    u32 node_offset, u32 elem_size,
				    u32 nr_elems)
{
	struct bpf_lru_list *l = &lru->common_lru.lru_list;
	u32 i;

	for (i = 0; i < nr_elems; i++) {
		struct bpf_lru_node *node;

		node = (struct bpf_lru_node *)(buf + node_offset);
		node->type = BPF_LRU_LIST_T_FREE;
		node->ref = 0;
		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
		buf += elem_size;
	}
}
/* Percpu-mode population: hand out the nodes embedded in the element
 * array @buf to the per-CPU LRU lists, nr_elems / num_possible_cpus()
 * consecutive elements per possible CPU, each placed on that CPU's
 * free list and tagged with the CPU number.
 *
 * @buf:         start of the element array
 * @node_offset: byte offset of the struct bpf_lru_node inside an element
 * @elem_size:   size in bytes of one element
 * @nr_elems:    number of elements in @buf
 *
 * NOTE(review): @nr_elems must be at least num_possible_cpus(),
 * otherwise pcpu_entries is 0 and the modulo below divides by zero.
 * The rounding of max_entries in htab_map_alloc() appears to be
 * intended to guarantee this -- confirm for any new caller.
 *
 * No lock is taken here.  The function is only reached through
 * bpf_lru_populate() and has no prototype in bpf_lru_list.h, hence the
 * internal linkage.
 */
static void bpf_percpu_lru_populate(struct bpf_lru *lru, void *buf,
				    u32 node_offset, u32 elem_size,
				    u32 nr_elems)
{
	u32 i, pcpu_entries;
	int cpu;
	struct bpf_lru_list *l;

	pcpu_entries = nr_elems / num_possible_cpus();

	i = 0;

	for_each_possible_cpu(cpu) {
		struct bpf_lru_node *node;

		l = per_cpu_ptr(lru->percpu_lru, cpu);
again:
		node = (struct bpf_lru_node *)(buf + node_offset);
		node->cpu = cpu;
		node->type = BPF_LRU_LIST_T_FREE;
		node->ref = 0;
		list_add(&node->list, &l->lists[BPF_LRU_LIST_T_FREE]);
		i++;
		buf += elem_size;
		/* All elements handed out: stop */
		if (i == nr_elems)
			break;
		/* This CPU has not received its pcpu_entries share yet */
		if (i % pcpu_entries)
			goto again;
	}
}
/* Seed @lru with the @nr_elems nodes embedded in the element array
 * @buf (each element is @elem_size bytes, with its node located
 * @node_offset bytes in), using the percpu or the common variant
 * according to lru->percpu.
 */
void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
		      u32 elem_size, u32 nr_elems)
{
	if (!lru->percpu) {
		bpf_common_lru_populate(lru, buf, node_offset, elem_size,
					nr_elems);
		return;
	}

	bpf_percpu_lru_populate(lru, buf, node_offset, elem_size, nr_elems);
}
/* Set up the local list @loc_l of CPU @cpu: all list heads empty, the
 * steal cursor pointing at @cpu itself, and the lock initialised.
 */
static void bpf_lru_locallist_init(struct bpf_lru_locallist *loc_l, int cpu)
{
	int idx = 0;

	while (idx < NR_BPF_LRU_LOCAL_LIST_T) {
		INIT_LIST_HEAD(&loc_l->lists[idx]);
		idx++;
	}

	loc_l->next_steal = cpu;

	raw_spin_lock_init(&loc_l->lock);
}
/* Set up the LRU list @l: all list heads empty, all counters zero, the
 * inactive rotation cursor at the head of the inactive list, and the
 * lock initialised.
 */
static void bpf_lru_list_init(struct bpf_lru_list *l)
{
	int idx;

	for (idx = 0; idx < NR_BPF_LRU_LIST_T; idx++)
		INIT_LIST_HEAD(&l->lists[idx]);

	for (idx = 0; idx < NR_BPF_LRU_LIST_COUNT; idx++)
		l->counts[idx] = 0;

	l->next_inactive_rotation = &l->lists[BPF_LRU_LIST_T_INACTIVE];

	raw_spin_lock_init(&l->lock);
}
/* Initialise @lru.
 *
 * @percpu:        true for one LRU list per CPU, false for one common
 *                 LRU list plus a local list per CPU
 * @hash_offset:   byte offset from a node to the u32 where the hash of
 *                 its element is stored
 * @del_from_htab: callback used to remove a node's element from the
 *                 hash table when the node is reclaimed
 * @del_arg:       opaque first argument for @del_from_htab
 *
 * Returns 0 on success or -ENOMEM if the per-CPU allocation fails (in
 * which case the remaining fields of @lru are not filled in).
 */
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
		 del_from_htab_func del_from_htab, void *del_arg)
{
	int cpu;

	if (!percpu) {
		struct bpf_common_lru *clru = &lru->common_lru;

		clru->local_list = alloc_percpu(struct bpf_lru_locallist);
		if (!clru->local_list)
			return -ENOMEM;

		for_each_possible_cpu(cpu)
			bpf_lru_locallist_init(per_cpu_ptr(clru->local_list,
							   cpu), cpu);

		bpf_lru_list_init(&clru->lru_list);
		lru->nr_scans = LOCAL_NR_SCANS;
	} else {
		lru->percpu_lru = alloc_percpu(struct bpf_lru_list);
		if (!lru->percpu_lru)
			return -ENOMEM;

		for_each_possible_cpu(cpu)
			bpf_lru_list_init(per_cpu_ptr(lru->percpu_lru, cpu));

		lru->nr_scans = PERCPU_NR_SCANS;
	}

	lru->percpu = percpu;
	lru->del_from_htab = del_from_htab;
	lru->del_arg = del_arg;
	lru->hash_offset = hash_offset;

	return 0;
}
/* Release the per-CPU memory allocated by bpf_lru_init(): the per-CPU
 * LRU lists in percpu mode, the per-CPU local lists otherwise.
 */
void bpf_lru_destroy(struct bpf_lru *lru)
{
	if (!lru->percpu) {
		free_percpu(lru->common_lru.local_list);
		return;
	}

	free_percpu(lru->percpu_lru);
}
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#ifndef __BPF_LRU_LIST_H_
#define __BPF_LRU_LIST_H_
#include <linux/list.h>
#include <linux/spinlock_types.h>
/* Number of lists in struct bpf_lru_list (sizes its lists[] array) */
#define NR_BPF_LRU_LIST_T (3)
/* Number of per-type node counters in struct bpf_lru_list (sizes counts[]) */
#define NR_BPF_LRU_LIST_COUNT (2)
/* Number of lists in struct bpf_lru_locallist (sizes its lists[] array) */
#define NR_BPF_LRU_LOCAL_LIST_T (2)
/* First enum bpf_lru_list_type value that denotes a local list */
#define BPF_LOCAL_LIST_T_OFFSET NR_BPF_LRU_LIST_T
/* Which list a node is currently on.  The first NR_BPF_LRU_LIST_T
 * values index struct bpf_lru_list::lists[]; the values from
 * BPF_LOCAL_LIST_T_OFFSET onwards denote the lists of
 * struct bpf_lru_locallist.
 */
enum bpf_lru_list_type {
BPF_LRU_LIST_T_ACTIVE,
BPF_LRU_LIST_T_INACTIVE,
BPF_LRU_LIST_T_FREE,
BPF_LRU_LOCAL_LIST_T_FREE,
BPF_LRU_LOCAL_LIST_T_PENDING,
};
/* LRU bookkeeping embedded in each element managed by a struct bpf_lru */
struct bpf_lru_node {
/* Links the node into exactly one LRU or local list */
struct list_head list;
/* CPU whose local list (common mode) or LRU list (percpu mode) the node
 * was assigned to; used to find that list again when the node is freed
 */
u16 cpu;
/* Current list, stored as an enum bpf_lru_list_type value */
u8 type;
/* Reference bit: set by bpf_lru_node_set_ref(), cleared when the node
 * is moved between lists
 */
u8 ref;
};
/* One LRU list: active, inactive and free lists protected by one lock */
struct bpf_lru_list {
/* List heads, indexed by BPF_LRU_LIST_T_ACTIVE/INACTIVE/FREE */
struct list_head lists[NR_BPF_LRU_LIST_T];
/* Node counts for the list types below NR_BPF_LRU_LIST_COUNT */
unsigned int counts[NR_BPF_LRU_LIST_COUNT];
/* The next inactive list rotation starts from here */
struct list_head *next_inactive_rotation;
/* Protects the lists, the counters and the rotation cursor */
raw_spinlock_t lock ____cacheline_aligned_in_smp;
};
/* Per-CPU local lists used in common mode */
struct bpf_lru_locallist {
/* Local free and pending lists; index with LOCAL_LIST_IDX() */
struct list_head lists[NR_BPF_LRU_LOCAL_LIST_T];
/* CPU at which the next steal attempt starts its round-robin walk */
u16 next_steal;
/* Protects the local lists */
raw_spinlock_t lock;
};
/* State of an LRU in common mode: one shared LRU list plus a local
 * list per CPU.
 */
struct bpf_common_lru {
/* The LRU list shared by all CPUs */
struct bpf_lru_list lru_list;
/* Per-CPU local free/pending lists, allocated by bpf_lru_init() */
struct bpf_lru_locallist __percpu *local_list;
};
/* Callback invoked when the LRU wants to reclaim @node: it is given the
 * del_arg registered with bpf_lru_init().  The LRU only frees the node
 * when the callback returns true.
 */
typedef bool (*del_from_htab_func)(void *arg, struct bpf_lru_node *node);
/* An LRU instance, operating either in common or in percpu mode */
struct bpf_lru {
/* Mode-specific state; @percpu below selects the valid member */
union {
struct bpf_common_lru common_lru;
struct bpf_lru_list __percpu *percpu_lru;
};
/* Callback (and its argument) used to remove an element from the hash
 * table before its node is reclaimed
 */
del_from_htab_func del_from_htab;
void *del_arg;
/* Byte offset from a node to the u32 holding its element's hash */
unsigned int hash_offset;
/* Upper bound on the nodes examined by one rotate/shrink pass */
unsigned int nr_scans;
/* True: one LRU list per CPU.  False: common LRU list + local lists */
bool percpu;
};
/* Mark @node as recently referenced. */
static inline void bpf_lru_node_set_ref(struct bpf_lru_node *node)
{
	/* ref is an approximation on access frequency.  It does not
	 * have to be very accurate.  Hence, no protection is used.
	 *
	 * Only store when the bit is not set yet: this is called on
	 * every lookup, and an unconditional store would dirty the
	 * node's cache line even when nothing changes.
	 */
	if (!node->ref)
		node->ref = 1;
}
/* Initialise @lru in percpu or common mode; returns 0 or -ENOMEM */
int bpf_lru_init(struct bpf_lru *lru, bool percpu, u32 hash_offset,
del_from_htab_func del_from_htab, void *delete_arg);
/* Put the @nr_elems nodes embedded in the element array @buf on the
 * free list(s) of @lru
 */
void bpf_lru_populate(struct bpf_lru *lru, void *buf, u32 node_offset,
u32 elem_size, u32 nr_elems);
/* Free the per-CPU memory allocated by bpf_lru_init() */
void bpf_lru_destroy(struct bpf_lru *lru);
/* Get a free node for an element hashing to @hash; NULL if none */
struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash);
/* Return @node to the free list it belongs to */
void bpf_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node);
/* NOTE(review): no definition of bpf_lru_promote() is visible in
 * bpf_lru_list.c -- confirm whether this declaration is still needed.
 */
void bpf_lru_promote(struct bpf_lru *lru, struct bpf_lru_node *node);
#endif
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#include <linux/filter.h> #include <linux/filter.h>
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include "percpu_freelist.h" #include "percpu_freelist.h"
#include "bpf_lru_list.h"
struct bucket { struct bucket {
struct hlist_head head; struct hlist_head head;
...@@ -25,7 +26,10 @@ struct bpf_htab { ...@@ -25,7 +26,10 @@ struct bpf_htab {
struct bpf_map map; struct bpf_map map;
struct bucket *buckets; struct bucket *buckets;
void *elems; void *elems;
struct pcpu_freelist freelist; union {
struct pcpu_freelist freelist;
struct bpf_lru lru;
};
void __percpu *extra_elems; void __percpu *extra_elems;
atomic_t count; /* number of elements in this hashtable */ atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */ u32 n_buckets; /* number of hash buckets */
...@@ -48,11 +52,26 @@ struct htab_elem { ...@@ -48,11 +52,26 @@ struct htab_elem {
union { union {
struct rcu_head rcu; struct rcu_head rcu;
enum extra_elem_state state; enum extra_elem_state state;
struct bpf_lru_node lru_node;
}; };
u32 hash; u32 hash;
char key[0] __aligned(8); char key[0] __aligned(8);
}; };
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);
static bool htab_is_lru(const struct bpf_htab *htab)
{
return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}
static bool htab_is_percpu(const struct bpf_htab *htab)
{
return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
void __percpu *pptr) void __percpu *pptr)
{ {
...@@ -73,7 +92,7 @@ static void htab_free_elems(struct bpf_htab *htab) ...@@ -73,7 +92,7 @@ static void htab_free_elems(struct bpf_htab *htab)
{ {
int i; int i;
if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) if (!htab_is_percpu(htab))
goto free_elems; goto free_elems;
for (i = 0; i < htab->map.max_entries; i++) { for (i = 0; i < htab->map.max_entries; i++) {
...@@ -87,7 +106,22 @@ static void htab_free_elems(struct bpf_htab *htab) ...@@ -87,7 +106,22 @@ static void htab_free_elems(struct bpf_htab *htab)
vfree(htab->elems); vfree(htab->elems);
} }
static int prealloc_elems_and_freelist(struct bpf_htab *htab) static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
u32 hash)
{
struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
struct htab_elem *l;
if (node) {
l = container_of(node, struct htab_elem, lru_node);
memcpy(l->key, key, htab->map.key_size);
return l;
}
return NULL;
}
static int prealloc_init(struct bpf_htab *htab)
{ {
int err = -ENOMEM, i; int err = -ENOMEM, i;
...@@ -95,7 +129,7 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab) ...@@ -95,7 +129,7 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab)
if (!htab->elems) if (!htab->elems)
return -ENOMEM; return -ENOMEM;
if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) if (!htab_is_percpu(htab))
goto skip_percpu_elems; goto skip_percpu_elems;
for (i = 0; i < htab->map.max_entries; i++) { for (i = 0; i < htab->map.max_entries; i++) {
...@@ -110,12 +144,27 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab) ...@@ -110,12 +144,27 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab)
} }
skip_percpu_elems: skip_percpu_elems:
err = pcpu_freelist_init(&htab->freelist); if (htab_is_lru(htab))
err = bpf_lru_init(&htab->lru,
htab->map.map_flags & BPF_F_NO_COMMON_LRU,
offsetof(struct htab_elem, hash) -
offsetof(struct htab_elem, lru_node),
htab_lru_map_delete_node,
htab);
else
err = pcpu_freelist_init(&htab->freelist);
if (err) if (err)
goto free_elems; goto free_elems;
pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, if (htab_is_lru(htab))
htab->map.max_entries); bpf_lru_populate(&htab->lru, htab->elems,
offsetof(struct htab_elem, lru_node),
htab->elem_size, htab->map.max_entries);
else
pcpu_freelist_populate(&htab->freelist, htab->elems,
htab->elem_size, htab->map.max_entries);
return 0; return 0;
free_elems: free_elems:
...@@ -123,6 +172,16 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab) ...@@ -123,6 +172,16 @@ static int prealloc_elems_and_freelist(struct bpf_htab *htab)
return err; return err;
} }
/* Tear down what was set up for a preallocated map: free the elements,
 * then destroy the LRU (LRU maps) or the per-CPU freelist (other maps).
 */
static void prealloc_destroy(struct bpf_htab *htab)
{
	htab_free_elems(htab);

	if (!htab_is_lru(htab)) {
		pcpu_freelist_destroy(&htab->freelist);
		return;
	}

	bpf_lru_destroy(&htab->lru);
}
static int alloc_extra_elems(struct bpf_htab *htab) static int alloc_extra_elems(struct bpf_htab *htab)
{ {
void __percpu *pptr; void __percpu *pptr;
...@@ -143,15 +202,37 @@ static int alloc_extra_elems(struct bpf_htab *htab) ...@@ -143,15 +202,37 @@ static int alloc_extra_elems(struct bpf_htab *htab)
/* Called from syscall */ /* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr) static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{ {
bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
/* percpu_lru means each cpu has its own LRU list.
* it is different from BPF_MAP_TYPE_PERCPU_HASH where
* the map's value itself is percpu. percpu_lru has
* nothing to do with the map's value.
*/
bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
struct bpf_htab *htab; struct bpf_htab *htab;
int err, i; int err, i;
u64 cost; u64 cost;
if (attr->map_flags & ~BPF_F_NO_PREALLOC) if (lru && !capable(CAP_SYS_ADMIN))
/* LRU implementation is much complicated than other
* maps. Hence, limit to CAP_SYS_ADMIN for now.
*/
return ERR_PTR(-EPERM);
if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
/* reserved bits should not be used */ /* reserved bits should not be used */
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
if (!lru && percpu_lru)
return ERR_PTR(-EINVAL);
if (lru && !prealloc)
return ERR_PTR(-ENOTSUPP);
htab = kzalloc(sizeof(*htab), GFP_USER); htab = kzalloc(sizeof(*htab), GFP_USER);
if (!htab) if (!htab)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
...@@ -171,6 +252,18 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) ...@@ -171,6 +252,18 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
htab->map.value_size == 0) htab->map.value_size == 0)
goto free_htab; goto free_htab;
if (percpu_lru) {
/* ensure each CPU's lru list has >=1 elements.
* since we are at it, make each lru list has the same
* number of elements.
*/
htab->map.max_entries = roundup(attr->max_entries,
num_possible_cpus());
if (htab->map.max_entries < attr->max_entries)
htab->map.max_entries = rounddown(attr->max_entries,
num_possible_cpus());
}
/* hash table size must be power of 2 */ /* hash table size must be power of 2 */
htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);
...@@ -241,14 +334,17 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) ...@@ -241,14 +334,17 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
raw_spin_lock_init(&htab->buckets[i].lock); raw_spin_lock_init(&htab->buckets[i].lock);
} }
if (!percpu) { if (!percpu && !lru) {
/* lru itself can remove the least used element, so
* there is no need for an extra elem during map_update.
*/
err = alloc_extra_elems(htab); err = alloc_extra_elems(htab);
if (err) if (err)
goto free_buckets; goto free_buckets;
} }
if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { if (prealloc) {
err = prealloc_elems_and_freelist(htab); err = prealloc_init(htab);
if (err) if (err)
goto free_extra_elems; goto free_extra_elems;
} }
...@@ -323,6 +419,46 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) ...@@ -323,6 +419,46 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
return NULL; return NULL;
} }
/* Lookup for LRU maps: on a hit, set the ref bit of the element's LRU
 * node and return a pointer to the value, which is stored after the
 * key rounded up to 8 bytes.  Returns NULL on a miss.
 */
static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *elem = __htab_map_lookup_elem(map, key);

	if (!elem)
		return NULL;

	bpf_lru_node_set_ref(&elem->lru_node);

	return elem->key + round_up(map->key_size, 8);
}
/* It is called from the bpf_lru_list when the LRU needs to delete
* older elements from the htab.
*/
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
struct bpf_htab *htab = (struct bpf_htab *)arg;
struct htab_elem *l, *tgt_l;
struct hlist_head *head;
unsigned long flags;
struct bucket *b;
tgt_l = container_of(node, struct htab_elem, lru_node);
b = __select_bucket(htab, tgt_l->hash);
head = &b->head;
raw_spin_lock_irqsave(&b->lock, flags);
hlist_for_each_entry_rcu(l, head, hash_node)
if (l == tgt_l) {
hlist_del_rcu(&l->hash_node);
break;
}
raw_spin_unlock_irqrestore(&b->lock, flags);
return l == tgt_l;
}
/* Called from syscall */ /* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{ {
...@@ -420,6 +556,24 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) ...@@ -420,6 +556,24 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
} }
} }
/* Copy @value into the per-CPU area @pptr.
 *
 * With @onallcpus false, only the current CPU's slot is written, with
 * exactly map.value_size bytes.  With @onallcpus true, @value is read
 * as consecutive chunks of value_size rounded up to 8 bytes, one per
 * possible CPU, each copied into that CPU's slot.
 */
static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	u32 stride;
	int cpu, pos = 0;

	if (!onallcpus) {
		/* copy true value_size bytes */
		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
		return;
	}

	stride = round_up(htab->map.value_size, 8);
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + pos, stride);
		pos += stride;
	}
}
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash, void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus, bool percpu, bool onallcpus,
...@@ -479,18 +633,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, ...@@ -479,18 +633,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
} }
} }
if (!onallcpus) { pcpu_copy_value(htab, pptr, value, onallcpus);
/* copy true value_size bytes */
memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
} else {
int off = 0, cpu;
for_each_possible_cpu(cpu) {
bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
value + off, size);
off += size;
}
}
if (!prealloc) if (!prealloc)
htab_elem_set_ptr(l_new, key_size, pptr); htab_elem_set_ptr(l_new, key_size, pptr);
} else { } else {
...@@ -571,6 +715,70 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, ...@@ -571,6 +715,70 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
return ret; return ret;
} }
/* Insert or replace the element for @key in an LRU hash map.
 *
 * A node is always obtained from prealloc_lru_pop() first and @value is
 * copied into it.  Under the bucket lock the new node is then linked at the
 * head of the bucket and an existing element with the same key, if any, is
 * unlinked.
 *
 * Returns -EINVAL when @map_flags is greater than BPF_EXIST, -ENOMEM when
 * prealloc_lru_pop() returns NULL, the non-zero result of check_flags()
 * when that check fails, and 0 otherwise.
 */
static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new, *l_old = NULL;
struct hlist_head *head;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
int ret;
if (unlikely(map_flags > BPF_EXIST))
/* unknown flags */
return -EINVAL;
WARN_ON_ONCE(!rcu_read_lock_held());
key_size = map->key_size;
hash = htab_map_hash(key, key_size);
b = __select_bucket(htab, hash);
head = &b->head;
/* For LRU, we need to alloc before taking bucket's
* spinlock because getting free nodes from LRU may need
* to remove older elements from htab and this removal
* operation will need a bucket lock.
*/
l_new = prealloc_lru_pop(htab, key, hash);
if (!l_new)
return -ENOMEM;
/* the value is stored after the key, at the key size rounded up to 8 */
memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);
/* bpf_map_update_elem() can be called in_irq() */
raw_spin_lock_irqsave(&b->lock, flags);
l_old = lookup_elem_raw(head, hash, key, key_size);
ret = check_flags(htab, l_old, map_flags);
if (ret)
goto err;
/* add new element to the head of the list, so that
* concurrent search will find it before old elem
*/
hlist_add_head_rcu(&l_new->hash_node, head);
if (l_old) {
/* replacing: set the ref on the new node, then unlink the old one */
bpf_lru_node_set_ref(&l_new->lru_node);
hlist_del_rcu(&l_old->hash_node);
}
ret = 0;
err:
raw_spin_unlock_irqrestore(&b->lock, flags);
/* After the bucket lock is dropped: on failure the unused new node is
* pushed back to the LRU, on a successful replace the old node is.
*/
if (ret)
bpf_lru_push_free(&htab->lru, &l_new->lru_node);
else if (l_old)
bpf_lru_push_free(&htab->lru, &l_old->lru_node);
return ret;
}
static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags, void *value, u64 map_flags,
bool onallcpus) bool onallcpus)
...@@ -606,22 +814,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, ...@@ -606,22 +814,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
goto err; goto err;
if (l_old) { if (l_old) {
void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
u32 size = htab->map.value_size;
/* per-cpu hash map can update value in-place */ /* per-cpu hash map can update value in-place */
if (!onallcpus) { pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
memcpy(this_cpu_ptr(pptr), value, size); value, onallcpus);
} else {
int off = 0, cpu;
size = round_up(size, 8);
for_each_possible_cpu(cpu) {
bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
value + off, size);
off += size;
}
}
} else { } else {
l_new = alloc_htab_elem(htab, key, value, key_size, l_new = alloc_htab_elem(htab, key, value, key_size,
hash, true, onallcpus, false); hash, true, onallcpus, false);
...@@ -637,12 +832,84 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, ...@@ -637,12 +832,84 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
return ret; return ret;
} }
/* Update or insert the per-cpu value for @key in an LRU per-cpu hash map.
 *
 * An existing element is updated in place through pcpu_copy_value(); a
 * missing one is filled from a node taken from prealloc_lru_pop() and linked
 * at the head of its bucket.  @onallcpus is passed through to
 * pcpu_copy_value() unchanged.
 *
 * Returns -EINVAL when @map_flags is greater than BPF_EXIST, -ENOMEM when
 * prealloc_lru_pop() returns NULL, the non-zero result of check_flags()
 * when that check fails, and 0 otherwise.
 */
static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags,
bool onallcpus)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new = NULL, *l_old;
struct hlist_head *head;
unsigned long flags;
struct bucket *b;
u32 key_size, hash;
int ret;
if (unlikely(map_flags > BPF_EXIST))
/* unknown flags */
return -EINVAL;
WARN_ON_ONCE(!rcu_read_lock_held());
key_size = map->key_size;
hash = htab_map_hash(key, key_size);
b = __select_bucket(htab, hash);
head = &b->head;
/* For LRU, we need to alloc before taking bucket's
* spinlock because LRU's elem alloc may need
* to remove older elem from htab and this removal
* operation will need a bucket lock.
*/
/* No node is popped for BPF_EXIST, so l_new stays NULL in that case.
* NOTE(review): the insert branch below dereferences l_new; this relies on
* check_flags() failing for BPF_EXIST when l_old is NULL - confirm.
*/
if (map_flags != BPF_EXIST) {
l_new = prealloc_lru_pop(htab, key, hash);
if (!l_new)
return -ENOMEM;
}
/* bpf_map_update_elem() can be called in_irq() */
raw_spin_lock_irqsave(&b->lock, flags);
l_old = lookup_elem_raw(head, hash, key, key_size);
ret = check_flags(htab, l_old, map_flags);
if (ret)
goto err;
if (l_old) {
bpf_lru_node_set_ref(&l_old->lru_node);
/* per-cpu hash map can update value in-place */
pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
value, onallcpus);
} else {
pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
value, onallcpus);
hlist_add_head_rcu(&l_new->hash_node, head);
/* the node is now in the table; clear l_new so it is not pushed back below */
l_new = NULL;
}
ret = 0;
err:
raw_spin_unlock_irqrestore(&b->lock, flags);
/* a popped node that was not linked (error, or in-place update) is returned to the LRU */
if (l_new)
bpf_lru_push_free(&htab->lru, &l_new->lru_node);
return ret;
}
static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 map_flags) void *value, u64 map_flags)
{ {
return __htab_percpu_map_update_elem(map, key, value, map_flags, false); return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
} }
/* .map_update_elem callback of htab_lru_percpu_ops: forwards to
 * __htab_lru_percpu_map_update_elem() with onallcpus set to false.
 */
static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					   void *value, u64 map_flags)
{
	const bool onallcpus = false;

	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
						 onallcpus);
}
/* Called from syscall or from eBPF program */ /* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key) static int htab_map_delete_elem(struct bpf_map *map, void *key)
{ {
...@@ -676,6 +943,39 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) ...@@ -676,6 +943,39 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
return ret; return ret;
} }
/* Remove the element for @key from an LRU hash map.
 *
 * Returns 0 when an element was found and unlinked from its bucket,
 * -ENOENT otherwise.  The unlinked node is handed to bpf_lru_push_free()
 * only after the bucket lock has been released.
 */
static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *elem;
	struct bucket *bkt;
	unsigned long irq_flags;
	u32 key_size, hash;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;
	hash = htab_map_hash(key, key_size);
	bkt = __select_bucket(htab, hash);

	raw_spin_lock_irqsave(&bkt->lock, irq_flags);
	elem = lookup_elem_raw(&bkt->head, hash, key, key_size);
	if (elem)
		hlist_del_rcu(&elem->hash_node);
	raw_spin_unlock_irqrestore(&bkt->lock, irq_flags);

	if (!elem)
		return -ENOENT;

	bpf_lru_push_free(&htab->lru, &elem->lru_node);
	return 0;
}
static void delete_all_elements(struct bpf_htab *htab) static void delete_all_elements(struct bpf_htab *htab)
{ {
int i; int i;
...@@ -708,12 +1008,11 @@ static void htab_map_free(struct bpf_map *map) ...@@ -708,12 +1008,11 @@ static void htab_map_free(struct bpf_map *map)
* not have executed. Wait for them. * not have executed. Wait for them.
*/ */
rcu_barrier(); rcu_barrier();
if (htab->map.map_flags & BPF_F_NO_PREALLOC) { if (htab->map.map_flags & BPF_F_NO_PREALLOC)
delete_all_elements(htab); delete_all_elements(htab);
} else { else
htab_free_elems(htab); prealloc_destroy(htab);
pcpu_freelist_destroy(&htab->freelist);
}
free_percpu(htab->extra_elems); free_percpu(htab->extra_elems);
kvfree(htab->buckets); kvfree(htab->buckets);
kfree(htab); kfree(htab);
...@@ -733,6 +1032,20 @@ static struct bpf_map_type_list htab_type __read_mostly = { ...@@ -733,6 +1032,20 @@ static struct bpf_map_type_list htab_type __read_mostly = {
.type = BPF_MAP_TYPE_HASH, .type = BPF_MAP_TYPE_HASH,
}; };
/* Operations for BPF_MAP_TYPE_LRU_HASH.  Allocation, freeing and key
 * iteration use the shared htab_map_* callbacks; lookup, update and delete
 * use the htab_lru_map_* variants.
 */
static const struct bpf_map_ops htab_lru_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_lru_map_lookup_elem,
.map_update_elem = htab_lru_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
};
/* Binds htab_lru_ops to the BPF_MAP_TYPE_LRU_HASH map type. */
static struct bpf_map_type_list htab_lru_type __read_mostly = {
.ops = &htab_lru_ops,
.type = BPF_MAP_TYPE_LRU_HASH,
};
/* Called from eBPF program */ /* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{ {
...@@ -744,8 +1057,21 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) ...@@ -744,8 +1057,21 @@ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
return NULL; return NULL;
} }
/* Look up @key and return the current CPU's value slot, or NULL when the
 * key is not present.  A hit also sets the ref on the element's LRU node.
 */
static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *elem = __htab_map_lookup_elem(map, key);

	if (!elem)
		return NULL;

	bpf_lru_node_set_ref(&elem->lru_node);
	return this_cpu_ptr(htab_elem_get_ptr(elem, map->key_size));
}
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{ {
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l; struct htab_elem *l;
void __percpu *pptr; void __percpu *pptr;
int ret = -ENOENT; int ret = -ENOENT;
...@@ -761,6 +1087,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) ...@@ -761,6 +1087,8 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
l = __htab_map_lookup_elem(map, key); l = __htab_map_lookup_elem(map, key);
if (!l) if (!l)
goto out; goto out;
if (htab_is_lru(htab))
bpf_lru_node_set_ref(&l->lru_node);
pptr = htab_elem_get_ptr(l, map->key_size); pptr = htab_elem_get_ptr(l, map->key_size);
for_each_possible_cpu(cpu) { for_each_possible_cpu(cpu) {
bpf_long_memcpy(value + off, bpf_long_memcpy(value + off,
...@@ -776,10 +1104,16 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) ...@@ -776,10 +1104,16 @@ int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
u64 map_flags) u64 map_flags)
{ {
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
int ret; int ret;
rcu_read_lock(); rcu_read_lock();
ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); if (htab_is_lru(htab))
ret = __htab_lru_percpu_map_update_elem(map, key, value,
map_flags, true);
else
ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
true);
rcu_read_unlock(); rcu_read_unlock();
return ret; return ret;
...@@ -799,10 +1133,26 @@ static struct bpf_map_type_list htab_percpu_type __read_mostly = { ...@@ -799,10 +1133,26 @@ static struct bpf_map_type_list htab_percpu_type __read_mostly = {
.type = BPF_MAP_TYPE_PERCPU_HASH, .type = BPF_MAP_TYPE_PERCPU_HASH,
}; };
/* Operations for BPF_MAP_TYPE_LRU_PERCPU_HASH.  Lookup and update use the
 * htab_lru_percpu_* callbacks; delete is shared with the non-percpu LRU map
 * (htab_lru_map_delete_elem); alloc, free and key iteration use the shared
 * htab_map_* callbacks.
 */
static const struct bpf_map_ops htab_lru_percpu_ops = {
.map_alloc = htab_map_alloc,
.map_free = htab_map_free,
.map_get_next_key = htab_map_get_next_key,
.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
.map_update_elem = htab_lru_percpu_map_update_elem,
.map_delete_elem = htab_lru_map_delete_elem,
};
/* Binds htab_lru_percpu_ops to the BPF_MAP_TYPE_LRU_PERCPU_HASH map type. */
static struct bpf_map_type_list htab_lru_percpu_type __read_mostly = {
.ops = &htab_lru_percpu_ops,
.type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
};
static int __init register_htab_map(void) static int __init register_htab_map(void)
{ {
bpf_register_map_type(&htab_type); bpf_register_map_type(&htab_type);
bpf_register_map_type(&htab_percpu_type); bpf_register_map_type(&htab_percpu_type);
bpf_register_map_type(&htab_lru_type);
bpf_register_map_type(&htab_lru_percpu_type);
return 0; return 0;
} }
late_initcall(register_htab_map); late_initcall(register_htab_map);
...@@ -292,6 +292,7 @@ static int map_lookup_elem(union bpf_attr *attr) ...@@ -292,6 +292,7 @@ static int map_lookup_elem(union bpf_attr *attr)
goto free_key; goto free_key;
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
value_size = round_up(map->value_size, 8) * num_possible_cpus(); value_size = round_up(map->value_size, 8) * num_possible_cpus();
else else
...@@ -302,7 +303,8 @@ static int map_lookup_elem(union bpf_attr *attr) ...@@ -302,7 +303,8 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!value) if (!value)
goto free_key; goto free_key;
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_copy(map, key, value); err = bpf_percpu_hash_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value); err = bpf_percpu_array_copy(map, key, value);
...@@ -366,6 +368,7 @@ static int map_update_elem(union bpf_attr *attr) ...@@ -366,6 +368,7 @@ static int map_update_elem(union bpf_attr *attr)
goto free_key; goto free_key;
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
value_size = round_up(map->value_size, 8) * num_possible_cpus(); value_size = round_up(map->value_size, 8) * num_possible_cpus();
else else
...@@ -385,7 +388,8 @@ static int map_update_elem(union bpf_attr *attr) ...@@ -385,7 +388,8 @@ static int map_update_elem(union bpf_attr *attr)
*/ */
preempt_disable(); preempt_disable();
__this_cpu_inc(bpf_prog_active); __this_cpu_inc(bpf_prog_active);
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
err = bpf_percpu_hash_update(map, key, value, attr->flags); err = bpf_percpu_hash_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_update(map, key, value, attr->flags); err = bpf_percpu_array_update(map, key, value, attr->flags);
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
obj- := dummy.o obj- := dummy.o
# List of programs to build # List of programs to build
hostprogs-y := test_lru_dist
hostprogs-y += sock_example hostprogs-y += sock_example
hostprogs-y += fds_example hostprogs-y += fds_example
hostprogs-y += sockex1 hostprogs-y += sockex1
...@@ -28,6 +29,7 @@ hostprogs-y += trace_event ...@@ -28,6 +29,7 @@ hostprogs-y += trace_event
hostprogs-y += sampleip hostprogs-y += sampleip
hostprogs-y += tc_l2_redirect hostprogs-y += tc_l2_redirect
test_lru_dist-objs := test_lru_dist.o libbpf.o
sock_example-objs := sock_example.o libbpf.o sock_example-objs := sock_example.o libbpf.o
fds_example-objs := bpf_load.o libbpf.o fds_example.o fds_example-objs := bpf_load.o libbpf.o fds_example.o
sockex1-objs := bpf_load.o libbpf.o sockex1_user.o sockex1-objs := bpf_load.o libbpf.o sockex1_user.o
......
...@@ -19,6 +19,21 @@ struct bpf_map_def SEC("maps") hash_map = { ...@@ -19,6 +19,21 @@ struct bpf_map_def SEC("maps") hash_map = {
.max_entries = MAX_ENTRIES, .max_entries = MAX_ENTRIES,
}; };
/* LRU hash map with no map flags set: u32 key, long value, 10000 entries. */
struct bpf_map_def SEC("maps") lru_hash_map = {
.type = BPF_MAP_TYPE_LRU_HASH,
.key_size = sizeof(u32),
.value_size = sizeof(long),
.max_entries = 10000,
};
/* Same shape as lru_hash_map but created with BPF_F_NO_COMMON_LRU.  The map
 * type is still BPF_MAP_TYPE_LRU_HASH, so "percpu" in the name refers to the
 * flag, not to a per-cpu value.
 */
struct bpf_map_def SEC("maps") percpu_lru_hash_map = {
.type = BPF_MAP_TYPE_LRU_HASH,
.key_size = sizeof(u32),
.value_size = sizeof(long),
.max_entries = 10000,
.map_flags = BPF_F_NO_COMMON_LRU,
};
struct bpf_map_def SEC("maps") percpu_hash_map = { struct bpf_map_def SEC("maps") percpu_hash_map = {
.type = BPF_MAP_TYPE_PERCPU_HASH, .type = BPF_MAP_TYPE_PERCPU_HASH,
.key_size = sizeof(u32), .key_size = sizeof(u32),
...@@ -53,6 +68,7 @@ int stress_hmap(struct pt_regs *ctx) ...@@ -53,6 +68,7 @@ int stress_hmap(struct pt_regs *ctx)
value = bpf_map_lookup_elem(&hash_map, &key); value = bpf_map_lookup_elem(&hash_map, &key);
if (value) if (value)
bpf_map_delete_elem(&hash_map, &key); bpf_map_delete_elem(&hash_map, &key);
return 0; return 0;
} }
...@@ -96,5 +112,28 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx) ...@@ -96,5 +112,28 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx)
bpf_map_delete_elem(&percpu_hash_map_alloc, &key); bpf_map_delete_elem(&percpu_hash_map_alloc, &key);
return 0; return 0;
} }
/* kprobe on sys_getpid: insert one pseudo-random key into lru_hash_map. */
SEC("kprobe/sys_getpid")
int stress_lru_hmap_alloc(struct pt_regs *ctx)
{
	u32 rand_key;
	long one;

	rand_key = bpf_get_prandom_u32();
	one = 1;

	bpf_map_update_elem(&lru_hash_map, &rand_key, &one, BPF_ANY);

	return 0;
}
/* kprobe on sys_getppid: insert one pseudo-random key into
 * percpu_lru_hash_map.
 */
SEC("kprobe/sys_getppid")
int stress_percpu_lru_hmap_alloc(struct pt_regs *ctx)
{
	u32 rand_key;
	long one;

	rand_key = bpf_get_prandom_u32();
	one = 1;

	bpf_map_update_elem(&percpu_lru_hash_map, &rand_key, &one, BPF_ANY);

	return 0;
}
char _license[] SEC("license") = "GPL"; char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE; u32 _version SEC("version") = LINUX_VERSION_CODE;
...@@ -35,6 +35,8 @@ static __u64 time_get_ns(void) ...@@ -35,6 +35,8 @@ static __u64 time_get_ns(void)
#define PERCPU_HASH_PREALLOC (1 << 1) #define PERCPU_HASH_PREALLOC (1 << 1)
#define HASH_KMALLOC (1 << 2) #define HASH_KMALLOC (1 << 2)
#define PERCPU_HASH_KMALLOC (1 << 3) #define PERCPU_HASH_KMALLOC (1 << 3)
#define LRU_HASH_PREALLOC (1 << 4)
#define PERCPU_LRU_HASH_PREALLOC (1 << 5)
static int test_flags = ~0; static int test_flags = ~0;
...@@ -50,6 +52,30 @@ static void test_hash_prealloc(int cpu) ...@@ -50,6 +52,30 @@ static void test_hash_prealloc(int cpu)
cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
} }
/* Issue MAX_CNT getpid() syscalls and print the achieved rate for @cpu. */
static void test_lru_hash_prealloc(int cpu)
{
	__u64 begin_ns = time_get_ns();
	int n = 0;

	while (n < MAX_CNT) {
		syscall(__NR_getpid);
		n++;
	}

	printf("%d:lru_hash_map_perf pre-alloc %lld events per sec\n",
	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - begin_ns));
}
/* Issue MAX_CNT getppid() syscalls and print the achieved rate for @cpu.
 *
 * The result line is labelled "percpu_lru_hash_map_perf" so that it can be
 * told apart from the line printed by test_lru_hash_prealloc(); the two
 * previously printed the same label.
 */
static void test_percpu_lru_hash_prealloc(int cpu)
{
	__u64 start_time;
	int i;

	start_time = time_get_ns();
	for (i = 0; i < MAX_CNT; i++)
		syscall(__NR_getppid);

	printf("%d:percpu_lru_hash_map_perf pre-alloc %lld events per sec\n",
	       cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time));
}
static void test_percpu_hash_prealloc(int cpu) static void test_percpu_hash_prealloc(int cpu)
{ {
__u64 start_time; __u64 start_time;
...@@ -105,6 +131,12 @@ static void loop(int cpu) ...@@ -105,6 +131,12 @@ static void loop(int cpu)
if (test_flags & PERCPU_HASH_KMALLOC) if (test_flags & PERCPU_HASH_KMALLOC)
test_percpu_hash_kmalloc(cpu); test_percpu_hash_kmalloc(cpu);
if (test_flags & LRU_HASH_PREALLOC)
test_lru_hash_prealloc(cpu);
if (test_flags & PERCPU_LRU_HASH_PREALLOC)
test_percpu_lru_hash_prealloc(cpu);
} }
static void run_perf_test(int tasks) static void run_perf_test(int tasks)
......
/*
* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#define _GNU_SOURCE
#include <linux/types.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/bpf.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <sched.h>
#include <sys/wait.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdlib.h>
#include <time.h>
#include "libbpf.h"
/* Smaller of two values.  Each argument appears twice in the expansion, so
 * arguments with side effects must not be passed.
 */
#define min(a, b) ((a) < (b) ? (a) : (b))
/* Byte offset of MEMBER inside TYPE, computed from a TYPE pointer at address 0. */
#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER)
/* Given a pointer to the `member` field of a `type`, yield a pointer to the
 * enclosing `type` object.
 */
#define container_of(ptr, type, member) ({ \
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
(type *)( (char *)__mptr - offsetof(type,member) );})
static int nr_cpus;
static unsigned long long *dist_keys;
static unsigned int dist_key_counts;
/* Circular doubly-linked list node.  A head whose next pointer refers to
 * itself represents an empty list.
 */
struct list_head {
	struct list_head *next;
	struct list_head *prev;
};

/* Make @list a self-linked (empty) list. */
static inline void INIT_LIST_HEAD(struct list_head *list)
{
	list->prev = list->next = list;
}

/* Non-zero when @head has no entries linked to it. */
static inline int list_empty(const struct list_head *head)
{
	return head == head->next;
}

/* Link @new between the two known-adjacent nodes @prev and @next. */
static inline void __list_add(struct list_head *new,
			      struct list_head *prev,
			      struct list_head *next)
{
	next->prev = new;
	new->next = next;
	new->prev = prev;
	prev->next = new;
}

/* Insert @new right after @head. */
static inline void list_add(struct list_head *new, struct list_head *head)
{
	struct list_head *first = head->next;

	__list_add(new, head, first);
}

/* Make @prev and @next point at each other, dropping whatever was between. */
static inline void __list_del(struct list_head *prev, struct list_head *next)
{
	prev->next = next;
	next->prev = prev;
}

/* Unlink @entry from its list; @entry's own pointers are left untouched. */
static inline void __list_del_entry(struct list_head *entry)
{
	struct list_head *before = entry->prev;
	struct list_head *after = entry->next;

	__list_del(before, after);
}

/* Unlink @list from wherever it is and re-insert it right after @head. */
static inline void list_move(struct list_head *list, struct list_head *head)
{
	__list_del(list->prev, list->next);
	__list_add(list, head, head->next);
}

/* Map a list_head pointer back to the structure embedding it. */
#define list_entry(ptr, type, member) \
	container_of(ptr, type, member)

/* Structure embedding the node just before @ptr, i.e. the tail entry. */
#define list_last_entry(ptr, type, member) \
	list_entry((ptr)->prev, type, member)
/* One slot of the reference LRU: list linkage plus the key it currently holds. */
struct pfect_lru_node {
struct list_head list;
unsigned long long key;
};
/* Reference LRU that the BPF LRU map results are printed next to. */
struct pfect_lru {
/* nodes are moved to the front on hit/insert; the victim is taken from the back */
struct list_head list;
/* array of lru_size nodes allocated in pfect_lru_init() */
struct pfect_lru_node *free_nodes;
/* number of entries of free_nodes handed out so far */
unsigned int cur_size;
/* capacity: once cur_size reaches it, inserts reuse the last list entry */
unsigned int lru_size;
/* inserts of keys that were not present in the map at all */
unsigned int nr_unique;
/* calls that did not find a live node for the key */
unsigned int nr_misses;
/* calls to pfect_lru_lookup_or_insert() */
unsigned int total;
/* BPF hash map from key to node pointer (NULL pointer once evicted) */
int map_fd;
};
/* Set up a reference LRU with room for @lru_size nodes, backed by a BPF hash
 * map sized for @nr_possible_elems keys.  Asserts on map creation or
 * allocation failure.
 */
static void pfect_lru_init(struct pfect_lru *lru, unsigned int lru_size,
			   unsigned int nr_possible_elems)
{
	const unsigned int key_sz = sizeof(unsigned long long);
	const unsigned int val_sz = sizeof(struct pfect_lru_node *);

	lru->map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, key_sz, val_sz,
				     nr_possible_elems, 0);
	assert(lru->map_fd != -1);

	lru->free_nodes = malloc(lru_size * sizeof(struct pfect_lru_node));
	assert(lru->free_nodes);

	INIT_LIST_HEAD(&lru->list);
	lru->lru_size = lru_size;
	lru->cur_size = 0;
	lru->total = 0;
	lru->nr_misses = 0;
	lru->nr_unique = 0;
}
/* Release the node array and the backing map of a reference LRU. */
static void pfect_lru_destroy(struct pfect_lru *lru)
{
	int fd = lru->map_fd;

	close(fd);
	free(lru->free_nodes);
}
/* Touch @key in the reference LRU.
 *
 * Returns 1 when the key currently owns a node (the node is moved to the
 * front of the list).  Otherwise a node is assigned to the key - a fresh one
 * while capacity remains, else the last list entry is reused - and the return
 * value is 1 if the key was already in the map with a NULL node pointer, 0 if
 * the key was not in the map at all.
 */
static int pfect_lru_lookup_or_insert(struct pfect_lru *lru,
unsigned long long key)
{
struct pfect_lru_node *node = NULL;
int seen = 0;
lru->total++;
if (!bpf_lookup_elem(lru->map_fd, &key, &node)) {
if (node) {
/* hit: refresh recency and stop */
list_move(&node->list, &lru->list);
return 1;
}
/* key is in the map but its stored node pointer is NULL */
seen = 1;
}
if (lru->cur_size < lru->lru_size) {
node = &lru->free_nodes[lru->cur_size++];
INIT_LIST_HEAD(&node->list);
} else {
struct pfect_lru_node *null_node = NULL;
/* full: reuse the entry at the back of the list ... */
node = list_last_entry(&lru->list,
struct pfect_lru_node,
list);
/* ... and overwrite its old key's map value with a NULL pointer */
bpf_update_elem(lru->map_fd, &node->key, &null_node, BPF_EXIST);
}
node->key = key;
list_move(&node->list, &lru->list);
lru->nr_misses++;
/* NOTE(review): the map updates below live inside assert() and would be
* compiled out under NDEBUG.
*/
if (seen) {
assert(!bpf_update_elem(lru->map_fd, &key, &node, BPF_EXIST));
} else {
lru->nr_unique++;
assert(!bpf_update_elem(lru->map_fd, &key, &node, BPF_NOEXIST));
}
return seen;
}
/* Load a key distribution file: one decimal key per line.
 *
 * @dist_file: path of the file to read.
 * @keys:      on return, points at a malloc()ed array of the parsed keys;
 *             the caller frees it.
 *
 * Returns the number of keys stored in *@keys.  Empty lines are skipped
 * (strtok() collapses consecutive newlines).  Any I/O or allocation failure
 * trips an assert.
 */
static unsigned int read_keys(const char *dist_file,
			      unsigned long long **keys)
{
	struct stat fst;
	unsigned long long *retkeys;
	unsigned int counts = 0;
	ssize_t nread;
	int dist_fd;
	char *b, *l;
	off_t i;
	int err;

	dist_fd = open(dist_file, O_RDONLY);
	assert(dist_fd != -1);

	/* keep calls with side effects out of assert() */
	err = fstat(dist_fd, &fst);
	assert(err == 0);

	/* one extra byte for the terminator strtok() relies on */
	b = malloc(fst.st_size + 1);
	assert(b);

	nread = read(dist_fd, b, fst.st_size);
	assert(nread == fst.st_size);
	close(dist_fd);
	b[fst.st_size] = '\0';

	for (i = 0; i < fst.st_size; i++) {
		if (b[i] == '\n')
			counts++;
	}
	counts++; /* in case the last line has no \n */

	retkeys = malloc(counts * sizeof(unsigned long long));
	assert(retkeys);

	counts = 0;
	for (l = strtok(b, "\n"); l; l = strtok(NULL, "\n"))
		retkeys[counts++] = strtoull(l, NULL, 10);
	free(b);

	*keys = retkeys;

	return counts;
}
/* Create a map of @map_type with 8-byte keys and 8-byte values holding
 * @size entries.  Returns the map fd, or -1 after reporting the error with
 * perror().
 */
static int create_map(int map_type, int map_flags, unsigned int size)
{
	const int fd = bpf_create_map(map_type, sizeof(unsigned long long),
				      sizeof(unsigned long long), size,
				      map_flags);

	if (fd != -1)
		return fd;

	perror("bpf_create_map");
	return fd;
}
/* Pin @pid to the first CPU at or after @next_to_try for which
 * sched_setaffinity() succeeds.
 *
 * Returns -1 when @next_to_try equals nr_cpus; otherwise the index following
 * the CPU that was accepted, or the index at which the search stopped when
 * none was.
 */
static int sched_next_online(int pid, int next_to_try)
{
	cpu_set_t mask;
	int cpu;

	if (next_to_try == nr_cpus)
		return -1;

	for (cpu = next_to_try; cpu < nr_cpus; cpu++) {
		CPU_ZERO(&mask);
		CPU_SET(cpu, &mask);
		if (sched_setaffinity(pid, sizeof(mask), &mask) == 0)
			return cpu + 1;
	}

	return cpu;
}
/* Fork @tasks children, run @fn(index, @data) in each, and wait for all.
 *
 * Each child tries to pin itself to the next CPU before calling @fn and then
 * exits with status 0.  The parent asserts that every child was reaped and
 * exited with a zero status.  Nothing is done when @tasks is 0.
 */
static void run_parallel(unsigned int tasks, void (*fn)(int i, void *data),
			 void *data)
{
	int next_sched_cpu = 0;
	unsigned int i;
	pid_t *pid;

	/* a zero-length VLA is undefined and a huge one overflows the stack,
	 * so bail out on 0 and keep the pid table on the heap
	 */
	if (!tasks)
		return;

	pid = calloc(tasks, sizeof(*pid));
	assert(pid);

	for (i = 0; i < tasks; i++) {
		pid[i] = fork();
		if (pid[i] == 0) {
			next_sched_cpu = sched_next_online(0, next_sched_cpu);
			fn((int)i, data);
			exit(0);
		} else if (pid[i] == -1) {
			printf("couldn't spawn #%d process\n", (int)i);
			exit(1);
		}
		/* It is mostly redundant and just allows the parent
		 * process to update next_sched_cpu for the next child
		 * process
		 */
		next_sched_cpu = sched_next_online(pid[i], next_sched_cpu);
	}

	for (i = 0; i < tasks; i++) {
		pid_t reaped;
		int status;

		/* keep waitpid() out of assert() so children are reaped
		 * even when asserts are compiled out
		 */
		reaped = waitpid(pid[i], &status, 0);
		assert(reaped == pid[i]);
		assert(status == 0);
	}

	free(pid);
}
/* Child body for test_parallel_lru_dist().
 *
 * @task: index of this child; it selects a key range of its own by adding
 *        task * dist_key_counts to every key.
 * @data: two unsigned ints: the LRU map fd, then the LRU size used for the
 *        reference LRU.
 *
 * Replays dist_keys against both the reference LRU and the BPF LRU map,
 * counting a miss for each key the BPF map does not contain, and prints both
 * sets of counters.
 */
static void do_test_lru_dist(int task, void *data)
{
	const unsigned int *args = data;
	unsigned int lru_map_fd = args[0];
	unsigned int lru_size = args[1];
	unsigned int nr_misses = 0;
	struct pfect_lru pfect_lru;
	unsigned long long key, value = 1234;
	/* widen before multiplying: a 32-bit product can wrap and make the
	 * key ranges of different tasks overlap
	 */
	unsigned long long key_offset =
		(unsigned long long)task * dist_key_counts;
	unsigned int i;

	pfect_lru_init(&pfect_lru, lru_size, dist_key_counts);

	for (i = 0; i < dist_key_counts; i++) {
		key = dist_keys[i] + key_offset;

		pfect_lru_lookup_or_insert(&pfect_lru, key);

		/* present in the BPF LRU map: not a miss */
		if (!bpf_lookup_elem(lru_map_fd, &key, &value))
			continue;

		if (bpf_update_elem(lru_map_fd, &key, &value, BPF_NOEXIST)) {
			printf("bpf_update_elem(lru_map_fd, %llu): errno:%d\n",
			       key, errno);
			assert(0);
		}

		nr_misses++;
	}

	printf(" task:%d BPF LRU: nr_unique:%u(/%u) nr_misses:%u(/%u)\n",
	       task, pfect_lru.nr_unique, dist_key_counts, nr_misses,
	       dist_key_counts);
	printf(" task:%d Perfect LRU: nr_unique:%u(/%u) nr_misses:%u(/%u)\n",
	       task, pfect_lru.nr_unique, pfect_lru.total,
	       pfect_lru.nr_misses, pfect_lru.total);

	pfect_lru_destroy(&pfect_lru);
	close(lru_map_fd);
}
/* Create one LRU map and replay the key distribution from @nr_tasks
 * children in parallel.  The map holds @lru_size entries per CPU when
 * BPF_F_NO_COMMON_LRU is set, per task otherwise.
 */
static void test_parallel_lru_dist(int map_type, int map_flags,
				   int nr_tasks, unsigned int lru_size)
{
	unsigned int nr_entries;
	int child_data[2];
	int lru_map_fd;

	printf("%s (map_type:%d map_flags:0x%X):\n", __func__, map_type,
	       map_flags);

	nr_entries = (map_flags & BPF_F_NO_COMMON_LRU) ? nr_cpus : nr_tasks;
	nr_entries *= lru_size;

	lru_map_fd = create_map(map_type, map_flags, nr_entries);
	assert(lru_map_fd != -1);

	child_data[1] = lru_size;
	child_data[0] = lru_map_fd;

	run_parallel(nr_tasks, do_test_lru_dist, child_data);

	close(lru_map_fd);
}
/* Insert keys 1..1000 into a 900-entry LRU map (900 per CPU with
 * BPF_F_NO_COMMON_LRU) while repeatedly looking up keys 101..min(key, 900),
 * then report how many keys of each group are no longer in the map:
 * 1..100 (never looked up), 101..900 (looked up), 901..1000 (inserted last).
 */
static void test_lru_loss0(int map_type, int map_flags)
{
	unsigned long long key, value[nr_cpus];
	unsigned int old_unused_losses = 0;
	unsigned int new_unused_losses = 0;
	unsigned int used_losses = 0;
	int map_fd;

	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
	       map_flags);

	assert(sched_next_online(0, 0) != -1);

	if (map_flags & BPF_F_NO_COMMON_LRU)
		map_fd = create_map(map_type, map_flags, 900 * nr_cpus);
	else
		map_fd = create_map(map_type, map_flags, 900);

	assert(map_fd != -1);

	value[0] = 1234;

	for (key = 1; key <= 1000; key++) {
		/* Same width as the map key (create_map() uses
		 * sizeof(unsigned long long)): the kernel reads key_size
		 * bytes from the pointer, so a 4-byte int here would make it
		 * read past the variable and look up the wrong key.
		 */
		unsigned long long start_key, end_key;

		assert(bpf_update_elem(map_fd, &key, value, BPF_NOEXIST) == 0);

		start_key = 101;
		end_key = min(key, 900);

		while (start_key <= end_key) {
			bpf_lookup_elem(map_fd, &start_key, value);
			start_key++;
		}
	}

	for (key = 1; key <= 1000; key++) {
		if (bpf_lookup_elem(map_fd, &key, value)) {
			if (key <= 100)
				old_unused_losses++;
			else if (key <= 900)
				used_losses++;
			else
				new_unused_losses++;
		}
	}

	close(map_fd);

	/* the counters are unsigned, so print them with %u */
	printf("older-elem-losses:%u(/100) active-elem-losses:%u(/800) "
	       "newer-elem-losses:%u(/100)\n",
	       old_unused_losses, used_losses, new_unused_losses);
}
/* Insert keys 1..1000 into a 1000-entry LRU map (1000 per CPU with
 * BPF_F_NO_COMMON_LRU) and report how many of them cannot be looked up
 * afterwards.
 */
static void test_lru_loss1(int map_type, int map_flags)
{
	unsigned long long key, value[nr_cpus];
	unsigned int nr_losses = 0;
	unsigned int nr_entries = 1000;
	int map_fd;

	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
	       map_flags);

	assert(sched_next_online(0, 0) != -1);

	if (map_flags & BPF_F_NO_COMMON_LRU)
		nr_entries *= nr_cpus;

	map_fd = create_map(map_type, map_flags, nr_entries);
	assert(map_fd != -1);

	value[0] = 1234;

	key = 1;
	while (key <= 1000) {
		assert(!bpf_update_elem(map_fd, &key, value, BPF_NOEXIST));
		key++;
	}

	for (key = 1; key <= 1000; key++)
		nr_losses += bpf_lookup_elem(map_fd, &key, value) ? 1 : 0;

	close(map_fd);

	printf("nr_losses:%d(/1000)\n", nr_losses);
}
/* Child body for test_parallel_lru_loss().
 *
 * Inserts 1000 "stable" keys starting at a per-task base, then performs
 * 100000 steps; each step either looks up a random stable key or, when
 * rand() % 10 is zero, inserts one new key.  Finally prints how many stable
 * keys can no longer be looked up.  @data points at the map fd.
 */
static void do_test_parallel_lru_loss(int task, void *data)
{
	const unsigned int nr_stable_elems = 1000;
	const unsigned int nr_repeats = 100000;

	int map_fd = *(int *)data;
	unsigned long long stable_base;
	unsigned long long key, value[nr_cpus];
	unsigned long long next_ins_key;
	unsigned int nr_losses = 0;
	unsigned int step;

	stable_base = task * nr_repeats * 2 + 1;
	value[0] = 1234;

	/* populate the stable working set */
	next_ins_key = stable_base;
	while (next_ins_key < stable_base + nr_stable_elems) {
		assert(bpf_update_elem(map_fd, &next_ins_key, value,
				       BPF_NOEXIST) == 0);
		next_ins_key++;
	}

	/* mixed workload: lookups of stable keys plus occasional inserts */
	for (step = 0; step < nr_repeats; step++) {
		int rn = rand();

		if (rn % 10 == 0) {
			bpf_update_elem(map_fd, &next_ins_key, value,
					BPF_NOEXIST);
			next_ins_key++;
			continue;
		}

		key = rn % nr_stable_elems + stable_base;
		bpf_lookup_elem(map_fd, &key, value);
	}

	/* count the stable keys that are gone */
	for (step = 0, key = stable_base; step < nr_stable_elems;
	     step++, key++) {
		if (bpf_lookup_elem(map_fd, &key, value))
			nr_losses++;
	}

	printf(" task:%d nr_losses:%u\n", task, nr_losses);
}
/* Run do_test_parallel_lru_loss() in @nr_tasks children against one shared
 * LRU map sized at 1200 entries per CPU (BPF_F_NO_COMMON_LRU) or per task.
 */
static void test_parallel_lru_loss(int map_type, int map_flags, int nr_tasks)
{
	/* Give 20% more than the active working set */
	const int per_unit = 1000 + 200;
	int nr_units;
	int map_fd;

	printf("%s (map_type:%d map_flags:0x%X):\n", __func__, map_type,
	       map_flags);

	nr_units = (map_flags & BPF_F_NO_COMMON_LRU) ? nr_cpus : nr_tasks;

	map_fd = create_map(map_type, map_flags, nr_units * per_unit);
	assert(map_fd != -1);

	run_parallel(nr_tasks, do_test_parallel_lru_loss, &map_fd);

	close(map_fd);
}
/* Usage: <prog> <dist-file> <lru-size> <nr-tasks>
 *
 * Runs the loss and distribution tests on BPF_MAP_TYPE_LRU_HASH, once with no
 * map flags and once with BPF_F_NO_COMMON_LRU.  Returns -1 on bad arguments
 * or an empty distribution file, 0 otherwise.
 */
int main(int argc, char **argv)
{
	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
	int map_flags[] = {0, BPF_F_NO_COMMON_LRU};
	const char *dist_file;
	int nr_tasks = 1;
	int lru_size;
	unsigned int f;
	int err;

	if (argc < 4) {
		printf("Usage: %s <dist-file> <lru-size> <nr-tasks>\n",
		       argv[0]);
		return -1;
	}

	dist_file = argv[1];
	lru_size = atoi(argv[2]);
	nr_tasks = atoi(argv[3]);

	/* Both values end up as unsigned map sizes and process counts, so a
	 * zero, negative or non-numeric argument must be rejected here.
	 */
	if (lru_size <= 0 || nr_tasks <= 0) {
		printf("<lru-size> and <nr-tasks> must be positive integers\n");
		return -1;
	}

	setbuf(stdout, NULL);

	err = setrlimit(RLIMIT_MEMLOCK, &r);
	assert(!err);

	srand(time(NULL));

	nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
	assert(nr_cpus != -1);
	printf("nr_cpus:%d\n\n", nr_cpus);

	nr_tasks = min(nr_tasks, nr_cpus);

	dist_key_counts = read_keys(dist_file, &dist_keys);
	if (!dist_key_counts) {
		printf("%s has no key\n", dist_file);
		return -1;
	}

	for (f = 0; f < sizeof(map_flags) / sizeof(*map_flags); f++) {
		test_lru_loss0(BPF_MAP_TYPE_LRU_HASH, map_flags[f]);
		test_lru_loss1(BPF_MAP_TYPE_LRU_HASH, map_flags[f]);
		test_parallel_lru_loss(BPF_MAP_TYPE_LRU_HASH, map_flags[f],
				       nr_tasks);
		test_parallel_lru_dist(BPF_MAP_TYPE_LRU_HASH, map_flags[f],
				       nr_tasks, lru_size);
		printf("\n");
	}

	free(dist_keys);

	return 0;
}
CFLAGS += -Wall -O2 CFLAGS += -Wall -O2 -I../../../../usr/include
test_objs = test_verifier test_maps test_objs = test_verifier test_maps test_lru_map
TEST_PROGS := test_verifier test_maps test_kmod.sh TEST_PROGS := test_verifier test_maps test_lru_map test_kmod.sh
TEST_FILES := $(test_objs) TEST_FILES := $(test_objs)
all: $(test_objs) all: $(test_objs)
......
/*
* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <assert.h>
#include <sched.h>
#include <sys/wait.h>
#include <stdlib.h>
#include <time.h>
#include "bpf_sys.h"
#define LOCAL_FREE_TARGET (128)
#define PERCPU_FREE_TARGET (16)
static int nr_cpus;
/* Create a map of @map_type with 8-byte keys and 8-byte values holding
 * @size entries.  Returns the map fd, or -1 after reporting the error with
 * perror().
 */
static int create_map(int map_type, int map_flags, unsigned int size)
{
	const int fd = bpf_map_create(map_type, sizeof(unsigned long long),
				      sizeof(unsigned long long), size,
				      map_flags);

	if (fd != -1)
		return fd;

	perror("bpf_map_create");
	return fd;
}
/* Check that every key iterated from @map1 is also present in @map0 and that
 * the first 8-byte word of both values matches.  Returns 1 when that holds,
 * 0 (after printing the offending key) otherwise.
 */
static int map_subset(int map0, int map1)
{
	unsigned long long value0[nr_cpus], value1[nr_cpus];
	unsigned long long next_key = 0;

	for (;;) {
		if (bpf_map_next_key(map1, &next_key, &next_key))
			break;

		assert(!bpf_map_lookup(map1, &next_key, value1));

		if (bpf_map_lookup(map0, &next_key, value0)) {
			printf("key:%llu not found from map. %s(%d)\n",
			       next_key, strerror(errno), errno);
			return 0;
		}

		if (value0[0] == value1[0])
			continue;

		printf("key:%llu value0:%llu != value1:%llu\n",
		       next_key, value0[0], value1[0]);
		return 0;
	}

	return 1;
}
/* 1 when @lru_map and @expected are subsets of each other as judged by
 * map_subset(), 0 otherwise.
 */
static int map_equal(int lru_map, int expected)
{
	if (!map_subset(lru_map, expected))
		return 0;

	return map_subset(expected, lru_map) != 0;
}
/* Bind @pid to the first CPU, counting up from @next_to_try, for which
 * sched_setaffinity() succeeds.
 *
 * Returns -1 when @next_to_try already equals nr_cpus.  Otherwise
 * returns the index following the CPU that was selected, or the index
 * at which the scan stopped if no sched_setaffinity() call succeeded.
 */
static int sched_next_online(int pid, int next_to_try)
{
	cpu_set_t mask;
	int cpu;

	if (next_to_try == nr_cpus)
		return -1;

	for (cpu = next_to_try; cpu < nr_cpus; cpu++) {
		CPU_ZERO(&mask);
		CPU_SET(cpu, &mask);
		if (sched_setaffinity(pid, sizeof(mask), &mask) == 0)
			return cpu + 1;
	}

	return cpu;
}
/* Size of the LRU map is 2
* Add key=1 (+1 key)
* Add key=2 (+1 key)
* Lookup Key=1
* Add Key=3
* => Key=2 will be removed by LRU
* Iterate map. Only found key=1 and key=3
*/
/* Basic flag handling and single eviction on a two-entry LRU map.
 *
 * @map_type:  LRU map type under test
 * @map_flags: map creation flags; with BPF_F_NO_COMMON_LRU the map is
 *             created with 2 * nr_cpus entries instead of 2
 *
 * Any failed expectation aborts through assert().
 */
static void test_lru_sanity0(int map_type, int map_flags)
{
	unsigned long long key, value[nr_cpus];
	int lru_fd, plain_fd;

	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
	       map_flags);

	/* Bind to the first CPU that accepts the affinity request */
	assert(sched_next_online(0, 0) != -1);

	lru_fd = create_map(map_type, map_flags,
			    (map_flags & BPF_F_NO_COMMON_LRU) ?
			    2 * nr_cpus : 2);
	assert(lru_fd != -1);

	/* Plain hash map that records what the LRU map should end up with */
	plain_fd = create_map(BPF_MAP_TYPE_HASH, 0, 2);
	assert(plain_fd != -1);

	value[0] = 1234;

	/* key=1 goes into both maps */
	key = 1;
	assert(!bpf_map_update(lru_fd, &key, value, BPF_NOEXIST));
	assert(!bpf_map_update(plain_fd, &key, value, BPF_NOEXIST));

	/* BPF_NOEXIST on a key that is already present must fail */
	assert(bpf_map_update(lru_fd, &key, value, BPF_NOEXIST) == -1 &&
	       errno == EEXIST);

	/* An invalid flag value must be rejected */
	assert(bpf_map_update(lru_fd, &key, value, -1) == -1 &&
	       errno == EINVAL);

	/* key=2 is absent: lookup and BPF_EXIST update both fail ... */
	key = 2;
	assert(bpf_map_lookup(lru_fd, &key, value) == -1 &&
	       errno == ENOENT);
	assert(bpf_map_update(lru_fd, &key, value, BPF_EXIST) == -1 &&
	       errno == ENOENT);
	/* ... and then it is inserted into the LRU map only */
	assert(!bpf_map_update(lru_fd, &key, value, BPF_NOEXIST));

	/* key=3 is not present yet */
	key = 3;
	assert(bpf_map_lookup(lru_fd, &key, value) == -1 &&
	       errno == ENOENT);

	/* Look up key=1 so that its ref bit is set and the LRU keeps it */
	key = 1;
	assert(!bpf_map_lookup(lru_fd, &key, value));
	assert(value[0] == 1234);

	/* Inserting key=3 forces one element out */
	key = 3;
	assert(!bpf_map_update(lru_fd, &key, value, BPF_NOEXIST));
	assert(!bpf_map_update(plain_fd, &key, value, BPF_NOEXIST));

	/* key=2 is the one that has been removed from the LRU */
	key = 2;
	assert(bpf_map_lookup(lru_fd, &key, value) == -1);

	assert(map_equal(lru_fd, plain_fd));

	close(plain_fd);
	close(lru_fd);

	printf("Pass\n");
}
/* Size of the LRU map is 1.5*tgt_free
* Insert 1 to tgt_free (+tgt_free keys)
* Lookup 1 to tgt_free/2
* Insert 1+tgt_free to 2*tgt_free (+tgt_free keys)
* => 1+tgt_free/2 to tgt_free will be removed by LRU
*/
/* Eviction of unreferenced keys from a map of 1.5 * tgt_free entries.
 *
 * @map_type:  LRU map type under test
 * @map_flags: map creation flags; the test is skipped silently when
 *             BPF_F_NO_COMMON_LRU is set
 * @tgt_free:  free target of the LRU list; must be even
 *
 * Any failed expectation aborts through assert().
 */
static void test_lru_sanity1(int map_type, int map_flags, unsigned int tgt_free)
{
	unsigned long long key, end_key, value[nr_cpus];
	unsigned int half, capacity;
	int lru_fd, plain_fd;

	/* The percpu lru list (i.e. each cpu has its own LRU list)
	 * does not have a local free list.  Hence, it will only free
	 * old nodes till there is no free node from the LRU list, and
	 * this test does not apply to BPF_F_NO_COMMON_LRU.
	 */
	if (map_flags & BPF_F_NO_COMMON_LRU)
		return;

	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
	       map_flags);

	assert(sched_next_online(0, 0) != -1);

	half = tgt_free / 2;
	assert(half * 2 == tgt_free);
	capacity = tgt_free + half;

	lru_fd = create_map(map_type, map_flags, capacity);
	assert(lru_fd != -1);

	plain_fd = create_map(BPF_MAP_TYPE_HASH, 0, capacity);
	assert(plain_fd != -1);

	value[0] = 1234;

	/* Insert 1 to tgt_free (+tgt_free keys) */
	end_key = 1 + tgt_free;
	key = 1;
	while (key < end_key) {
		assert(!bpf_map_update(lru_fd, &key, value, BPF_NOEXIST));
		key++;
	}

	/* Lookup 1 to tgt_free/2; these keys are expected to survive */
	end_key = 1 + half;
	key = 1;
	while (key < end_key) {
		assert(!bpf_map_lookup(lru_fd, &key, value));
		assert(!bpf_map_update(plain_fd, &key, value,
				       BPF_NOEXIST));
		key++;
	}

	/* Insert 1+tgt_free to 2*tgt_free
	 * => 1+tgt_free/2 to tgt_free will be removed by LRU
	 */
	key = 1 + tgt_free;
	end_key = key + tgt_free;
	while (key < end_key) {
		assert(!bpf_map_update(lru_fd, &key, value, BPF_NOEXIST));
		assert(!bpf_map_update(plain_fd, &key, value,
				       BPF_NOEXIST));
		key++;
	}

	assert(map_equal(lru_fd, plain_fd));

	close(plain_fd);
	close(lru_fd);

	printf("Pass\n");
}
/* Size of the LRU map 1.5 * tgt_free
* Insert 1 to tgt_free (+tgt_free keys)
* Update 1 to tgt_free/2
* => The original 1 to tgt_free/2 will be removed due to
* the LRU shrink process
* Re-insert 1 to tgt_free/2 again and do a lookup immediately
* Insert 1+tgt_free to tgt_free*3/2
* Insert 1+tgt_free*3/2 to tgt_free*5/2
* => Key 1+tgt_free to tgt_free*3/2
* will be removed from LRU because it has never
* been looked up and its ref bit is not set
*/
/* LRU shrink followed by re-insertion on a map of 1.5 * tgt_free entries.
 *
 * @map_type:  LRU map type under test
 * @map_flags: map creation flags; the test is skipped silently when
 *             BPF_F_NO_COMMON_LRU is set
 * @tgt_free:  free target of the LRU list; must be even
 *
 * Any failed expectation aborts through assert().
 */
static void test_lru_sanity2(int map_type, int map_flags, unsigned int tgt_free)
{
	unsigned long long key, value[nr_cpus];
	unsigned long long end_key;
	int lru_map_fd, expected_map_fd;
	unsigned int batch_size;
	unsigned int map_size;

	if (map_flags & BPF_F_NO_COMMON_LRU)
		/* The percpu lru list (i.e. each cpu has its own LRU
		 * list) does not have a local free list. Hence,
		 * it will only free old nodes till there is no free
		 * node from the LRU list. Hence, this test does not
		 * apply to BPF_F_NO_COMMON_LRU
		 */
		return;

	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
	       map_flags);

	assert(sched_next_online(0, 0) != -1);

	batch_size = tgt_free / 2;
	assert(batch_size * 2 == tgt_free);

	map_size = tgt_free + batch_size;
	/* BPF_F_NO_COMMON_LRU has already returned above, so there is
	 * no per-cpu sizing to do here.
	 */
	lru_map_fd = create_map(map_type, map_flags, map_size);
	assert(lru_map_fd != -1);

	expected_map_fd = create_map(BPF_MAP_TYPE_HASH, 0, map_size);
	assert(expected_map_fd != -1);

	value[0] = 1234;

	/* Insert 1 to tgt_free (+tgt_free keys) */
	end_key = 1 + tgt_free;
	for (key = 1; key < end_key; key++)
		assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));

	/* Any bpf_map_update will require to acquire a new node
	 * from LRU first.
	 *
	 * The local list is running out of free nodes.
	 * It gets from the global LRU list which tries to
	 * shrink the inactive list to get tgt_free
	 * number of free nodes.
	 *
	 * Hence, the oldest key 1 to tgt_free/2
	 * are removed from the LRU list.
	 */
	key = 1;
	if (map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) {
		/* Here key=1 is expected to be insertable again; it is
		 * deleted right away so the re-insert loop below starts
		 * from a map without it.
		 */
		assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
		assert(!bpf_map_delete(lru_map_fd, &key));
	} else {
		/* Here a BPF_EXIST update of key=1 is expected to fail */
		assert(bpf_map_update(lru_map_fd, &key, value, BPF_EXIST));
	}

	/* Re-insert 1 to tgt_free/2 again and do a lookup
	 * immediately.
	 */
	end_key = 1 + batch_size;
	value[0] = 4321;
	for (key = 1; key < end_key; key++) {
		assert(bpf_map_lookup(lru_map_fd, &key, value));
		assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
		assert(!bpf_map_lookup(lru_map_fd, &key, value));
		assert(value[0] == 4321);
		assert(!bpf_map_update(expected_map_fd, &key, value,
				       BPF_NOEXIST));
	}

	value[0] = 1234;

	/* Insert 1+tgt_free to tgt_free*3/2 */
	end_key = 1 + tgt_free + batch_size;
	for (key = 1 + tgt_free; key < end_key; key++)
		/* These newly added but not referenced keys will be
		 * gone during the next LRU shrink.
		 */
		assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));

	/* Insert 1+tgt_free*3/2 to tgt_free*5/2 */
	end_key = key + tgt_free;
	for (; key < end_key; key++) {
		assert(!bpf_map_update(lru_map_fd, &key, value, BPF_NOEXIST));
		assert(!bpf_map_update(expected_map_fd, &key, value,
				       BPF_NOEXIST));
	}

	assert(map_equal(lru_map_fd, expected_map_fd));

	close(expected_map_fd);
	close(lru_map_fd);

	printf("Pass\n");
}
/* Size of the LRU map is 2*tgt_free
* It is to test the active/inactive list rotation
* Insert 1 to 2*tgt_free (+2*tgt_free keys)
* Lookup key 1 to tgt_free*3/2
* Add 1+2*tgt_free to tgt_free*5/2 (+tgt_free/2 keys)
* => key 1+tgt_free*3/2 to 2*tgt_free are removed from LRU
*/
/* Active/inactive rotation on a map of 2 * tgt_free entries.
 *
 * @map_type:  LRU map type under test
 * @map_flags: map creation flags; with BPF_F_NO_COMMON_LRU the LRU map
 *             size is multiplied by nr_cpus
 * @tgt_free:  free target of the LRU list; must be even
 *
 * Any failed expectation aborts through assert().
 */
static void test_lru_sanity3(int map_type, int map_flags, unsigned int tgt_free)
{
	unsigned long long key, end_key, value[nr_cpus];
	unsigned int half, capacity;
	int lru_fd, plain_fd;

	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
	       map_flags);

	assert(sched_next_online(0, 0) != -1);

	half = tgt_free / 2;
	assert(half * 2 == tgt_free);
	capacity = tgt_free * 2;

	lru_fd = create_map(map_type, map_flags,
			    (map_flags & BPF_F_NO_COMMON_LRU) ?
			    capacity * nr_cpus : capacity);
	assert(lru_fd != -1);

	plain_fd = create_map(BPF_MAP_TYPE_HASH, 0, capacity);
	assert(plain_fd != -1);

	value[0] = 1234;

	/* Insert 1 to 2*tgt_free (+2*tgt_free keys) */
	end_key = 1 + (2 * tgt_free);
	key = 1;
	while (key < end_key) {
		assert(!bpf_map_update(lru_fd, &key, value, BPF_NOEXIST));
		key++;
	}

	/* Look up keys starting at 1 and stopping before
	 * tgt_free + tgt_free/2 (the bound is exclusive), recording
	 * each of them as expected to survive.
	 */
	end_key = tgt_free + half;
	key = 1;
	while (key < end_key) {
		assert(!bpf_map_lookup(lru_fd, &key, value));
		assert(!bpf_map_update(plain_fd, &key, value,
				       BPF_NOEXIST));
		key++;
	}

	/* Add 1+2*tgt_free to tgt_free*5/2
	 * (+tgt_free/2 keys)
	 */
	key = 2 * tgt_free + 1;
	end_key = key + half;
	while (key < end_key) {
		assert(!bpf_map_update(lru_fd, &key, value, BPF_NOEXIST));
		assert(!bpf_map_update(plain_fd, &key, value,
				       BPF_NOEXIST));
		key++;
	}

	assert(map_equal(lru_fd, plain_fd));

	close(plain_fd);
	close(lru_fd);

	printf("Pass\n");
}
/* Test deletion */
/* Explicit deletion on a map of 3 * tgt_free entries.
 *
 * @map_type:  LRU map type under test
 * @map_flags: map creation flags; with BPF_F_NO_COMMON_LRU the LRU map
 *             size is multiplied by nr_cpus
 * @tgt_free:  free target of the LRU list
 *
 * Any failed expectation aborts through assert().
 */
static void test_lru_sanity4(int map_type, int map_flags, unsigned int tgt_free)
{
	unsigned long long key, value[nr_cpus];
	unsigned long long end_key;
	unsigned int two_tgt = 2 * tgt_free;
	int lru_fd, plain_fd;

	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
	       map_flags);

	assert(sched_next_online(0, 0) != -1);

	lru_fd = create_map(map_type, map_flags,
			    (map_flags & BPF_F_NO_COMMON_LRU) ?
			    3 * tgt_free * nr_cpus : 3 * tgt_free);
	assert(lru_fd != -1);

	plain_fd = create_map(BPF_MAP_TYPE_HASH, 0, 3 * tgt_free);
	assert(plain_fd != -1);

	value[0] = 1234;

	/* Fill keys 1 to 2*tgt_free */
	key = 1;
	while (key <= two_tgt) {
		assert(!bpf_map_update(lru_fd, &key, value, BPF_NOEXIST));
		key++;
	}

	/* key=1 is still present, so a BPF_NOEXIST insert must fail */
	key = 1;
	assert(bpf_map_update(lru_fd, &key, value, BPF_NOEXIST));

	/* Keys 1 to tgt_free are looked up and expected to remain */
	for (key = 1; key <= tgt_free; key++) {
		assert(!bpf_map_lookup(lru_fd, &key, value));
		assert(!bpf_map_update(plain_fd, &key, value,
				       BPF_NOEXIST));
	}

	/* Keys 1+tgt_free to 2*tgt_free: the first delete succeeds,
	 * a second delete of the same key must fail.
	 */
	while (key <= two_tgt) {
		assert(!bpf_map_delete(lru_fd, &key));
		assert(bpf_map_delete(lru_fd, &key));
		key++;
	}

	/* Insert another 2*tgt_free fresh keys into both maps */
	end_key = key + two_tgt;
	while (key < end_key) {
		assert(!bpf_map_update(lru_fd, &key, value, BPF_NOEXIST));
		assert(!bpf_map_update(plain_fd, &key, value,
				       BPF_NOEXIST));
		key++;
	}

	assert(map_equal(lru_fd, plain_fd));

	close(plain_fd);
	close(lru_fd);

	printf("Pass\n");
}
/* One step of the single-element test.
 *
 * @last_key: key that is expected to be in the map on entry
 * @map_fd:   map under test
 *
 * Confirms @last_key is present, inserts @last_key + 1, and then
 * confirms that the new key is present and @last_key no longer is.
 */
static void do_test_lru_sanity5(unsigned long long last_key, int map_fd)
{
	unsigned long long new_key = last_key + 1;
	unsigned long long value[nr_cpus];

	/* Ensure the last key inserted by previous CPU can be found */
	assert(!bpf_map_lookup(map_fd, &last_key, value));

	value[0] = 1234;

	assert(!bpf_map_update(map_fd, &new_key, value, BPF_NOEXIST));
	assert(!bpf_map_lookup(map_fd, &new_key, value));

	/* Cannot find the last key because it was removed by LRU */
	assert(bpf_map_lookup(map_fd, &last_key, value));
}
/* Test map with only one element */
/* Single-element map exercised from one forked child per CPU slot.
 *
 * @map_type:  LRU map type under test
 * @map_flags: map creation flags; the test is skipped silently when
 *             BPF_F_NO_COMMON_LRU is set
 *
 * Each child tries to bind itself to the next usable CPU and, if that
 * works, runs do_test_lru_sanity5() on the current key.  The parent
 * waits for the child, requires a zero wait status and then moves on
 * to the next key.
 */
static void test_lru_sanity5(int map_type, int map_flags)
{
	unsigned long long key, value[nr_cpus];
	int cpu_cursor = 0;
	int map_fd;
	int i;

	if (map_flags & BPF_F_NO_COMMON_LRU)
		return;

	printf("%s (map_type:%d map_flags:0x%X): ", __func__, map_type,
	       map_flags);

	map_fd = create_map(map_type, map_flags, 1);
	assert(map_fd != -1);

	value[0] = 1234;
	key = 0;
	assert(!bpf_map_update(map_fd, &key, value, BPF_NOEXIST));

	for (i = 0; i < nr_cpus; i++) {
		int status;
		pid_t child = fork();

		if (child == -1) {
			printf("couldn't spawn #%d process\n", i);
			exit(1);
		}

		if (child == 0) {
			cpu_cursor = sched_next_online(0, cpu_cursor);
			if (cpu_cursor != -1)
				do_test_lru_sanity5(key, map_fd);
			exit(0);
		}

		/* It is mostly redundant and just allows the parent
		 * process to update cpu_cursor for the next child
		 * process
		 */
		cpu_cursor = sched_next_online(child, cpu_cursor);

		assert(waitpid(child, &status, 0) == child);
		assert(status == 0);
		key++;
	}

	close(map_fd);

	printf("Pass\n");
}
/* Run every sanity test for each combination of LRU map type and
 * LRU-list flavour (common list vs. BPF_F_NO_COMMON_LRU).
 *
 * RLIMIT_MEMLOCK is raised to unlimited before any map is created.
 * Returns 0; any failed expectation aborts through assert().
 */
int main(int argc, char **argv)
{
	struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
	int map_types[] = {BPF_MAP_TYPE_LRU_HASH,
			   BPF_MAP_TYPE_LRU_PERCPU_HASH};
	int map_flags[] = {0, BPF_F_NO_COMMON_LRU};
	/* size_t so the indices compare cleanly against sizeof-based bounds */
	size_t t, f;

	/* Unbuffered stdout keeps output ordered across fork()ed children */
	setbuf(stdout, NULL);

	assert(!setrlimit(RLIMIT_MEMLOCK, &r));

	nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
	/* nr_cpus sizes the value[] VLAs in every test, so it must be
	 * strictly positive, not merely different from -1.
	 */
	assert(nr_cpus > 0);
	printf("nr_cpus:%d\n\n", nr_cpus);

	for (f = 0; f < sizeof(map_flags) / sizeof(*map_flags); f++) {
		/* The free target depends on the LRU-list flavour */
		unsigned int tgt_free = (map_flags[f] & BPF_F_NO_COMMON_LRU) ?
			PERCPU_FREE_TARGET : LOCAL_FREE_TARGET;

		for (t = 0; t < sizeof(map_types) / sizeof(*map_types); t++) {
			test_lru_sanity0(map_types[t], map_flags[f]);
			test_lru_sanity1(map_types[t], map_flags[f], tgt_free);
			test_lru_sanity2(map_types[t], map_flags[f], tgt_free);
			test_lru_sanity3(map_types[t], map_flags[f], tgt_free);
			test_lru_sanity4(map_types[t], map_flags[f], tgt_free);
			test_lru_sanity5(map_types[t], map_flags[f]);

			printf("\n");
		}
	}

	return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment