Commit e255d327 authored by Daniel Borkmann's avatar Daniel Borkmann Committed by Alexei Starovoitov

Merge branch 'bpf-ring-buffer'

Andrii Nakryiko says:

====================
Implement a new BPF ring buffer, as presented at BPF virtual conference ([0]).
It presents an alternative to perf buffer, following its semantics closely,
but allowing sharing same instance of ring buffer across multiple CPUs
efficiently.

Most patches have extensive commentary explaining various aspects, so I'll
keep cover letter short. Overall structure of the patch set:
- patch #1 adds BPF ring buffer implementation to kernel and necessary
  verifier support;
- patch #2 adds libbpf consumer implementation for BPF ringbuf;
- patch #3 adds selftest, both for single BPF ring buf use case, as well as
  using it with array/hash of maps;
- patch #4 adds extensive benchmarks and provides some analysis in the commit
  message; it builds upon selftests/bpf's bench runner.
- patch #5 adds most of patch #1 commit message as a doc under
  Documentation/bpf/ringbuf.rst.

Litmus tests, validating consumer/producer protocols and memory orderings,
were moved out as discussed in [1] and are going to be posted against -rcu
tree and put under Documentation/litmus-tests/bpf-rb.

  [0] https://docs.google.com/presentation/d/18ITdg77Bj6YDOH2LghxrnFxiPWe0fAqcmJY95t_qr0w
  [1] https://lkml.org/lkml/2020/5/22/1011

v3->v4:
- fix ringbuf freeing (vunmap, __free_page); verified with a trivial loop
  creating and closing ringbuf map endlessly (Daniel);

v2->v3:
- dropped unnecessary smp_wmb() (Paul);
- verifier reference type enhancement patch was dropped (Alexei);
- better verifier message for various memory access checks (Alexei);
- clarified a bit roundup_len() bit shifting (Alexei);
- converted doc to .rst (Alexei);
- fixed warning on 32-bit arches regarding tautological ring area size check.

v1->v2:
- commit()/discard()/output() accept flags (NO_WAKEUP/FORCE_WAKEUP) (Stanislav);
- bpf_ringbuf_query() added, returning available data size, ringbuf size,
  consumer/producer positions, needed to implement smarter notification policy
  (Stanislav);
- added ringbuf UAPI constants to include/uapi/linux/bpf.h (Jonathan);
- fixed sample size check, added proper ringbuf size check (Jonathan, Alexei);
- wake_up_all() is done through irq_work (Alexei);
- consistent use of smp_load_acquire/smp_store_release, no
  READ_ONCE/WRITE_ONCE (Alexei);
- added Documentation/bpf/ringbuf.txt (Stanislav);
- updated litmus test with smp_load_acquire/smp_store_release changes;
- added ring_buffer__consume() API to libbpf for busy-polling;
- ring_buffer__poll() on success returns number of records consumed;
- fixed EPOLL notifications, don't assume available data, done similarly to
  perfbuf's implementation;
- both ringbuf and perfbuf now have --rb-sampled mode, instead of
  pb-raw/pb-custom mode, updated benchmark results;
- extended ringbuf selftests to validate epoll logic/manual notification
  logic, as well as bpf_ringbuf_query().
====================
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
parents 43dd115b 97abb2b3
This diff is collapsed.
...@@ -90,6 +90,8 @@ struct bpf_map_ops { ...@@ -90,6 +90,8 @@ struct bpf_map_ops {
int (*map_direct_value_meta)(const struct bpf_map *map, int (*map_direct_value_meta)(const struct bpf_map *map,
u64 imm, u32 *off); u64 imm, u32 *off);
int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma); int (*map_mmap)(struct bpf_map *map, struct vm_area_struct *vma);
__poll_t (*map_poll)(struct bpf_map *map, struct file *filp,
struct poll_table_struct *pts);
}; };
struct bpf_map_memory { struct bpf_map_memory {
...@@ -244,6 +246,9 @@ enum bpf_arg_type { ...@@ -244,6 +246,9 @@ enum bpf_arg_type {
ARG_PTR_TO_LONG, /* pointer to long */ ARG_PTR_TO_LONG, /* pointer to long */
ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock (fullsock) */
ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */ ARG_PTR_TO_BTF_ID, /* pointer to in-kernel struct */
ARG_PTR_TO_ALLOC_MEM, /* pointer to dynamically allocated memory */
ARG_PTR_TO_ALLOC_MEM_OR_NULL, /* pointer to dynamically allocated memory or NULL */
ARG_CONST_ALLOC_SIZE_OR_ZERO, /* number of allocated bytes requested */
}; };
/* type of values returned from helper functions */ /* type of values returned from helper functions */
...@@ -255,6 +260,7 @@ enum bpf_return_type { ...@@ -255,6 +260,7 @@ enum bpf_return_type {
RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */
RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */
RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */ RET_PTR_TO_SOCK_COMMON_OR_NULL, /* returns a pointer to a sock_common or NULL */
RET_PTR_TO_ALLOC_MEM_OR_NULL, /* returns a pointer to dynamically allocated memory or NULL */
}; };
/* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs
...@@ -322,6 +328,8 @@ enum bpf_reg_type { ...@@ -322,6 +328,8 @@ enum bpf_reg_type {
PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */ PTR_TO_XDP_SOCK, /* reg points to struct xdp_sock */
PTR_TO_BTF_ID, /* reg points to kernel struct */ PTR_TO_BTF_ID, /* reg points to kernel struct */
PTR_TO_BTF_ID_OR_NULL, /* reg points to kernel struct or NULL */ PTR_TO_BTF_ID_OR_NULL, /* reg points to kernel struct or NULL */
PTR_TO_MEM, /* reg points to valid memory region */
PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */
}; };
/* The information passed from prog-specific *_is_valid_access /* The information passed from prog-specific *_is_valid_access
...@@ -1611,6 +1619,11 @@ extern const struct bpf_func_proto bpf_tcp_sock_proto; ...@@ -1611,6 +1619,11 @@ extern const struct bpf_func_proto bpf_tcp_sock_proto;
extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_jiffies64_proto;
extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto;
extern const struct bpf_func_proto bpf_event_output_data_proto; extern const struct bpf_func_proto bpf_event_output_data_proto;
extern const struct bpf_func_proto bpf_ringbuf_output_proto;
extern const struct bpf_func_proto bpf_ringbuf_reserve_proto;
extern const struct bpf_func_proto bpf_ringbuf_submit_proto;
extern const struct bpf_func_proto bpf_ringbuf_discard_proto;
extern const struct bpf_func_proto bpf_ringbuf_query_proto;
const struct bpf_func_proto *bpf_tracing_func_proto( const struct bpf_func_proto *bpf_tracing_func_proto(
enum bpf_func_id func_id, const struct bpf_prog *prog); enum bpf_func_id func_id, const struct bpf_prog *prog);
......
...@@ -118,6 +118,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) ...@@ -118,6 +118,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops)
#if defined(CONFIG_BPF_JIT) #if defined(CONFIG_BPF_JIT)
BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_STRUCT_OPS, bpf_struct_ops_map_ops)
#endif #endif
BPF_MAP_TYPE(BPF_MAP_TYPE_RINGBUF, ringbuf_map_ops)
BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint) BPF_LINK_TYPE(BPF_LINK_TYPE_RAW_TRACEPOINT, raw_tracepoint)
BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing) BPF_LINK_TYPE(BPF_LINK_TYPE_TRACING, tracing)
......
...@@ -54,6 +54,8 @@ struct bpf_reg_state { ...@@ -54,6 +54,8 @@ struct bpf_reg_state {
u32 btf_id; /* for PTR_TO_BTF_ID */ u32 btf_id; /* for PTR_TO_BTF_ID */
u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */
/* Max size from any of the above. */ /* Max size from any of the above. */
unsigned long raw; unsigned long raw;
}; };
...@@ -63,6 +65,8 @@ struct bpf_reg_state { ...@@ -63,6 +65,8 @@ struct bpf_reg_state {
* offset, so they can share range knowledge. * offset, so they can share range knowledge.
* For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we
* came from, when one is tested for != NULL. * came from, when one is tested for != NULL.
* For PTR_TO_MEM_OR_NULL this is used to identify memory allocation
* for the purpose of tracking that it's freed.
* For PTR_TO_SOCKET this is used to share which pointers retain the * For PTR_TO_SOCKET this is used to share which pointers retain the
* same reference to the socket, to determine proper reference freeing. * same reference to the socket, to determine proper reference freeing.
*/ */
......
...@@ -147,6 +147,7 @@ enum bpf_map_type { ...@@ -147,6 +147,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_SK_STORAGE, BPF_MAP_TYPE_SK_STORAGE,
BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_DEVMAP_HASH,
BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_STRUCT_OPS,
BPF_MAP_TYPE_RINGBUF,
}; };
/* Note that tracing related programs such as /* Note that tracing related programs such as
...@@ -3157,6 +3158,59 @@ union bpf_attr { ...@@ -3157,6 +3158,59 @@ union bpf_attr {
* **bpf_sk_cgroup_id**\ (). * **bpf_sk_cgroup_id**\ ().
* Return * Return
* The id is returned or 0 in case the id could not be retrieved. * The id is returned or 0 in case the id could not be retrieved.
*
* int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags)
* Description
* Copy *size* bytes from *data* into a ring buffer *ringbuf*.
* If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of
* new data availability is sent.
* If BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of
* new data availability is sent unconditionally.
* Return
* 0, on success;
* < 0, on error.
*
* void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags)
* Description
* Reserve *size* bytes of payload in a ring buffer *ringbuf*.
* Return
* Valid pointer with *size* bytes of memory available; NULL,
* otherwise.
*
* void bpf_ringbuf_submit(void *data, u64 flags)
* Description
* Submit reserved ring buffer sample, pointed to by *data*.
* If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of
* new data availability is sent.
* If BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of
* new data availability is sent unconditionally.
* Return
* Nothing. Always succeeds.
*
* void bpf_ringbuf_discard(void *data, u64 flags)
* Description
* Discard reserved ring buffer sample, pointed to by *data*.
* If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of
* new data availability is sent.
* If BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of
* new data availability is sent unconditionally.
* Return
* Nothing. Always succeeds.
*
* u64 bpf_ringbuf_query(void *ringbuf, u64 flags)
* Description
* Query various characteristics of provided ring buffer. What
* exactly is queried is determined by *flags*:
* - BPF_RB_AVAIL_DATA - amount of data not yet consumed;
* - BPF_RB_RING_SIZE - the size of ring buffer;
* - BPF_RB_CONS_POS - consumer position (can wrap around);
* - BPF_RB_PROD_POS - producer(s) position (can wrap around);
* Data returned is just a momentary snapshot of actual values
* and could be inaccurate, so this facility should be used to
* power heuristics and for reporting, not to make 100% correct
* calculation.
* Return
* Requested value, or 0, if flags are not recognized.
*/ */
#define __BPF_FUNC_MAPPER(FN) \ #define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \ FN(unspec), \
...@@ -3288,7 +3342,12 @@ union bpf_attr { ...@@ -3288,7 +3342,12 @@ union bpf_attr {
FN(seq_printf), \ FN(seq_printf), \
FN(seq_write), \ FN(seq_write), \
FN(sk_cgroup_id), \ FN(sk_cgroup_id), \
FN(sk_ancestor_cgroup_id), FN(sk_ancestor_cgroup_id), \
FN(ringbuf_output), \
FN(ringbuf_reserve), \
FN(ringbuf_submit), \
FN(ringbuf_discard), \
FN(ringbuf_query),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper /* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call * function eBPF program intends to call
...@@ -3398,6 +3457,29 @@ enum { ...@@ -3398,6 +3457,29 @@ enum {
BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0),
}; };
/* BPF_FUNC_ringbuf_submit, BPF_FUNC_ringbuf_discard, and
* BPF_FUNC_ringbuf_output flags.
*/
enum {
BPF_RB_NO_WAKEUP = (1ULL << 0),
BPF_RB_FORCE_WAKEUP = (1ULL << 1),
};
/* BPF_FUNC_ringbuf_query flags */
enum {
BPF_RB_AVAIL_DATA = 0,
BPF_RB_RING_SIZE = 1,
BPF_RB_CONS_POS = 2,
BPF_RB_PROD_POS = 3,
};
/* BPF ring buffer constants */
enum {
BPF_RINGBUF_BUSY_BIT = (1U << 31),
BPF_RINGBUF_DISCARD_BIT = (1U << 30),
BPF_RINGBUF_HDR_SZ = 8,
};
/* Mode for BPF_FUNC_skb_adjust_room helper. */ /* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode { enum bpf_adj_room_mode {
BPF_ADJ_ROOM_NET, BPF_ADJ_ROOM_NET,
......
...@@ -4,7 +4,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init) ...@@ -4,7 +4,7 @@ CFLAGS_core.o += $(call cc-disable-warning, override-init)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o
obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_JIT) += trampoline.o
obj-$(CONFIG_BPF_SYSCALL) += btf.o obj-$(CONFIG_BPF_SYSCALL) += btf.o
......
...@@ -635,6 +635,16 @@ bpf_base_func_proto(enum bpf_func_id func_id) ...@@ -635,6 +635,16 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ktime_get_ns_proto; return &bpf_ktime_get_ns_proto;
case BPF_FUNC_ktime_get_boot_ns: case BPF_FUNC_ktime_get_boot_ns:
return &bpf_ktime_get_boot_ns_proto; return &bpf_ktime_get_boot_ns_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:
return &bpf_ringbuf_reserve_proto;
case BPF_FUNC_ringbuf_submit:
return &bpf_ringbuf_submit_proto;
case BPF_FUNC_ringbuf_discard:
return &bpf_ringbuf_discard_proto;
case BPF_FUNC_ringbuf_query:
return &bpf_ringbuf_query_proto;
default: default:
break; break;
} }
......
This diff is collapsed.
...@@ -26,6 +26,7 @@ ...@@ -26,6 +26,7 @@
#include <linux/audit.h> #include <linux/audit.h>
#include <uapi/linux/btf.h> #include <uapi/linux/btf.h>
#include <linux/bpf_lsm.h> #include <linux/bpf_lsm.h>
#include <linux/poll.h>
#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
(map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
...@@ -662,6 +663,16 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) ...@@ -662,6 +663,16 @@ static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma)
return err; return err;
} }
static __poll_t bpf_map_poll(struct file *filp, struct poll_table_struct *pts)
{
struct bpf_map *map = filp->private_data;
if (map->ops->map_poll)
return map->ops->map_poll(map, filp, pts);
return EPOLLERR;
}
const struct file_operations bpf_map_fops = { const struct file_operations bpf_map_fops = {
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_map_show_fdinfo, .show_fdinfo = bpf_map_show_fdinfo,
...@@ -670,6 +681,7 @@ const struct file_operations bpf_map_fops = { ...@@ -670,6 +681,7 @@ const struct file_operations bpf_map_fops = {
.read = bpf_dummy_read, .read = bpf_dummy_read,
.write = bpf_dummy_write, .write = bpf_dummy_write,
.mmap = bpf_map_mmap, .mmap = bpf_map_mmap,
.poll = bpf_map_poll,
}; };
int bpf_map_new_fd(struct bpf_map *map, int flags) int bpf_map_new_fd(struct bpf_map *map, int flags)
......
This diff is collapsed.
...@@ -1088,6 +1088,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) ...@@ -1088,6 +1088,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_perf_event_read_value_proto; return &bpf_perf_event_read_value_proto;
case BPF_FUNC_get_ns_current_pid_tgid: case BPF_FUNC_get_ns_current_pid_tgid:
return &bpf_get_ns_current_pid_tgid_proto; return &bpf_get_ns_current_pid_tgid_proto;
case BPF_FUNC_ringbuf_output:
return &bpf_ringbuf_output_proto;
case BPF_FUNC_ringbuf_reserve:
return &bpf_ringbuf_reserve_proto;
case BPF_FUNC_ringbuf_submit:
return &bpf_ringbuf_submit_proto;
case BPF_FUNC_ringbuf_discard:
return &bpf_ringbuf_discard_proto;
case BPF_FUNC_ringbuf_query:
return &bpf_ringbuf_query_proto;
default: default:
return NULL; return NULL;
} }
......
...@@ -147,6 +147,7 @@ enum bpf_map_type { ...@@ -147,6 +147,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_SK_STORAGE, BPF_MAP_TYPE_SK_STORAGE,
BPF_MAP_TYPE_DEVMAP_HASH, BPF_MAP_TYPE_DEVMAP_HASH,
BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_STRUCT_OPS,
BPF_MAP_TYPE_RINGBUF,
}; };
/* Note that tracing related programs such as /* Note that tracing related programs such as
...@@ -3157,6 +3158,59 @@ union bpf_attr { ...@@ -3157,6 +3158,59 @@ union bpf_attr {
* **bpf_sk_cgroup_id**\ (). * **bpf_sk_cgroup_id**\ ().
* Return * Return
* The id is returned or 0 in case the id could not be retrieved. * The id is returned or 0 in case the id could not be retrieved.
*
* int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags)
* Description
* Copy *size* bytes from *data* into a ring buffer *ringbuf*.
* If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of
* new data availability is sent.
* If BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of
* new data availability is sent unconditionally.
* Return
* 0, on success;
* < 0, on error.
*
* void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags)
* Description
* Reserve *size* bytes of payload in a ring buffer *ringbuf*.
* Return
* Valid pointer with *size* bytes of memory available; NULL,
* otherwise.
*
* void bpf_ringbuf_submit(void *data, u64 flags)
* Description
* Submit reserved ring buffer sample, pointed to by *data*.
* If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of
* new data availability is sent.
* If BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of
* new data availability is sent unconditionally.
* Return
* Nothing. Always succeeds.
*
* void bpf_ringbuf_discard(void *data, u64 flags)
* Description
* Discard reserved ring buffer sample, pointed to by *data*.
* If BPF_RB_NO_WAKEUP is specified in *flags*, no notification of
* new data availability is sent.
* If BPF_RB_FORCE_WAKEUP is specified in *flags*, notification of
* new data availability is sent unconditionally.
* Return
* Nothing. Always succeeds.
*
* u64 bpf_ringbuf_query(void *ringbuf, u64 flags)
* Description
* Query various characteristics of provided ring buffer. What
* exactly is queried is determined by *flags*:
* - BPF_RB_AVAIL_DATA - amount of data not yet consumed;
* - BPF_RB_RING_SIZE - the size of ring buffer;
* - BPF_RB_CONS_POS - consumer position (can wrap around);
* - BPF_RB_PROD_POS - producer(s) position (can wrap around);
* Data returned is just a momentary snapshot of actual values
* and could be inaccurate, so this facility should be used to
* power heuristics and for reporting, not to make 100% correct
* calculation.
* Return
* Requested value, or 0, if flags are not recognized.
*/ */
#define __BPF_FUNC_MAPPER(FN) \ #define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \ FN(unspec), \
...@@ -3288,7 +3342,12 @@ union bpf_attr { ...@@ -3288,7 +3342,12 @@ union bpf_attr {
FN(seq_printf), \ FN(seq_printf), \
FN(seq_write), \ FN(seq_write), \
FN(sk_cgroup_id), \ FN(sk_cgroup_id), \
FN(sk_ancestor_cgroup_id), FN(sk_ancestor_cgroup_id), \
FN(ringbuf_output), \
FN(ringbuf_reserve), \
FN(ringbuf_submit), \
FN(ringbuf_discard), \
FN(ringbuf_query),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper /* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call * function eBPF program intends to call
...@@ -3398,6 +3457,29 @@ enum { ...@@ -3398,6 +3457,29 @@ enum {
BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0),
}; };
/* BPF_FUNC_ringbuf_submit, BPF_FUNC_ringbuf_discard, and
* BPF_FUNC_ringbuf_output flags.
*/
enum {
BPF_RB_NO_WAKEUP = (1ULL << 0),
BPF_RB_FORCE_WAKEUP = (1ULL << 1),
};
/* BPF_FUNC_ringbuf_query flags */
enum {
BPF_RB_AVAIL_DATA = 0,
BPF_RB_RING_SIZE = 1,
BPF_RB_CONS_POS = 2,
BPF_RB_PROD_POS = 3,
};
/* BPF ring buffer constants */
enum {
BPF_RINGBUF_BUSY_BIT = (1U << 31),
BPF_RINGBUF_DISCARD_BIT = (1U << 30),
BPF_RINGBUF_HDR_SZ = 8,
};
/* Mode for BPF_FUNC_skb_adjust_room helper. */ /* Mode for BPF_FUNC_skb_adjust_room helper. */
enum bpf_adj_room_mode { enum bpf_adj_room_mode {
BPF_ADJ_ROOM_NET, BPF_ADJ_ROOM_NET,
......
libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \ libbpf-y := libbpf.o bpf.o nlattr.o btf.o libbpf_errno.o str_error.o \
netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \ netlink.o bpf_prog_linfo.o libbpf_probes.o xsk.o hashmap.o \
btf_dump.o btf_dump.o ringbuf.o
...@@ -478,6 +478,27 @@ LIBBPF_API int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags); ...@@ -478,6 +478,27 @@ LIBBPF_API int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags);
LIBBPF_API int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info, LIBBPF_API int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info,
size_t info_size, __u32 flags); size_t info_size, __u32 flags);
/* Ring buffer APIs */
struct ring_buffer;
typedef int (*ring_buffer_sample_fn)(void *ctx, void *data, size_t size);
struct ring_buffer_opts {
size_t sz; /* size of this struct, for forward/backward compatibility */
};
#define ring_buffer_opts__last_field sz
LIBBPF_API struct ring_buffer *
ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx,
const struct ring_buffer_opts *opts);
LIBBPF_API void ring_buffer__free(struct ring_buffer *rb);
LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd,
ring_buffer_sample_fn sample_cb, void *ctx);
LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms);
LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb);
/* Perf buffer APIs */
struct perf_buffer; struct perf_buffer;
typedef void (*perf_buffer_sample_fn)(void *ctx, int cpu, typedef void (*perf_buffer_sample_fn)(void *ctx, int cpu,
......
...@@ -263,4 +263,9 @@ LIBBPF_0.0.9 { ...@@ -263,4 +263,9 @@ LIBBPF_0.0.9 {
bpf_link_get_next_id; bpf_link_get_next_id;
bpf_program__attach_iter; bpf_program__attach_iter;
perf_buffer__consume; perf_buffer__consume;
ring_buffer__add;
ring_buffer__consume;
ring_buffer__free;
ring_buffer__new;
ring_buffer__poll;
} LIBBPF_0.0.8; } LIBBPF_0.0.8;
...@@ -238,6 +238,11 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex) ...@@ -238,6 +238,11 @@ bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex)
if (btf_fd < 0) if (btf_fd < 0)
return false; return false;
break; break;
case BPF_MAP_TYPE_RINGBUF:
key_size = 0;
value_size = 0;
max_entries = 4096;
break;
case BPF_MAP_TYPE_UNSPEC: case BPF_MAP_TYPE_UNSPEC:
case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_HASH:
case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_ARRAY:
......
// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
/*
* Ring buffer operations.
*
* Copyright (C) 2020 Facebook, Inc.
*/
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <linux/err.h>
#include <linux/bpf.h>
#include <asm/barrier.h>
#include <sys/mman.h>
#include <sys/epoll.h>
#include <tools/libc_compat.h>
#include "libbpf.h"
#include "libbpf_internal.h"
#include "bpf.h"
/* make sure libbpf doesn't use kernel-only integer typedefs */
#pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64
/* Consumer-side state of one ring registered with a struct ring_buffer. */
struct ring {
ring_buffer_sample_fn sample_cb; /* invoked for every non-discarded sample */
void *ctx; /* opaque pointer handed back to sample_cb */
void *data; /* start of the data area, one page past producer_pos */
unsigned long *consumer_pos; /* writable mapping of the map fd at offset 0 */
unsigned long *producer_pos; /* read-only mapping of the map fd at offset page_size */
unsigned long mask; /* max_entries - 1; ANDed with a position to index data */
int map_fd; /* map fd this ring was mapped from */
};
/* Manager for a set of rings that are polled through one epoll instance. */
struct ring_buffer {
struct epoll_event *events; /* one slot per ring; also the epoll_wait() output array */
struct ring *rings; /* ring_cnt entries, indexed by the id stored in epoll data.fd */
size_t page_size; /* value of getpagesize() taken at creation */
int epoll_fd; /* epoll instance every ring's map fd is added to */
int ring_cnt; /* number of rings successfully added so far */
};
/* Release whichever of the ring's two mappings currently exist and reset the
 * pointers, so calling this again on the same ring does nothing.
 */
static void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r)
{
	size_t page_sz = rb->page_size;

	if (r->consumer_pos != NULL) {
		munmap(r->consumer_pos, page_sz);
		r->consumer_pos = NULL;
	}

	if (r->producer_pos == NULL)
		return;

	/* producer page plus the data area, which was mapped at twice its size */
	munmap(r->producer_pos, page_sz + 2 * (r->mask + 1));
	r->producer_pos = NULL;
}
/* Add extra RINGBUF maps to this ring buffer manager.
 *
 * Validates that map_fd refers to a BPF_MAP_TYPE_RINGBUF map, grows the
 * rings/events arrays by one slot, maps the consumer page (read-write) and
 * the producer page plus data area (read-only), and registers map_fd with the
 * manager's epoll instance. sample_cb is later invoked with ctx for every
 * non-discarded sample found in this ring.
 *
 * Returns 0 on success or a negative errno-style code; on failure any mapping
 * created by this call is released and rb->ring_cnt is left unchanged.
 */
int ring_buffer__add(struct ring_buffer *rb, int map_fd,
		     ring_buffer_sample_fn sample_cb, void *ctx)
{
	struct bpf_map_info info;
	__u32 len = sizeof(info);
	struct epoll_event *e;
	struct ring *r;
	__u64 mmap_sz;
	void *tmp;
	int err;

	memset(&info, 0, sizeof(info));

	err = bpf_obj_get_info_by_fd(map_fd, &info, &len);
	if (err) {
		err = -errno;
		pr_warn("ringbuf: failed to get map info for fd=%d: %d\n",
			map_fd, err);
		return err;
	}

	if (info.type != BPF_MAP_TYPE_RINGBUF) {
		pr_warn("ringbuf: map fd=%d is not BPF_MAP_TYPE_RINGBUF\n",
			map_fd);
		return -EINVAL;
	}

	/* max_entries is 32-bit, so doubling it must be done in 64 bits to
	 * avoid wrap-around; also make sure the total fits into size_t before
	 * anything is allocated or mapped.
	 */
	mmap_sz = rb->page_size + 2 * (__u64)info.max_entries;
	if (mmap_sz != (__u64)(size_t)mmap_sz) {
		pr_warn("ringbuf: ring buffer size (%u) is too big\n",
			info.max_entries);
		return -E2BIG;
	}

	tmp = reallocarray(rb->rings, rb->ring_cnt + 1, sizeof(*rb->rings));
	if (!tmp)
		return -ENOMEM;
	rb->rings = tmp;

	tmp = reallocarray(rb->events, rb->ring_cnt + 1, sizeof(*rb->events));
	if (!tmp)
		return -ENOMEM;
	rb->events = tmp;

	r = &rb->rings[rb->ring_cnt];
	memset(r, 0, sizeof(*r));

	r->map_fd = map_fd;
	r->sample_cb = sample_cb;
	r->ctx = ctx;
	r->mask = info.max_entries - 1;

	/* Map writable consumer page */
	tmp = mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
		   map_fd, 0);
	if (tmp == MAP_FAILED) {
		err = -errno;
		pr_warn("ringbuf: failed to mmap consumer page for map fd=%d: %d\n",
			map_fd, err);
		return err;
	}
	r->consumer_pos = tmp;

	/* Map read-only producer page and data pages. We map twice as big
	 * data size to allow simple reading of samples that wrap around the
	 * end of a ring buffer. See kernel implementation for details.
	 */
	tmp = mmap(NULL, (size_t)mmap_sz, PROT_READ, MAP_SHARED, map_fd,
		   rb->page_size);
	if (tmp == MAP_FAILED) {
		err = -errno;
		ringbuf_unmap_ring(rb, r);
		pr_warn("ringbuf: failed to mmap data pages for map fd=%d: %d\n",
			map_fd, err);
		return err;
	}
	r->producer_pos = tmp;
	r->data = tmp + rb->page_size;

	e = &rb->events[rb->ring_cnt];
	memset(e, 0, sizeof(*e));

	e->events = EPOLLIN;
	/* data.fd carries the ring's index in rb->rings, not a file descriptor */
	e->data.fd = rb->ring_cnt;
	if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, e) < 0) {
		err = -errno;
		ringbuf_unmap_ring(rb, r);
		pr_warn("ringbuf: failed to epoll add map fd=%d: %d\n",
			map_fd, err);
		return err;
	}

	rb->ring_cnt++;
	return 0;
}
/* Destroy a ring buffer manager: unmap every registered ring, close the epoll
 * instance and release all memory owned by the manager. NULL is accepted and
 * ignored.
 */
void ring_buffer__free(struct ring_buffer *rb)
{
	int idx = 0;

	if (rb == NULL)
		return;

	while (idx < rb->ring_cnt) {
		ringbuf_unmap_ring(rb, rb->rings + idx);
		idx++;
	}

	if (rb->epoll_fd >= 0)
		close(rb->epoll_fd);

	free(rb->events);
	free(rb->rings);
	free(rb);
}
/* Create a ring buffer manager with its own epoll instance and register
 * map_fd as its first ring. Returns the new manager, or NULL if the options
 * are invalid, allocation fails, epoll creation fails, or the ring cannot be
 * added; nothing is left allocated on failure.
 */
struct ring_buffer *
ring_buffer__new(int map_fd, ring_buffer_sample_fn sample_cb, void *ctx,
		 const struct ring_buffer_opts *opts)
{
	struct ring_buffer *mgr;
	int rc;

	if (!OPTS_VALID(opts, ring_buffer_opts))
		return NULL;

	mgr = calloc(1, sizeof(*mgr));
	if (mgr == NULL)
		return NULL;

	mgr->page_size = getpagesize();

	mgr->epoll_fd = epoll_create1(EPOLL_CLOEXEC);
	if (mgr->epoll_fd < 0) {
		rc = -errno;
		pr_warn("ringbuf: failed to create epoll instance: %d\n", rc);
		ring_buffer__free(mgr);
		return NULL;
	}

	rc = ring_buffer__add(mgr, map_fd, sample_cb, ctx);
	if (rc != 0) {
		ring_buffer__free(mgr);
		return NULL;
	}

	return mgr;
}
/* Turn a raw record header length into the number of bytes the record
 * occupies in the ring: flag bits removed, header added, padded to 8 bytes.
 */
static inline int roundup_len(__u32 len)
{
	/* keep only the low 30 bits; the top two carry the busy/discard flags */
	len &= (1U << 30) - 1;
	/* account for the record header in front of the payload */
	len += BPF_RINGBUF_HDR_SZ;
	/* pad to the next multiple of 8 */
	return (len + 7) & ~7U;
}
/* Drain all committed records currently visible in ring r.
 *
 * Walks records from the consumer position up to the producer position,
 * calling r->sample_cb for every record that does not have the discard bit
 * set and publishing the new consumer position after each record. The outer
 * loop re-reads the producer position as long as the previous pass consumed
 * something.
 *
 * Returns the number of samples passed to the callback, or the callback's
 * non-zero return value as soon as it returns one.
 */
static int ringbuf_process_ring(struct ring* r)
{
int *len_ptr, len, err, cnt = 0;
unsigned long cons_pos, prod_pos;
bool got_new_data;
void *sample;
cons_pos = smp_load_acquire(r->consumer_pos);
do {
got_new_data = false;
prod_pos = smp_load_acquire(r->producer_pos);
while (cons_pos < prod_pos) {
/* record header sits at the masked consumer position in the data area */
len_ptr = r->data + (cons_pos & r->mask);
len = smp_load_acquire(len_ptr);
/* sample not committed yet, bail out for now */
if (len & BPF_RINGBUF_BUSY_BIT)
goto done;
got_new_data = true;
/* advance past this record before the callback runs, so the record
 * counts as consumed even if the callback fails
 */
cons_pos += roundup_len(len);
if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) {
/* payload starts right after the header */
sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ;
err = r->sample_cb(r->ctx, sample, len);
if (err) {
/* update consumer pos and bail out */
smp_store_release(r->consumer_pos,
cons_pos);
return err;
}
cnt++;
}
/* publish progress after every record, discarded ones included */
smp_store_release(r->consumer_pos, cons_pos);
}
} while (got_new_data);
done:
return cnt;
}
/* Drain every registered ring once, without waiting on epoll.
 * Returns the total number of records consumed across all rings; if
 * processing a ring yields a negative value, that value is returned
 * immediately and the remaining rings are not visited.
 */
int ring_buffer__consume(struct ring_buffer *rb)
{
	int idx = 0, total = 0;

	while (idx < rb->ring_cnt) {
		int n = ringbuf_process_ring(&rb->rings[idx]);

		if (n < 0)
			return n;
		total += n;
		idx++;
	}

	return total;
}
/* Poll for available data and consume records, if any are available.
 *
 * Waits up to timeout_ms milliseconds on the manager's epoll instance and
 * then drains every ring that was reported ready.
 *
 * Returns number of records consumed, or negative number, if any of the
 * registered callbacks returned error or epoll_wait() itself failed (in
 * which case the value is -errno).
 */
int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms)
{
	int i, cnt, err, res = 0;

	cnt = epoll_wait(rb->epoll_fd, rb->events, rb->ring_cnt, timeout_ms);
	if (cnt < 0)
		return -errno;

	for (i = 0; i < cnt; i++) {
		/* data.fd holds the ring's index in rb->rings */
		__u32 ring_id = rb->events[i].data.fd;
		struct ring *ring = &rb->rings[ring_id];

		err = ringbuf_process_ring(ring);
		if (err < 0)
			return err;
		/* accumulate records consumed, not the number of ready events */
		res += err;
	}

	return res;
}
...@@ -413,12 +413,15 @@ $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h ...@@ -413,12 +413,15 @@ $(OUTPUT)/bench_%.o: benchs/bench_%.c bench.h
$(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@ $(CC) $(CFLAGS) -c $(filter %.c,$^) $(LDLIBS) -o $@
$(OUTPUT)/bench_rename.o: $(OUTPUT)/test_overhead.skel.h $(OUTPUT)/bench_rename.o: $(OUTPUT)/test_overhead.skel.h
$(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h $(OUTPUT)/bench_trigger.o: $(OUTPUT)/trigger_bench.skel.h
$(OUTPUT)/bench_ringbufs.o: $(OUTPUT)/ringbuf_bench.skel.h \
$(OUTPUT)/perfbuf_bench.skel.h
$(OUTPUT)/bench.o: bench.h testing_helpers.h $(OUTPUT)/bench.o: bench.h testing_helpers.h
$(OUTPUT)/bench: LDLIBS += -lm $(OUTPUT)/bench: LDLIBS += -lm
$(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \ $(OUTPUT)/bench: $(OUTPUT)/bench.o $(OUTPUT)/testing_helpers.o \
$(OUTPUT)/bench_count.o \ $(OUTPUT)/bench_count.o \
$(OUTPUT)/bench_rename.o \ $(OUTPUT)/bench_rename.o \
$(OUTPUT)/bench_trigger.o $(OUTPUT)/bench_trigger.o \
$(OUTPUT)/bench_ringbufs.o
$(call msg,BINARY,,$@) $(call msg,BINARY,,$@)
$(CC) $(LDFLAGS) -o $@ $(filter %.a %.o,$^) $(LDLIBS) $(CC) $(LDFLAGS) -o $@ $(filter %.a %.o,$^) $(LDLIBS)
......
...@@ -130,6 +130,13 @@ static const struct argp_option opts[] = { ...@@ -130,6 +130,13 @@ static const struct argp_option opts[] = {
{}, {},
}; };
extern struct argp bench_ringbufs_argp;
static const struct argp_child bench_parsers[] = {
{ &bench_ringbufs_argp, 0, "Ring buffers benchmark", 0 },
{},
};
static error_t parse_arg(int key, char *arg, struct argp_state *state) static error_t parse_arg(int key, char *arg, struct argp_state *state)
{ {
static int pos_args; static int pos_args;
...@@ -208,6 +215,7 @@ static void parse_cmdline_args(int argc, char **argv) ...@@ -208,6 +215,7 @@ static void parse_cmdline_args(int argc, char **argv)
.options = opts, .options = opts,
.parser = parse_arg, .parser = parse_arg,
.doc = argp_program_doc, .doc = argp_program_doc,
.children = bench_parsers,
}; };
if (argp_parse(&argp, argc, argv, 0, NULL, NULL)) if (argp_parse(&argp, argc, argv, 0, NULL, NULL))
exit(1); exit(1);
...@@ -310,6 +318,10 @@ extern const struct bench bench_trig_rawtp; ...@@ -310,6 +318,10 @@ extern const struct bench bench_trig_rawtp;
extern const struct bench bench_trig_kprobe; extern const struct bench bench_trig_kprobe;
extern const struct bench bench_trig_fentry; extern const struct bench bench_trig_fentry;
extern const struct bench bench_trig_fmodret; extern const struct bench bench_trig_fmodret;
extern const struct bench bench_rb_libbpf;
extern const struct bench bench_rb_custom;
extern const struct bench bench_pb_libbpf;
extern const struct bench bench_pb_custom;
static const struct bench *benchs[] = { static const struct bench *benchs[] = {
&bench_count_global, &bench_count_global,
...@@ -327,6 +339,10 @@ static const struct bench *benchs[] = { ...@@ -327,6 +339,10 @@ static const struct bench *benchs[] = {
&bench_trig_kprobe, &bench_trig_kprobe,
&bench_trig_fentry, &bench_trig_fentry,
&bench_trig_fmodret, &bench_trig_fmodret,
&bench_rb_libbpf,
&bench_rb_custom,
&bench_pb_libbpf,
&bench_pb_custom,
}; };
static void setup_benchmark() static void setup_benchmark()
......
This diff is collapsed.
#!/bin/bash
# Abort on command failure (-e), on use of unset variables (-u) and on a
# failing pipeline stage (-o pipefail); -f disables pathname expansion.
set -eufo pipefail
# Common benchmark invocation prefix; each run appends its own options and
# the benchmark name.
# NOTE(review): -w3/-d10/-a are presumably warm-up seconds, run duration and
# CPU-affinity pinning -- confirm against ./bench --help.
RUN_BENCH="sudo ./bench -w3 -d10 -a"
# Print the "hits" throughput field ("<mean> ± <stddev>M/s") found in the
# arguments. All arguments are joined into a single string before matching.
function hits()
{
	local text="$*"
	local extract="s/.*hits\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/"

	echo "$text" | sed -E "$extract"
}
# Print the "drops" throughput field ("<mean> ± <stddev>M/s") found in the
# arguments. All arguments are joined into a single string before matching.
function drops()
{
	local text="$*"
	local extract="s/.*drops\s+([0-9]+\.[0-9]+ ± [0-9]+\.[0-9]+M\/s).*/\1/"

	echo "$text" | sed -E "$extract"
}
# Print a section title preceded by a blank line and underlined with one
# '=' per character of the title.
function header()
{
	local title="$1"
	local underline

	# Pad an empty string to the title's length, then turn the padding
	# into '=' characters.
	underline=$(printf '%*s' "${#title}" '')

	printf "\n%s\n" "$title"
	printf '%s\n' "${underline// /=}"
}
# Print a one-line summary for a benchmark run.
#   $1 - label printed in the first (20-character, left-aligned) column
#   $2 - complete output of the benchmark run; only its last line is parsed
function summarize()
{
	local bench="$1"
	local summary

	# Quote the output so its line structure survives and tail really
	# selects the final line; unquoted, every line was joined into one and
	# "tail -n1" had nothing to choose from.
	summary=$(echo "$2" | tail -n1)

	# $summary is deliberately left unquoted: word splitting collapses runs
	# of padding whitespace before hits()/drops() re-join the words.
	printf "%-20s %s (drops %s)\n" "$bench" "$(hits $summary)" "$(drops $summary)"
}
# Each section below prints a header and then one summarize() line per
# benchmark invocation. NOTE(review): the meaning of the individual --rb-*
# options is defined by the bench binary and is not visible here -- confirm
# against ./bench --help.

# Default options for every ring buffer / perf buffer benchmark variant.
header "Single-producer, parallel producer"
for b in rb-libbpf rb-custom pb-libbpf pb-custom; do
summarize $b "$($RUN_BENCH $b)"
done
# Same four benchmarks, run with --rb-sampled.
header "Single-producer, parallel producer, sampled notification"
for b in rb-libbpf rb-custom pb-libbpf pb-custom; do
summarize $b "$($RUN_BENCH --rb-sampled $b)"
done
# Same four benchmarks with --rb-b2b, each run both without and with
# --rb-sampled.
header "Single-producer, back-to-back mode"
for b in rb-libbpf rb-custom pb-libbpf pb-custom; do
summarize $b "$($RUN_BENCH --rb-b2b $b)"
summarize $b-sampled "$($RUN_BENCH --rb-sampled --rb-b2b $b)"
done
# rb-custom sweep: the same number is used for both --rb-batch-cnt and
# --rb-sample-rate.
header "Ringbuf back-to-back, effect of sample rate"
for b in 1 5 10 25 50 100 250 500 1000 2000 3000; do
summarize "rb-sampled-$b" "$($RUN_BENCH --rb-b2b --rb-batch-cnt $b --rb-sampled --rb-sample-rate $b rb-custom)"
done
# Identical sweep for pb-custom.
header "Perfbuf back-to-back, effect of sample rate"
for b in 1 5 10 25 50 100 250 500 1000 2000 3000; do
summarize "pb-sampled-$b" "$($RUN_BENCH --rb-b2b --rb-batch-cnt $b --rb-sampled --rb-sample-rate $b pb-custom)"
done
# rb-custom with and without --rb-use-output, in --rb-b2b mode.
header "Ringbuf back-to-back, reserve+commit vs output"
summarize "reserve" "$($RUN_BENCH --rb-b2b rb-custom)"
summarize "output" "$($RUN_BENCH --rb-b2b --rb-use-output rb-custom)"
# rb-custom with and without --rb-use-output, in --rb-sampled mode.
header "Ringbuf sampled, reserve+commit vs output"
summarize "reserve-sampled" "$($RUN_BENCH --rb-sampled rb-custom)"
summarize "output-sampled" "$($RUN_BENCH --rb-sampled --rb-use-output rb-custom)"
# Producer and consumer affinity both set to 0, batch count and sample rate
# both set to 1.
header "Single-producer, consumer/producer competing on the same CPU, low batch count"
for b in rb-libbpf rb-custom pb-libbpf pb-custom; do
summarize $b "$($RUN_BENCH --rb-batch-cnt 1 --rb-sample-rate 1 --prod-affinity 0 --cons-affinity 0 $b)"
done
# rb-libbpf with the -p value swept from 1 to 52 and a batch count of 50.
header "Ringbuf, multi-producer contention"
for b in 1 2 3 4 8 12 16 20 24 28 32 36 40 44 48 52; do
summarize "rb-libbpf nr_prod $b" "$($RUN_BENCH -p$b --rb-batch-cnt 50 rb-libbpf)"
done
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <linux/compiler.h>
#include <asm/barrier.h>
#include <test_progs.h>
#include <sys/mman.h>
#include <sys/epoll.h>
#include <time.h>
#include <sched.h>
#include <signal.h>
#include <pthread.h>
#include <sys/sysinfo.h>
#include <linux/perf_event.h>
#include <linux/ring_buffer.h>
#include "test_ringbuf.skel.h"
/* Returned negated by process_sample() for the sample with seq == 1;
 * test_ringbuf() checks for -EDONE as its "done" indicator.
 */
#define EDONE 7777
/* NOTE(review): not referenced directly in this file's visible code;
 * presumably consumed by the CHECK() macro -- confirm in test_progs.h.
 */
static int duration = 0;
/* Record layout read by process_sample(). */
struct sample {
int pid;
int seq; /* selects which expected value process_sample() checks */
long value; /* payload; 333 expected for seq 0, 777 for seq 1 */
char comm[16];
};
/* Number of times process_sample() has been invoked; reset by
 * test_ringbuf() before the final round.
 */
static int sample_cnt;
static int process_sample(void *ctx, void *data, size_t len)
{
struct sample *s = data;
sample_cnt++;
switch (s->seq) {
case 0:
CHECK(s->value != 333, "sample1_value", "exp %ld, got %ld\n",
333L, s->value);
return 0;
case 1:
CHECK(s->value != 777, "sample2_value", "exp %ld, got %ld\n",
777L, s->value);
return -EDONE;
default:
/* we don't care about the rest */
return 0;
}
}
/* Skeleton and ring buffer consumer shared by test_ringbuf(),
 * trigger_samples() and poll_thread().
 */
static struct test_ringbuf *skel;
static struct ring_buffer *ringbuf;
/*
 * Reset the BPF-side counters (dropped, total, discarded) and issue two
 * getpgid() syscalls, the first with value 333 and the second with value 777.
 */
static void trigger_samples(void)
{
	skel->bss->dropped = 0;
	skel->bss->total = 0;
	skel->bss->discarded = 0;

	/* trigger exactly two samples */
	skel->bss->value = 333;
	syscall(__NR_getpgid);
	skel->bss->value = 777;
	syscall(__NR_getpgid);
}
/*
 * Thread entry point: polls the shared ring buffer once, using the timeout
 * smuggled in through the thread argument, and hands the poll result back as
 * the thread's return value.
 */
static void *poll_thread(void *input)
{
	long wait = (long)input;
	long res = ring_buffer__poll(ringbuf, wait);

	return (void *)res;
}
/*
 * End-to-end ring buffer test.
 *
 * Loads and attaches the test_ringbuf skeleton, then:
 *  1. triggers two samples and validates the sizes/positions reported by the
 *     BPF program as well as the samples seen by process_sample();
 *  2. triggers two more samples and checks that the consumer position
 *     reported by the BPF side has advanced;
 *  3. starts a background poll and checks that it stays blocked while
 *     BPF_RB_NO_WAKEUP is set and after flags are cleared, and that it
 *     returns once BPF_RB_FORCE_WAKEUP is used.
 */
void test_ringbuf(void)
{
	const size_t rec_sz = BPF_RINGBUF_HDR_SZ + sizeof(struct sample);
	pthread_t thread;
	long bg_ret = -1;
	int err;

	skel = test_ringbuf__open_and_load();
	if (CHECK(!skel, "skel_open_load", "skeleton open&load failed\n"))
		return;

	/* only trigger BPF program for current process */
	skel->bss->pid = getpid();

	ringbuf = ring_buffer__new(bpf_map__fd(skel->maps.ringbuf),
				   process_sample, NULL, NULL);
	if (CHECK(!ringbuf, "ringbuf_create", "failed to create ringbuf\n"))
		goto cleanup;

	err = test_ringbuf__attach(skel);
	if (CHECK(err, "skel_attach", "skeleton attachment failed: %d\n", err))
		goto cleanup;

	trigger_samples();

	/* 2 submitted + 1 discarded records */
	CHECK(skel->bss->avail_data != 3 * rec_sz,
	      "err_avail_size", "exp %ld, got %ld\n",
	      3L * rec_sz, skel->bss->avail_data);
	CHECK(skel->bss->ring_size != 4096,
	      "err_ring_size", "exp %ld, got %ld\n",
	      4096L, skel->bss->ring_size);
	CHECK(skel->bss->cons_pos != 0,
	      "err_cons_pos", "exp %ld, got %ld\n",
	      0L, skel->bss->cons_pos);
	CHECK(skel->bss->prod_pos != 3 * rec_sz,
	      "err_prod_pos", "exp %ld, got %ld\n",
	      3L * rec_sz, skel->bss->prod_pos);

	/* poll for samples */
	err = ring_buffer__poll(ringbuf, -1);

	/* -EDONE is used as an indicator that we are done */
	if (CHECK(err != -EDONE, "err_done", "done err: %d\n", err))
		goto cleanup;

	/* we expect extra polling to return nothing */
	err = ring_buffer__poll(ringbuf, 0);
	if (CHECK(err != 0, "extra_samples", "poll result: %d\n", err))
		goto cleanup;

	CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n",
	      0L, skel->bss->dropped);
	CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n",
	      2L, skel->bss->total);
	CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n",
	      1L, skel->bss->discarded);

	/* now validate consumer position is updated and returned */
	trigger_samples();
	CHECK(skel->bss->cons_pos != 3 * rec_sz,
	      "err_cons_pos", "exp %ld, got %ld\n",
	      3L * rec_sz, skel->bss->cons_pos);
	err = ring_buffer__poll(ringbuf, -1);
	CHECK(err <= 0, "poll_err", "err %d\n", err);

	/* start poll in background w/ long timeout */
	err = pthread_create(&thread, NULL, poll_thread, (void *)(long)10000);
	if (CHECK(err, "bg_poll", "pthread_create failed: %d\n", err))
		goto cleanup;

	/* NOTE(review): the error paths below jump to cleanup while the
	 * background thread may still be inside ring_buffer__poll() on the
	 * ring buffer that cleanup frees -- confirm this is acceptable.
	 */

	/* turn off notifications now */
	skel->bss->flags = BPF_RB_NO_WAKEUP;

	/* give background thread a bit of a time */
	usleep(50000);
	trigger_samples();
	/* sleeping arbitrarily is bad, but no better way to know that
	 * epoll_wait() **DID NOT** unblock in background thread
	 */
	usleep(50000);
	/* background poll should still be blocked */
	err = pthread_tryjoin_np(thread, (void **)&bg_ret);
	if (CHECK(err != EBUSY, "try_join", "err %d\n", err))
		goto cleanup;

	/* BPF side did everything right */
	CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n",
	      0L, skel->bss->dropped);
	CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n",
	      2L, skel->bss->total);
	CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n",
	      1L, skel->bss->discarded);

	/* clear flags to return to "adaptive" notification mode */
	skel->bss->flags = 0;

	/* produce new samples, no notification should be triggered, because
	 * consumer is now behind
	 */
	trigger_samples();

	/* background poll should still be blocked */
	err = pthread_tryjoin_np(thread, (void **)&bg_ret);
	if (CHECK(err != EBUSY, "try_join", "err %d\n", err))
		goto cleanup;

	/* now force notifications */
	skel->bss->flags = BPF_RB_FORCE_WAKEUP;
	sample_cnt = 0;
	trigger_samples();

	/* now we should get a pending notification */
	usleep(50000);
	err = pthread_tryjoin_np(thread, (void **)&bg_ret);
	if (CHECK(err, "join_bg", "err %d\n", err))
		goto cleanup;

	/* message is newline-terminated like every other CHECK() in this
	 * test, so the next line of test output starts on its own line
	 */
	if (CHECK(bg_ret != 1, "bg_ret", "epoll_wait result: %ld\n", bg_ret))
		goto cleanup;

	/* 3 rounds, 2 samples each */
	CHECK(sample_cnt != 6, "wrong_sample_cnt",
	      "expected to see %d samples, got %d\n", 6, sample_cnt);

	/* BPF side did everything right */
	CHECK(skel->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n",
	      0L, skel->bss->dropped);
	CHECK(skel->bss->total != 2, "err_total", "exp %ld, got %ld\n",
	      2L, skel->bss->total);
	CHECK(skel->bss->discarded != 1, "err_discarded", "exp %ld, got %ld\n",
	      1L, skel->bss->discarded);

	test_ringbuf__detach(skel);
cleanup:
	ring_buffer__free(ringbuf);
	test_ringbuf__destroy(skel);
}
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <test_progs.h>
#include <sys/epoll.h>
#include "test_ringbuf_multi.skel.h"
/* NOTE(review): not referenced directly in this file's visible code;
 * presumably consumed by the CHECK() macro -- confirm in test_progs.h.
 */
static int duration = 0;
/* Record layout read by process_sample(). */
struct sample {
int pid;
int seq; /* selects which ring/value pair process_sample() expects */
long value; /* payload; 333 expected for seq 0, 777 for seq 1 */
char comm[16];
};
static int process_sample(void *ctx, void *data, size_t len)
{
int ring = (unsigned long)ctx;
struct sample *s = data;
switch (s->seq) {
case 0:
CHECK(ring != 1, "sample1_ring", "exp %d, got %d\n", 1, ring);
CHECK(s->value != 333, "sample1_value", "exp %ld, got %ld\n",
333L, s->value);
break;
case 1:
CHECK(ring != 2, "sample2_ring", "exp %d, got %d\n", 2, ring);
CHECK(s->value != 777, "sample2_value", "exp %ld, got %ld\n",
777L, s->value);
break;
default:
CHECK(true, "extra_sample", "unexpected sample seq %d, val %ld\n",
s->seq, s->value);
return -1;
}
return 0;
}
/*
 * Multi-ring test: one ring_buffer consumer manages two ring buffer maps.
 * Three syscalls are issued, targeting array slots 0, 1 and 2; the
 * counter checks at the end expect two produced samples and one skipped.
 */
void test_ringbuf_multi(void)
{
	struct test_ringbuf_multi *obj;
	struct ring_buffer *rb;
	int ret;

	obj = test_ringbuf_multi__open_and_load();
	if (CHECK(!obj, "skel_open_load", "skeleton open&load failed\n"))
		return;

	/* restrict the BPF program to samples from this process */
	obj->bss->pid = getpid();

	/* the first ring is tagged 1, the second is tagged 2 */
	rb = ring_buffer__new(bpf_map__fd(obj->maps.ringbuf1),
			      process_sample, (void *)(long)1, NULL);
	if (CHECK(!rb, "ringbuf_create", "failed to create ringbuf\n"))
		goto cleanup;

	ret = ring_buffer__add(rb, bpf_map__fd(obj->maps.ringbuf2),
			       process_sample, (void *)(long)2);
	if (CHECK(ret, "ringbuf_add", "failed to add another ring\n"))
		goto cleanup;

	ret = test_ringbuf_multi__attach(obj);
	if (CHECK(ret, "skel_attach", "skeleton attachment failed: %d\n", ret))
		goto cleanup;

	/* slot 0: produces a sample */
	obj->bss->target_ring = 0;
	obj->bss->value = 333;
	syscall(__NR_getpgid);

	/* slot 1: nothing installed there, so this one is skipped */
	obj->bss->target_ring = 1;
	obj->bss->value = 555;
	syscall(__NR_getpgid);

	/* slot 2: produces a sample */
	obj->bss->target_ring = 2;
	obj->bss->value = 777;
	syscall(__NR_getpgid);

	/* blocking poll; both rings should have data */
	ret = ring_buffer__poll(rb, -1);
	if (CHECK(ret != 4, "poll_res", "expected 4 records, got %d\n", ret))
		goto cleanup;

	/* a second, non-blocking poll must not fail */
	ret = ring_buffer__poll(rb, 0);
	if (CHECK(ret < 0, "extra_samples", "poll result: %d\n", ret))
		goto cleanup;

	CHECK(obj->bss->dropped != 0, "err_dropped", "exp %ld, got %ld\n",
	      0L, obj->bss->dropped);
	CHECK(obj->bss->skipped != 1, "err_skipped", "exp %ld, got %ld\n",
	      1L, obj->bss->skipped);
	CHECK(obj->bss->total != 2, "err_total", "exp %ld, got %ld\n",
	      2L, obj->bss->total);

cleanup:
	ring_buffer__free(rb);
	test_ringbuf_multi__destroy(obj);
}
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2020 Facebook
#include <linux/bpf.h>
#include <stdint.h>
#include <bpf/bpf_helpers.h>
char _license[] SEC("license") = "GPL";
/* Perf event array that bench_perfbuf() writes its samples to. */
struct {
__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
__uint(value_size, sizeof(int));
__uint(key_size, sizeof(int));
} perfbuf SEC(".maps");
/* Number of samples emitted per program invocation.
 * NOTE(review): const volatile with a zero initializer -- presumably
 * overridden by user space before load; confirm in the benchmark runner.
 */
const volatile int batch_cnt = 0;
/* Payload written for every sample. */
long sample_val = 42;
/* Count of failed bpf_perf_event_output() calls.
 * NOTE(review): the 128-byte alignment presumably keeps this counter away
 * from the other globals' cache lines -- confirm intent.
 */
long dropped __attribute__((aligned(128))) = 0;
/*
 * Emits batch_cnt copies of sample_val into the perfbuf map using
 * BPF_F_CURRENT_CPU, atomically counting every failed output in `dropped`.
 * Always returns 0.
 */
SEC("fentry/__x64_sys_getpgid")
int bench_perfbuf(void *ctx)
{
	int i;

	for (i = 0; i < batch_cnt; i++) {
		if (bpf_perf_event_output(ctx, &perfbuf, BPF_F_CURRENT_CPU,
					  &sample_val, sizeof(sample_val)))
			__sync_add_and_fetch(&dropped, 1);
	}
	return 0;
}
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2020 Facebook
#include <linux/bpf.h>
#include <stdint.h>
#include <bpf/bpf_helpers.h>
char _license[] SEC("license") = "GPL";
/* Ring buffer that bench_ringbuf() writes its samples to.
 * NOTE(review): no max_entries is given here -- presumably set by user space
 * before load; confirm in the benchmark runner.
 */
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
} ringbuf SEC(".maps");
/* Number of samples emitted per program invocation. */
const volatile int batch_cnt = 0;
/* Non-zero selects bpf_ringbuf_output() instead of reserve + submit. */
const volatile long use_output = 0;
/* Payload written for every sample. */
long sample_val = 42;
/* Count of samples that could not be reserved/output.
 * NOTE(review): the 128-byte alignment presumably keeps this counter away
 * from the other globals' cache lines -- confirm intent.
 */
long dropped __attribute__((aligned(128))) = 0;
/* Threshold used by get_flags(); zero makes get_flags() return 0. */
const volatile long wakeup_data_size = 0;
/*
 * Pick the wakeup flags for the next submit/output call.
 *
 * Returns 0 when wakeup_data_size is zero. Otherwise returns
 * BPF_RB_FORCE_WAKEUP once the amount reported by
 * bpf_ringbuf_query(BPF_RB_AVAIL_DATA) reaches wakeup_data_size, and
 * BPF_RB_NO_WAKEUP below that.
 */
static __always_inline long get_flags(void)
{
	long sz;

	if (!wakeup_data_size)
		return 0;

	sz = bpf_ringbuf_query(&ringbuf, BPF_RB_AVAIL_DATA);
	return sz >= wakeup_data_size ? BPF_RB_FORCE_WAKEUP : BPF_RB_NO_WAKEUP;
}
/*
 * Emits batch_cnt samples into the ring buffer, either through
 * bpf_ringbuf_output() (when use_output is set) or through
 * reserve + submit. Failed attempts are counted atomically in `dropped`.
 * Always returns 0.
 */
SEC("fentry/__x64_sys_getpgid")
int bench_ringbuf(void *ctx)
{
	long *rec, wakeup;
	int n;

	if (use_output) {
		/* copying path: flags are computed before the output call */
		for (n = 0; n < batch_cnt; n++) {
			wakeup = get_flags();
			if (bpf_ringbuf_output(&ringbuf, &sample_val,
					       sizeof(sample_val), wakeup))
				__sync_add_and_fetch(&dropped, 1);
		}
		return 0;
	}

	/* reserve/submit path: flags are computed after the reservation */
	for (n = 0; n < batch_cnt; n++) {
		rec = bpf_ringbuf_reserve(&ringbuf, sizeof(sample_val), 0);
		if (!rec) {
			__sync_add_and_fetch(&dropped, 1);
			continue;
		}
		*rec = sample_val;
		wakeup = get_flags();
		bpf_ringbuf_submit(rec, wakeup);
	}
	return 0;
}
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2020 Facebook
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
char _license[] SEC("license") = "GPL";
/* Record written into the ring buffer by test_ringbuf(). */
struct sample {
int pid;
int seq;
long value;
char comm[16];
};
/* Ring buffer under test, sized 1 << 12. */
struct {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 1 << 12);
} ringbuf SEC(".maps");
/* inputs */
/* compared against the upper 32 bits of bpf_get_current_pid_tgid() */
int pid = 0;
/* copied into sample->value */
long value = 0;
/* passed to bpf_ringbuf_output(), bpf_ringbuf_discard() and
 * bpf_ringbuf_submit()
 */
long flags = 0;
/* outputs */
/* incremented once per successfully reserved sample */
long total = 0;
/* incremented when the reserved sample is discarded (odd seq) */
long discarded = 0;
/* incremented when bpf_ringbuf_reserve() fails */
long dropped = 0;
/* results of the four bpf_ringbuf_query() calls made at the end of
 * test_ringbuf()
 */
long avail_data = 0;
long ring_size = 0;
long cons_pos = 0;
long prod_pos = 0;
/* inner state */
/* sequence number assigned to the next sample */
long seq = 0;
/*
 * Produces one sample per getpgid() call made by the process whose id
 * matches `pid`.
 *
 * Samples with an even sequence number are submitted directly. For odd
 * sequence numbers the reserved sample is copied into the ring buffer with
 * bpf_ringbuf_output() and the reservation itself is discarded. Afterwards
 * the ring buffer's available data, size, consumer position and producer
 * position are published through the output globals.
 *
 * Returns 0 normally and 1 when the reservation fails.
 */
SEC("tp/syscalls/sys_enter_getpgid")
int test_ringbuf(void *ctx)
{
	int cur_pid = bpf_get_current_pid_tgid() >> 32;
	struct sample *sample;

	if (cur_pid != pid)
		return 0;

	sample = bpf_ringbuf_reserve(&ringbuf, sizeof(*sample), 0);
	if (!sample) {
		__sync_fetch_and_add(&dropped, 1);
		return 1;
	}

	sample->pid = pid;
	bpf_get_current_comm(sample->comm, sizeof(sample->comm));
	sample->value = value;

	sample->seq = seq++;
	__sync_fetch_and_add(&total, 1);

	if (sample->seq & 1) {
		/* copy from reserved sample to a new one... */
		bpf_ringbuf_output(&ringbuf, sample, sizeof(*sample), flags);
		/* ...and then discard reserved sample */
		bpf_ringbuf_discard(sample, flags);
		__sync_fetch_and_add(&discarded, 1);
	} else {
		bpf_ringbuf_submit(sample, flags);
	}

	avail_data = bpf_ringbuf_query(&ringbuf, BPF_RB_AVAIL_DATA);
	ring_size = bpf_ringbuf_query(&ringbuf, BPF_RB_RING_SIZE);
	cons_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_CONS_POS);
	prod_pos = bpf_ringbuf_query(&ringbuf, BPF_RB_PROD_POS);

	return 0;
}
// SPDX-License-Identifier: GPL-2.0
// Copyright (c) 2020 Facebook
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
char _license[] SEC("license") = "GPL";
/* Record written into the selected ring buffer by test_ringbuf(). */
struct sample {
int pid;
int seq;
long value;
char comm[16];
};
/* Two ring buffers of identical shape, each sized 1 << 12. */
struct ringbuf_map {
__uint(type, BPF_MAP_TYPE_RINGBUF);
__uint(max_entries, 1 << 12);
} ringbuf1 SEC(".maps"),
ringbuf2 SEC(".maps");
/* Four-slot array of maps; only slots 0 and 2 are populated, so lookups of
 * the other slots find no ring buffer.
 */
struct {
__uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS);
__uint(max_entries, 4);
__type(key, int);
__array(values, struct ringbuf_map);
} ringbuf_arr SEC(".maps") = {
.values = {
[0] = &ringbuf1,
[2] = &ringbuf2,
},
};
/* inputs */
/* compared against the upper 32 bits of bpf_get_current_pid_tgid() */
int pid = 0;
/* key used for the ringbuf_arr lookup */
int target_ring = 0;
/* copied into sample->value */
long value = 0;
/* outputs */
/* number of submitted samples; also used as the next sample's seq */
long total = 0;
/* incremented when bpf_ringbuf_reserve() fails */
long dropped = 0;
/* incremented when the ringbuf_arr lookup returns NULL */
long skipped = 0;
/*
 * Produces one sample per getpgid() call made by the process whose id
 * matches `pid`, into the ring buffer found in ringbuf_arr[target_ring].
 *
 * A missing ring buffer bumps `skipped`, a failed reservation bumps
 * `dropped`; both cases return 1. On success the sample is submitted with
 * seq set to the current `total`, `total` is incremented and 0 is returned.
 */
SEC("tp/syscalls/sys_enter_getpgid")
int test_ringbuf(void *ctx)
{
	int cur_pid = bpf_get_current_pid_tgid() >> 32;
	struct sample *sample;
	void *rb;

	if (cur_pid != pid)
		return 0;

	rb = bpf_map_lookup_elem(&ringbuf_arr, &target_ring);
	if (!rb) {
		skipped += 1;
		return 1;
	}

	sample = bpf_ringbuf_reserve(rb, sizeof(*sample), 0);
	if (!sample) {
		dropped += 1;
		return 1;
	}

	sample->pid = pid;
	bpf_get_current_comm(sample->comm, sizeof(sample->comm));
	sample->value = value;

	sample->seq = total;
	total += 1;

	bpf_ringbuf_submit(sample, 0);

	return 0;
}
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
BPF_EXIT_INSN(), BPF_EXIT_INSN(),
}, },
.fixup_map_hash_48b = { 3 }, .fixup_map_hash_48b = { 3 },
.errstr = "R0 max value is outside of the array range", .errstr = "R0 max value is outside of the allowed memory range",
.result = REJECT, .result = REJECT,
.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
}, },
...@@ -44,7 +44,7 @@ ...@@ -44,7 +44,7 @@
BPF_EXIT_INSN(), BPF_EXIT_INSN(),
}, },
.fixup_map_hash_48b = { 3 }, .fixup_map_hash_48b = { 3 },
.errstr = "R0 max value is outside of the array range", .errstr = "R0 max value is outside of the allowed memory range",
.result = REJECT, .result = REJECT,
.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
}, },
...@@ -117,7 +117,7 @@ ...@@ -117,7 +117,7 @@
BPF_EXIT_INSN(), BPF_EXIT_INSN(),
}, },
.fixup_map_hash_48b = { 3 }, .fixup_map_hash_48b = { 3 },
.errstr = "R0 min value is outside of the array range", .errstr = "R0 min value is outside of the allowed memory range",
.result = REJECT, .result = REJECT,
.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
}, },
...@@ -137,7 +137,7 @@ ...@@ -137,7 +137,7 @@
BPF_EXIT_INSN(), BPF_EXIT_INSN(),
}, },
.fixup_map_hash_48b = { 3 }, .fixup_map_hash_48b = { 3 },
.errstr = "R0 unbounded memory access, make sure to bounds check any array access into a map", .errstr = "R0 unbounded memory access, make sure to bounds check any such access",
.result = REJECT, .result = REJECT,
.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
}, },
......
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
BPF_EXIT_INSN(), BPF_EXIT_INSN(),
}, },
.fixup_map_hash_8b = { 3 }, .fixup_map_hash_8b = { 3 },
.errstr = "R0 max value is outside of the array range", .errstr = "R0 max value is outside of the allowed memory range",
.result = REJECT, .result = REJECT,
}, },
{ {
...@@ -146,7 +146,7 @@ ...@@ -146,7 +146,7 @@
BPF_EXIT_INSN(), BPF_EXIT_INSN(),
}, },
.fixup_map_hash_8b = { 3 }, .fixup_map_hash_8b = { 3 },
.errstr = "R0 min value is outside of the array range", .errstr = "R0 min value is outside of the allowed memory range",
.result = REJECT .result = REJECT
}, },
{ {
...@@ -354,7 +354,7 @@ ...@@ -354,7 +354,7 @@
BPF_EXIT_INSN(), BPF_EXIT_INSN(),
}, },
.fixup_map_hash_8b = { 3 }, .fixup_map_hash_8b = { 3 },
.errstr = "R0 max value is outside of the array range", .errstr = "R0 max value is outside of the allowed memory range",
.result = REJECT .result = REJECT
}, },
{ {
......
...@@ -105,7 +105,7 @@ ...@@ -105,7 +105,7 @@
.prog_type = BPF_PROG_TYPE_SCHED_CLS, .prog_type = BPF_PROG_TYPE_SCHED_CLS,
.fixup_map_hash_8b = { 16 }, .fixup_map_hash_8b = { 16 },
.result = REJECT, .result = REJECT,
.errstr = "R0 min value is outside of the array range", .errstr = "R0 min value is outside of the allowed memory range",
}, },
{ {
"calls: overlapping caller/callee", "calls: overlapping caller/callee",
......
...@@ -68,7 +68,7 @@ ...@@ -68,7 +68,7 @@
}, },
.fixup_map_array_48b = { 1 }, .fixup_map_array_48b = { 1 },
.result = REJECT, .result = REJECT,
.errstr = "R1 min value is outside of the array range", .errstr = "R1 min value is outside of the allowed memory range",
}, },
{ {
"direct map access, write test 7", "direct map access, write test 7",
...@@ -220,7 +220,7 @@ ...@@ -220,7 +220,7 @@
}, },
.fixup_map_array_small = { 1 }, .fixup_map_array_small = { 1 },
.result = REJECT, .result = REJECT,
.errstr = "R1 min value is outside of the array range", .errstr = "R1 min value is outside of the allowed memory range",
}, },
{ {
"direct map access, write test 19", "direct map access, write test 19",
......
...@@ -318,7 +318,7 @@ ...@@ -318,7 +318,7 @@
BPF_EXIT_INSN(), BPF_EXIT_INSN(),
}, },
.fixup_map_hash_48b = { 4 }, .fixup_map_hash_48b = { 4 },
.errstr = "R1 min value is outside of the array range", .errstr = "R1 min value is outside of the allowed memory range",
.result = REJECT, .result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT, .prog_type = BPF_PROG_TYPE_TRACEPOINT,
}, },
......
...@@ -280,7 +280,7 @@ ...@@ -280,7 +280,7 @@
BPF_EXIT_INSN(), BPF_EXIT_INSN(),
}, },
.fixup_map_hash_48b = { 3 }, .fixup_map_hash_48b = { 3 },
.errstr = "R1 min value is outside of the array range", .errstr = "R1 min value is outside of the allowed memory range",
.result = REJECT, .result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT, .prog_type = BPF_PROG_TYPE_TRACEPOINT,
}, },
...@@ -415,7 +415,7 @@ ...@@ -415,7 +415,7 @@
BPF_EXIT_INSN(), BPF_EXIT_INSN(),
}, },
.fixup_map_hash_48b = { 3 }, .fixup_map_hash_48b = { 3 },
.errstr = "R1 min value is outside of the array range", .errstr = "R1 min value is outside of the allowed memory range",
.result = REJECT, .result = REJECT,
.prog_type = BPF_PROG_TYPE_TRACEPOINT, .prog_type = BPF_PROG_TYPE_TRACEPOINT,
}, },
...@@ -926,7 +926,7 @@ ...@@ -926,7 +926,7 @@
}, },
.fixup_map_hash_16b = { 3, 10 }, .fixup_map_hash_16b = { 3, 10 },
.result = REJECT, .result = REJECT,
.errstr = "R2 unbounded memory access, make sure to bounds check any array access into a map", .errstr = "R2 unbounded memory access, make sure to bounds check any such access",
.prog_type = BPF_PROG_TYPE_TRACEPOINT, .prog_type = BPF_PROG_TYPE_TRACEPOINT,
}, },
{ {
......
...@@ -50,7 +50,7 @@ ...@@ -50,7 +50,7 @@
.fixup_map_array_48b = { 8 }, .fixup_map_array_48b = { 8 },
.result = ACCEPT, .result = ACCEPT,
.result_unpriv = REJECT, .result_unpriv = REJECT,
.errstr_unpriv = "R0 min value is outside of the array range", .errstr_unpriv = "R0 min value is outside of the allowed memory range",
.retval = 1, .retval = 1,
}, },
{ {
...@@ -325,7 +325,7 @@ ...@@ -325,7 +325,7 @@
}, },
.fixup_map_array_48b = { 3 }, .fixup_map_array_48b = { 3 },
.result = REJECT, .result = REJECT,
.errstr = "R0 min value is outside of the array range", .errstr = "R0 min value is outside of the allowed memory range",
.result_unpriv = REJECT, .result_unpriv = REJECT,
.errstr_unpriv = "R0 pointer arithmetic of map value goes out of range", .errstr_unpriv = "R0 pointer arithmetic of map value goes out of range",
}, },
...@@ -601,7 +601,7 @@ ...@@ -601,7 +601,7 @@
}, },
.fixup_map_array_48b = { 3 }, .fixup_map_array_48b = { 3 },
.result = REJECT, .result = REJECT,
.errstr = "R1 max value is outside of the array range", .errstr = "R1 max value is outside of the allowed memory range",
.errstr_unpriv = "R1 pointer arithmetic of map value goes out of range", .errstr_unpriv = "R1 pointer arithmetic of map value goes out of range",
.flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS,
}, },
...@@ -726,7 +726,7 @@ ...@@ -726,7 +726,7 @@
}, },
.fixup_map_array_48b = { 3 }, .fixup_map_array_48b = { 3 },
.result = REJECT, .result = REJECT,
.errstr = "R0 min value is outside of the array range", .errstr = "R0 min value is outside of the allowed memory range",
}, },
{ {
"map access: value_ptr -= known scalar, 2", "map access: value_ptr -= known scalar, 2",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment