Commit 795c4e77 authored by yonghong-song

Merge pull request #282 from iovisor/bblanco_dev

Add perf_output support for high rate events
parents ea7962e0 98d0bebb
#!/usr/bin/env python
# Copyright (c) PLUMgrid, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
# This is an example of tracing an event and printing custom fields.
# run in project examples directory with:
# sudo ./trace_fields.py"
import atexit
from bcc import BPF
import ctypes as ct

class Data(ct.Structure):
    _fields_ = [("ts", ct.c_ulonglong),
                ("magic", ct.c_ulonglong)]

counter = 0
def cb(cpu, data, size):
    assert size >= ct.sizeof(Data)
    event = ct.cast(data, ct.POINTER(Data)).contents
    print("[%0d] %f: %x" % (cpu, float(event.ts) / 1000000, event.magic))
    global counter
    counter += 1
prog = """
BPF_PERF_OUTPUT(events);
BPF_TABLE("array", int, u64, counters, 10);
int kprobe__sys_clone(void *ctx) {
struct {
u64 ts;
u64 magic;
} data = {bpf_ktime_get_ns(), 0x12345678};
int rc;
if ((rc = events.perf_submit(ctx, &data, sizeof(data))) < 0)
bpf_trace_printk("perf_output failed: %d\\n", rc);
int zero = 0;
u64 *val = counters.lookup(&zero);
if (val) lock_xadd(val, 1);
return 0;
}
"""
b = BPF(text=prog)
b["events"].open_perf_buffer(cb)

@atexit.register
def print_counter():
    global counter
    global b
    print("counter = %d vs %d" % (counter, b["counters"][ct.c_int(0)].value))

print("Tracing sys_clone, run some commands (e.g. ls) in another shell to generate events")
print("Tracing... Hit Ctrl-C to end.")
while 1:
    b.kprobe_poll()
@@ -63,50 +63,16 @@ struct bpf_insn {
__s32 imm; /* signed immediate constant */
};
/* BPF syscall commands, see bpf(2) man-page for details. */
enum bpf_cmd {
  BPF_MAP_CREATE,
  BPF_MAP_LOOKUP_ELEM,
  BPF_MAP_UPDATE_ELEM,
  BPF_MAP_DELETE_ELEM,
  BPF_MAP_GET_NEXT_KEY,
  BPF_PROG_LOAD,
  BPF_OBJ_PIN,
  BPF_OBJ_GET,
};
enum bpf_map_type {
@@ -114,6 +80,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_HASH,
BPF_MAP_TYPE_ARRAY,
BPF_MAP_TYPE_PROG_ARRAY,
BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};
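The new map type is an array whose values are perf event file descriptors, one slot per CPU; userspace creates it and fills the slots, and the bpf_perf_event_output helper further down indexes into it. A minimal sketch of creating one straight through the bpf(2) syscall (the helper name create_perf_event_array is ours, not part of the commit):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Sketch: create a BPF_MAP_TYPE_PERF_EVENT_ARRAY with one slot per
 * online CPU. Keys are cpu indices, values are perf event fds. */
static int create_perf_event_array(void) {
  union bpf_attr attr;
  long ncpu = sysconf(_SC_NPROCESSORS_ONLN);

  memset(&attr, 0, sizeof(attr));
  attr.map_type = BPF_MAP_TYPE_PERF_EVENT_ARRAY;
  attr.key_size = sizeof(int);
  attr.value_size = sizeof(int);
  attr.max_entries = ncpu > 0 ? ncpu : 1;
  return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}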
enum bpf_prog_type {
@@ -159,6 +126,11 @@ union bpf_attr {
__aligned_u64 log_buf; /* user supplied buffer */
__u32 kern_version; /* checked when prog_type=kprobe */
};
struct { /* anonymous struct used by BPF_OBJ_* commands */
__aligned_u64 pathname;
__u32 bpf_fd;
};
} __attribute__((aligned(8)));
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@ -270,6 +242,33 @@ enum bpf_func_id {
*/
BPF_FUNC_skb_get_tunnel_key,
BPF_FUNC_skb_set_tunnel_key,
BPF_FUNC_perf_event_read, /* u64 bpf_perf_event_read(&map, index) */
/**
* bpf_redirect(ifindex, flags) - redirect to another netdev
* @ifindex: ifindex of the net device
* @flags: bit 0 - if set, redirect to ingress instead of egress
* other bits - reserved
* Return: TC_ACT_REDIRECT
*/
BPF_FUNC_redirect,
/**
* bpf_get_route_realm(skb) - retrieve a dst's tclassid
* @skb: pointer to skb
* Return: realm if != 0
*/
BPF_FUNC_get_route_realm,
/**
* bpf_perf_event_output(ctx, map, index, data, size) - output perf raw sample
* @ctx: struct pt_regs*
* @map: pointer to perf_event_array map
* @index: index of event in the map
* @data: data on stack to be output as raw data
* @size: size of data
* Return: 0 on success
*/
BPF_FUNC_perf_event_output,
__BPF_FUNC_MAX_ID,
};
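At the raw helper level, a probe pushes a sample by calling the new function with the current CPU as the index, so each event lands in that CPU's ring buffer. A sketch of the call shape a program ends up with (the bpf_pseudo_fd() map reference and the literal fd value are illustrative; the clang frontend changes below generate exactly this pattern from map.perf_submit()):

struct data_t {
  u64 ts;
  u64 magic;
};

int on_sys_clone(void *ctx) {
  struct data_t data = {bpf_ktime_get_ns(), 0x12345678};
  /* index = current cpu: the sample goes to that cpu's ring buffer */
  return bpf_perf_event_output(ctx, (void *)bpf_pseudo_fd(1, 3),
                               bpf_get_smp_processor_id(),
                               &data, sizeof(data));
}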
@@ -290,6 +289,8 @@ struct __sk_buff {
__u32 ifindex;
__u32 tc_index;
__u32 cb[5];
__u32 hash;
__u32 tc_classid;
};
struct bpf_tunnel_key {
@@ -42,6 +42,30 @@ struct _name##_table_t { \
__attribute__((section("maps/" _table_type))) \
struct _name##_table_t _name
// Table for pushing custom events to userspace via ring buffer
#define BPF_PERF_OUTPUT(_name) \
struct _name##_table_t { \
int key; \
u32 leaf; \
/* map.perf_submit(ctx, data, data_size) */ \
int (*perf_submit) (void *, void *, u32); \
u32 data[0]; \
}; \
__attribute__((section("maps/perf_output"))) \
struct _name##_table_t _name
// Table for reading hw perf cpu counters
#define BPF_PERF_ARRAY(_name, _max_entries) \
struct _name##_table_t { \
int key; \
u32 leaf; \
/* counter = map.perf_read(index) */ \
u64 (*perf_read) (int); \
u32 data[_max_entries]; \
}; \
__attribute__((section("maps/perf_array"))) \
struct _name##_table_t _name
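Taken together, a bcc program can mix the two new tables; a sketch (the function name and the hand-picked array size are ours, and userspace is assumed to have stored perf counter fds into the perf array):

BPF_PERF_OUTPUT(events);    /* events.perf_submit() -> bpf_perf_event_output */
BPF_PERF_ARRAY(cycles, 8);  /* cycles.perf_read()  -> bpf_perf_event_read */

int on_event(void *ctx) {
  int cpu = bpf_get_smp_processor_id();
  struct { u64 cpu; u64 count; } out = {};
  out.cpu = cpu;
  out.count = cycles.perf_read(cpu);           /* hw counter on this cpu */
  events.perf_submit(ctx, &out, sizeof(out));  /* push sample to userspace */
  return 0;
}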
#define BPF_HASH1(_name) \
BPF_TABLE("hash", u64, u64, _name, 10240)
#define BPF_HASH2(_name, _key_type) \
@@ -95,7 +119,7 @@ int bpf_trace_printk(const char *fmt, ...) asm("llvm.bpf.extra");
static void bpf_tail_call_(u64 map_fd, void *ctx, int index) {
((void (*)(void *, u64, int))BPF_FUNC_tail_call)(ctx, map_fd, index);
}
static int (*bpf_clone_redirect)(void *ctx, int ifindex, u32 flags) =
(void *) BPF_FUNC_clone_redirect;
static u64 (*bpf_get_smp_processor_id)(void) =
(void *) BPF_FUNC_get_smp_processor_id;
@@ -105,6 +129,7 @@ static u64 (*bpf_get_current_uid_gid)(void) =
(void *) BPF_FUNC_get_current_uid_gid;
static int (*bpf_get_current_comm)(void *buf, int buf_size) =
(void *) BPF_FUNC_get_current_comm;
#endif
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,3,0)
static u64 (*bpf_get_cgroup_classid)(void *ctx) =
(void *) BPF_FUNC_get_cgroup_classid;
@@ -116,7 +141,16 @@ static int (*bpf_skb_get_tunnel_key)(void *ctx, void *to, u32 size, u64 flags) =
(void *) BPF_FUNC_skb_get_tunnel_key;
static int (*bpf_skb_set_tunnel_key)(void *ctx, void *from, u32 size, u64 flags) =
(void *) BPF_FUNC_skb_set_tunnel_key;
static int (*bpf_perf_event_read)(void *map, u32 index) =
(void *) BPF_FUNC_perf_event_read;
#endif
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0)
static int (*bpf_redirect)(int ifindex, u32 flags) =
(void *) BPF_FUNC_redirect;
static u32 (*bpf_get_route_realm)(void *ctx) =
(void *) BPF_FUNC_get_route_realm;
static int (*bpf_perf_event_output)(void *ctx, void *map, u32 index, void *data, u32 size) =
(void *) BPF_FUNC_perf_event_output;
#endif
/* llvm builtin functions that eBPF C program may use to
@@ -341,6 +375,7 @@ int bpf_l4_csum_replace_(void *ctx, u64 off, u64 from, u64 to, u64 flags) {
int incr_cksum_l3(void *off, u64 oldval, u64 newval) asm("llvm.bpf.extra");
int incr_cksum_l4(void *off, u64 oldval, u64 newval, u64 flags) asm("llvm.bpf.extra");
int bpf_num_cpus() asm("llvm.bpf.extra");
#define lock_xadd(ptr, val) ((void)__sync_fetch_and_add(ptr, val))
@@ -16,6 +16,7 @@
#include <linux/bpf.h>
#include <linux/version.h>
#include <sys/utsname.h>
#include <unistd.h>
#include <clang/AST/ASTConsumer.h>
#include <clang/AST/ASTContext.h>
@@ -332,6 +333,14 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) {
}
txt += "typeof(" + name + ".leaf) *_leaf = " + lookup + ", &_key); ";
txt += "if (_leaf) (*_leaf)++; })";
} else if (memb_name == "perf_submit") {
string name = Ref->getDecl()->getName();
string arg0 = rewriter_.getRewrittenText(SourceRange(Call->getArg(0)->getLocStart(),
Call->getArg(0)->getLocEnd()));
string args_other = rewriter_.getRewrittenText(SourceRange(Call->getArg(1)->getLocStart(),
Call->getArg(2)->getLocEnd()));
txt = "bpf_perf_event_output(" + arg0 + ", bpf_pseudo_fd(1, " + fd + ")";
txt += ", bpf_get_smp_processor_id(), " + args_other + ")";
} else {
if (memb_name == "lookup") {
prefix = "bpf_map_lookup_elem";
@@ -345,6 +354,9 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) {
} else if (memb_name == "call") {
prefix = "bpf_tail_call_";
suffix = ")";
} else if (memb_name == "perf_read") {
prefix = "bpf_perf_event_read";
suffix = ")";
} else {
C.getDiagnostics().Report(Call->getLocStart(), diag::err_expected)
<< "valid bpf_table operation";
@@ -403,6 +415,12 @@ bool BTypeVisitor::VisitCallExpr(CallExpr *Call) {
rewriter_.ReplaceText(SourceRange(Call->getLocStart(), Call->getArg(0)->getLocEnd()), text);
rewriter_.InsertTextAfter(Call->getLocEnd(), "); }");
}
} else if (Decl->getName() == "bpf_num_cpus") {
int numcpu = sysconf(_SC_NPROCESSORS_ONLN);
if (numcpu <= 0)
numcpu = 1;
text = to_string(numcpu);
rewriter_.ReplaceText(SourceRange(Call->getLocStart(), Call->getLocEnd()), text);
}
}
}
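The net effect of the frontend changes above, shown as a before/after sketch (the fd value is illustrative; bpf_num_cpus() folds to whatever sysconf reported on the build machine):

/* bcc source: */
events.perf_submit(ctx, &data, sizeof(data));
int ncpu = bpf_num_cpus();

/* after rewriting: */
bpf_perf_event_output(ctx, bpf_pseudo_fd(1, 3),
                      bpf_get_smp_processor_id(), &data, sizeof(data));
int ncpu = 4;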
@@ -482,6 +500,13 @@ bool BTypeVisitor::VisitVarDecl(VarDecl *Decl) {
}
const RecordDecl *RD = R->getDecl()->getDefinition();
int major = 0, minor = 0;
struct utsname un;
if (uname(&un) == 0) {
// release format: <major>.<minor>.<revision>[-<othertag>]
sscanf(un.release, "%d.%d.", &major, &minor);
}
TableDesc table;
table.name = Decl->getName();
@@ -519,20 +544,27 @@ bool BTypeVisitor::VisitVarDecl(VarDecl *Decl) {
diag_.Report(Decl->getLocStart(), diag_id) << table.leaf_desc;
}
} else if (A->getName() == "maps/prog") {
struct utsname un;
if (uname(&un) == 0) {
int major = 0, minor = 0;
// release format: <major>.<minor>.<revision>[-<othertag>]
sscanf(un.release, "%d.%d.", &major, &minor);
if (KERNEL_VERSION(major,minor,0) >= KERNEL_VERSION(4,2,0))
map_type = BPF_MAP_TYPE_PROG_ARRAY;
} else if (A->getName() == "maps/perf_output") {
if (KERNEL_VERSION(major,minor,0) >= KERNEL_VERSION(4,3,0))
map_type = BPF_MAP_TYPE_PERF_EVENT_ARRAY;
int numcpu = sysconf(_SC_NPROCESSORS_ONLN);
if (numcpu <= 0)
numcpu = 1;
table.max_entries = numcpu;
} else if (A->getName() == "maps/perf_array") {
if (KERNEL_VERSION(major,minor,0) >= KERNEL_VERSION(4,3,0))
map_type = BPF_MAP_TYPE_PERF_EVENT_ARRAY;
}
if (map_type == BPF_MAP_TYPE_UNSPEC) {
unsigned diag_id = C.getDiagnostics().getCustomDiagID(DiagnosticsEngine::Error,
                                                      "unsupported map type: %0");
C.getDiagnostics().Report(Decl->getLocStart(), diag_id) << A->getName();
return false;
}
}
table.type = map_type;
table.fd = bpf_create_map(map_type, table.key_size, table.leaf_size, table.max_entries);
if (table.fd < 0) {
@@ -178,8 +178,8 @@ int bpf_attach_socket(int sock, int prog) {
static int bpf_attach_tracing_event(int progfd, const char *event_path,
struct perf_reader *reader, int pid, int cpu, int group_fd) {
int efd = -1, pfd = -1;
ssize_t bytes;
char buf[256];
struct perf_event_attr attr = {};
@@ -187,13 +187,13 @@ static int bpf_attach_tracing_event(int progfd, const char *event_path,
efd = open(buf, O_RDONLY, 0);
if (efd < 0) {
fprintf(stderr, "open(%s): %s\n", buf, strerror(errno));
goto error;
}
bytes = read(efd, buf, sizeof(buf));
if (bytes <= 0 || bytes >= sizeof(buf)) {
fprintf(stderr, "read(%s): %s\n", buf, strerror(errno));
goto error;
}
buf[bytes] = '\0';
attr.config = strtol(buf, NULL, 0);
@@ -204,91 +204,133 @@ static int bpf_attach_tracing_event(int progfd, const char *event_path,
pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, group_fd, PERF_FLAG_FD_CLOEXEC);
if (pfd < 0) {
perror("perf_event_open");
goto error;
}
perf_reader_set_fd(reader, pfd);
if (perf_reader_mmap(reader, attr.type, attr.sample_type) < 0)
  goto error;
if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, progfd) < 0) {
perror("ioctl(PERF_EVENT_IOC_SET_BPF)");
goto error;
}
if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
perror("ioctl(PERF_EVENT_IOC_ENABLE)");
goto error;
}
return 0;
error:
if (efd >= 0)
close(efd);
if (pfd >= 0)
close(pfd);
return -1;
}
void * bpf_attach_kprobe(int progfd, const char *event,
const char *event_desc, pid_t pid,
int cpu, int group_fd, perf_reader_cb cb,
void *cb_cookie) {
int kfd = -1;
char buf[256];
struct perf_reader *reader = NULL;
reader = perf_reader_new(cb, NULL, cb_cookie);
if (!reader)
goto error;
kfd = open("/sys/kernel/debug/tracing/kprobe_events", O_WRONLY | O_APPEND, 0);
if (kfd < 0) {
perror("open(kprobe_events)");
goto error;
}
if (write(kfd, event_desc, strlen(event_desc)) < 0) {
fprintf(stderr, "write of \"%s\" into kprobe_events failed: %s\n", event_desc, strerror(errno));
if (errno == EINVAL)
fprintf(stderr, "check dmesg output for possible cause\n");
goto error;
}
snprintf(buf, sizeof(buf), "/sys/kernel/debug/tracing/events/kprobes/%s", event);
if (bpf_attach_tracing_event(progfd, buf, reader, pid, cpu, group_fd) < 0)
  goto error;
return reader;
error:
if (kfd >= 0)
close(kfd);
if (reader && rc < 0) {
if (reader)
perf_reader_free(reader);
reader = NULL;
}
return reader;
return NULL;
}
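For reference, a hedged sketch of a C caller driving the attach/detach pair (the probe names and function are ours; the descriptor strings follow the ftrace kprobe_events format that bcc writes):

static void on_hit(void *cb_cookie, int pid, uint64_t num_callchain,
                   void *callchain) {
  /* one kprobe hit: pid and callchain decoded by the tracepoint parser */
}

void trace_clone(int progfd) {
  struct perf_reader *reader = bpf_attach_kprobe(progfd,
      "p_sys_clone",                     /* dir under events/kprobes/ */
      "p:kprobes/p_sys_clone sys_clone", /* ftrace descriptor: entry probe */
      -1 /* all pids */, 0 /* cpu */, -1 /* no group */, on_hit, NULL);
  if (!reader)
    return;
  /* ... perf_reader_poll(1, &reader, -1) in a loop ... */
  perf_reader_free(reader);
  bpf_detach_kprobe("-:kprobes/p_sys_clone");
}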
int bpf_detach_kprobe(const char *event_desc) {
int kfd = -1;
kfd = open("/sys/kernel/debug/tracing/kprobe_events", O_WRONLY | O_APPEND, 0);
if (kfd < 0) {
perror("open(kprobe_events)");
goto error;
}
if (write(kfd, event_desc, strlen(event_desc)) < 0) {
perror("write(kprobe_events)");
goto error;
}
return 0;
error:
if (kfd >= 0)
close(kfd);
return -1;
}
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu) {
#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,4,0)
int pfd;
struct perf_event_attr attr = {};
struct perf_reader *reader = NULL;
reader = perf_reader_new(NULL, raw_cb, cb_cookie);
if (!reader)
goto error;
attr.config = PERF_COUNT_SW_BPF_OUTPUT;
attr.type = PERF_TYPE_SOFTWARE;
attr.sample_type = PERF_SAMPLE_RAW;
attr.sample_period = 1;
attr.wakeup_events = 1;
pfd = syscall(__NR_perf_event_open, &attr, pid, cpu, -1, PERF_FLAG_FD_CLOEXEC);
if (pfd < 0) {
perror("perf_event_open");
goto error;
}
perf_reader_set_fd(reader, pfd);
if (perf_reader_mmap(reader, attr.type, attr.sample_type) < 0)
goto error;
if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {
perror("ioctl(PERF_EVENT_IOC_ENABLE)");
goto error;
}
return reader;
error:
if (reader)
perf_reader_free(reader);
return NULL;
#else
fprintf(stderr, "PERF_COUNT_SW_BPF_OUTPUT feature unsupported\n");
return NULL;
#endif
}
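A sketch of the intended caller flow, mirroring what the Python binding below does: one reader per CPU, with each reader's perf fd stored into the BPF_PERF_OUTPUT map at that CPU's index so perf_submit() on that CPU reaches the right buffer (map_fd, ncpu, and the function names on_raw/open_buffers are assumptions for illustration):

static void on_raw(void *cb_cookie, void *raw, int raw_size) {
  /* one submitted event: raw points at the struct written by the probe */
}

int open_buffers(int map_fd, int ncpu) {
  int cpu;
  for (cpu = 0; cpu < ncpu; cpu++) {
    struct perf_reader *reader = bpf_open_perf_buffer(on_raw, NULL, -1, cpu);
    if (!reader)
      return -1;
    int fd = perf_reader_fd(reader);
    if (bpf_update_elem(map_fd, &cpu, &fd, 0) < 0)
      return -1;
    /* keep reader around for perf_reader_poll() */
  }
  return 0;
}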
@@ -26,8 +26,11 @@
#include "libbpf.h"
#include "perf_reader.h"
int perf_reader_page_cnt = 8;
struct perf_reader {
perf_reader_cb cb;
perf_reader_raw_cb raw_cb;
void *cb_cookie; // to be returned in the cb
void *buf; // for keeping segmented data
size_t buf_size;
@@ -35,18 +38,20 @@ struct perf_reader {
int page_size;
int page_cnt;
int fd;
uint32_t type;
uint64_t sample_type;
};
struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_cb, void *cb_cookie) {
struct perf_reader *reader = calloc(1, sizeof(struct perf_reader));
if (!reader)
return NULL;
reader->cb = cb;
reader->raw_cb = raw_cb;
reader->cb_cookie = cb_cookie;
reader->fd = -1;
reader->page_size = getpagesize();
reader->page_cnt = perf_reader_page_cnt;
return reader;
}
@@ -61,18 +66,20 @@ void perf_reader_free(void *ptr) {
}
}
int perf_reader_mmap(struct perf_reader *reader, unsigned type, unsigned long sample_type) {
int mmap_size = reader->page_size * (reader->page_cnt + 1);
if (!reader->cb)
return 0;
if (reader->fd < 0) {
fprintf(stderr, "%s: reader fd is not set\n", __FUNCTION__);
return -1;
}
reader->base = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED, reader->fd, 0);
if (reader->base == MAP_FAILED) {
perror("mmap");
return -1;
}
reader->type = type;
reader->sample_type = sample_type;
return 0;
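The page_cnt + 1 in the mapping size above is the perf mmap ABI: page 0 is the struct perf_event_mmap_page control page (where data_head and data_tail live) and the remaining page_cnt pages form the data ring. A sketch of how a consumer addresses the two regions, which is what event_read() further down does:

struct perf_event_mmap_page *perf_header = reader->base;      /* page 0 */
uint8_t *ring = (uint8_t *)reader->base + reader->page_size;  /* pages 1..N */
uint64_t ring_size = (uint64_t)reader->page_size * reader->page_cnt;
uint64_t offset = read_data_head(perf_header) % ring_size;    /* head wraps */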
@@ -90,7 +97,7 @@ struct perf_sample_trace_kprobe {
uint64_t ip;
};
static void parse_tracepoint(struct perf_reader *reader, void *data, int size) {
uint8_t *ptr = data;
struct perf_event_header *header = (void *)data;
@@ -153,6 +160,40 @@ static void sample_parse(struct perf_reader *reader, void *data, int size) {
reader->cb(reader->cb_cookie, tk ? tk->common.pid : -1, num_callchain, callchain);
}
static void parse_sw(struct perf_reader *reader, void *data, int size) {
uint8_t *ptr = data;
struct perf_event_header *header = (void *)data;
struct {
uint32_t size;
char data[0];
} *raw = NULL;
ptr += sizeof(*header);
if (ptr > (uint8_t *)data + size) {
fprintf(stderr, "%s: corrupt sample header\n", __FUNCTION__);
return;
}
if (reader->sample_type & PERF_SAMPLE_RAW) {
raw = (void *)ptr;
ptr += sizeof(raw->size) + raw->size;
if (ptr > (uint8_t *)data + size) {
fprintf(stderr, "%s: corrupt raw sample\n", __FUNCTION__);
return;
}
}
// sanity check
if (ptr != (uint8_t *)data + size) {
fprintf(stderr, "%s: extra data at end of sample\n", __FUNCTION__);
return;
}
if (reader->raw_cb)
reader->raw_cb(reader->cb_cookie, raw->data, raw->size);
}
static uint64_t read_data_head(struct perf_event_mmap_page *perf_header) {
uint64_t data_head = *((volatile uint64_t *)&perf_header->data_head);
asm volatile("" ::: "memory");
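The rest of read_data_head (returning the loaded value) and its counterpart are cut off by this hunk; the counterpart used by event_read() below publishes the new tail only after the record has been fully consumed. A sketch of its likely shape:

static void write_data_tail(struct perf_event_mmap_page *perf_header,
                            uint64_t data_tail) {
  asm volatile("" ::: "memory"); /* finish reading the record first */
  perf_header->data_tail = data_tail;
}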
@@ -194,12 +235,16 @@ static void event_read(struct perf_reader *reader) {
ptr = reader->buf;
}
if (e->type == PERF_RECORD_LOST) {
  fprintf(stderr, "Lost %lu samples\n", *(uint64_t *)(ptr + sizeof(*e)));
} else if (e->type == PERF_RECORD_SAMPLE) {
  if (reader->type == PERF_TYPE_TRACEPOINT)
    parse_tracepoint(reader, ptr, e->size);
  else if (reader->type == PERF_TYPE_SOFTWARE)
    parse_sw(reader, ptr, e->size);
} else {
  fprintf(stderr, "%s: unknown sample type %d\n", __FUNCTION__, e->type);
}
write_data_tail(perf_header, perf_header->data_tail + e->size);
}
@@ -223,3 +268,10 @@ int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout)
return 0;
}
void perf_reader_set_fd(struct perf_reader *reader, int fd) {
reader->fd = fd;
}
int perf_reader_fd(struct perf_reader *reader) {
return reader->fd;
}
@@ -16,7 +16,9 @@
struct perf_reader;
struct perf_reader * perf_reader_new(perf_reader_cb cb, perf_reader_raw_cb raw_cb, void *cb_cookie);
void perf_reader_free(void *ptr);
int perf_reader_mmap(struct perf_reader *reader, unsigned type, unsigned long sample_type);
int perf_reader_poll(int num_readers, struct perf_reader **readers, int timeout);
int perf_reader_fd(struct perf_reader *reader);
void perf_reader_set_fd(struct perf_reader *reader, int fd);
@@ -42,11 +42,13 @@ int bpf_open_raw_sock(const char *name);
typedef void (*perf_reader_cb)(void *cb_cookie, int pid, uint64_t callchain_num,
void *callchain);
typedef void (*perf_reader_raw_cb)(void *cb_cookie, void *raw, int raw_size);
void * bpf_attach_kprobe(int progfd, const char *event, const char *event_desc,
int pid, int cpu, int group_fd, perf_reader_cb cb,
void *cb_cookie);
int bpf_detach_kprobe(const char *event_desc);
void * bpf_open_perf_buffer(perf_reader_raw_cb raw_cb, void *cb_cookie, int pid, int cpu);
#define LOG_BUF_SIZE 65536
extern char bpf_log_buf[LOG_BUF_SIZE];
@@ -18,6 +18,7 @@ from collections import MutableMapping
import ctypes as ct
import fcntl
import json
import multiprocessing
import os
from subprocess import Popen, PIPE
import sys
@@ -89,14 +90,19 @@ lib.bpf_prog_load.argtypes = [ct.c_int, ct.c_void_p, ct.c_size_t,
lib.bpf_attach_kprobe.restype = ct.c_void_p
_CB_TYPE = ct.CFUNCTYPE(None, ct.py_object, ct.c_int,
ct.c_ulonglong, ct.POINTER(ct.c_ulonglong))
_RAW_CB_TYPE = ct.CFUNCTYPE(None, ct.py_object, ct.c_void_p, ct.c_int)
lib.bpf_attach_kprobe.argtypes = [ct.c_int, ct.c_char_p, ct.c_char_p, ct.c_int,
ct.c_int, ct.c_int, _CB_TYPE, ct.py_object]
lib.bpf_detach_kprobe.restype = ct.c_int
lib.bpf_detach_kprobe.argtypes = [ct.c_char_p]
lib.bpf_open_perf_buffer.restype = ct.c_void_p
lib.bpf_open_perf_buffer.argtypes = [_RAW_CB_TYPE, ct.py_object, ct.c_int, ct.c_int]
lib.perf_reader_poll.restype = ct.c_int
lib.perf_reader_poll.argtypes = [ct.c_int, ct.POINTER(ct.c_void_p), ct.c_int]
lib.perf_reader_free.restype = None
lib.perf_reader_free.argtypes = [ct.c_void_p]
lib.perf_reader_fd.restype = int
lib.perf_reader_fd.argtypes = [ct.c_void_p]
open_kprobes = {}
tracefile = None
@@ -111,6 +117,7 @@ stars_max = 40
def cleanup_kprobes():
for k, v in open_kprobes.items():
lib.perf_reader_free(v)
if isinstance(k, str):
desc = "-:kprobes/%s" % k
lib.bpf_detach_kprobe(desc.encode("ascii"))
open_kprobes.clear()
@@ -126,6 +133,7 @@ class BPF(object):
HASH = 1
ARRAY = 2
PROG_ARRAY = 3
PERF_EVENT_ARRAY = 4
class Function(object):
def __init__(self, bpf, name, fd):
@@ -141,6 +149,7 @@ class BPF(object):
self.Key = keytype
self.Leaf = leaftype
self.ttype = lib.bpf_table_type_id(self.bpf.module, self.map_id)
self._cbs = {}
def key_sprintf(self, key):
key_p = ct.pointer(key)
@@ -178,6 +187,35 @@ class BPF(object):
raise Exception("Could not scanf leaf")
return leaf
def open_perf_buffer(self, callback):
"""open_perf_buffers(callback)
Opens a set of per-cpu ring buffer to receive custom perf event
data from the bpf program. The callback will be invoked for each
event submitted from the kernel, up to millions per second.
"""
for i in range(0, multiprocessing.cpu_count()):
self._open_perf_buffer(i, callback)
def _open_perf_buffer(self, cpu, callback):
fn = _RAW_CB_TYPE(lambda _, data, size: callback(cpu, data, size))
reader = lib.bpf_open_perf_buffer(fn, None, -1, cpu)
if not reader:
raise Exception("Could not open perf buffer")
fd = lib.perf_reader_fd(reader)
self[self.Key(cpu)] = self.Leaf(fd)
open_kprobes[(id(self), cpu)] = reader
# keep a refcnt
self._cbs[cpu] = fn
def close_perf_buffer(self, key):
reader = open_kprobes.get((id(self), key))
if reader:
lib.perf_reader_free(reader)
del(open_kprobes[(id(self), key)])
del self._cbs[key]
def __getitem__(self, key):
key_p = ct.pointer(key)
leaf = self.Leaf()
@@ -208,7 +246,7 @@ class BPF(object):
ttype = lib.bpf_table_type_id(self.bpf.module, self.map_id)
# Deleting from array type maps does not have an effect, so
# zero out the entry instead.
if ttype in (BPF.ARRAY, BPF.PROG_ARRAY, BPF.PERF_EVENT_ARRAY):
leaf = self.Leaf()
leaf_p = ct.pointer(leaf)
res = lib.bpf_update_elem(self.map_fd,
@@ -216,6 +254,8 @@
ct.cast(leaf_p, ct.c_void_p), 0)
if res < 0:
raise Exception("Could not clear item")
if ttype == BPF.PERF_EVENT_ARRAY:
self.close_perf_buffer(key)
else:
res = lib.bpf_delete_elem(self.map_fd,
ct.cast(key_p, ct.c_void_p))
@@ -786,11 +826,11 @@ class BPF(object):
Poll from the ring buffers for all of the open kprobes, calling the
cb() that was given in the BPF constructor for each entry.
"""
try:
    readers = (ct.c_void_p * len(open_kprobes))()
    for i, v in enumerate(open_kprobes.values()):
        readers[i] = v
    lib.perf_reader_poll(len(open_kprobes), readers, timeout)
except KeyboardInterrupt:
    exit()