Commit a6ffe7b9 authored by Alexei Starovoitov's avatar Alexei Starovoitov Committed by David S. Miller

samples/bpf: offwaketime example

This is simplified version of Brendan Gregg's offwaketime:
This program shows kernel stack traces and task names that were blocked and
"off-CPU", along with the stack traces and task names for the threads that woke
them, and the total elapsed time from when they blocked to when they were woken
up. The combined stacks, task names, and total time is summarized in kernel
context for efficiency.

Example:
$ sudo ./offwaketime | flamegraph.pl > demo.svg
Open demo.svg in the browser as FlameGraph visualization.
Signed-off-by: default avatarAlexei Starovoitov <ast@kernel.org>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent d5a3b1f6
...@@ -16,6 +16,7 @@ hostprogs-y += tracex5 ...@@ -16,6 +16,7 @@ hostprogs-y += tracex5
hostprogs-y += tracex6 hostprogs-y += tracex6
hostprogs-y += trace_output hostprogs-y += trace_output
hostprogs-y += lathist hostprogs-y += lathist
hostprogs-y += offwaketime
test_verifier-objs := test_verifier.o libbpf.o test_verifier-objs := test_verifier.o libbpf.o
test_maps-objs := test_maps.o libbpf.o test_maps-objs := test_maps.o libbpf.o
...@@ -32,6 +33,7 @@ tracex5-objs := bpf_load.o libbpf.o tracex5_user.o ...@@ -32,6 +33,7 @@ tracex5-objs := bpf_load.o libbpf.o tracex5_user.o
tracex6-objs := bpf_load.o libbpf.o tracex6_user.o tracex6-objs := bpf_load.o libbpf.o tracex6_user.o
trace_output-objs := bpf_load.o libbpf.o trace_output_user.o trace_output-objs := bpf_load.o libbpf.o trace_output_user.o
lathist-objs := bpf_load.o libbpf.o lathist_user.o lathist-objs := bpf_load.o libbpf.o lathist_user.o
offwaketime-objs := bpf_load.o libbpf.o offwaketime_user.o
# Tell kbuild to always build the programs # Tell kbuild to always build the programs
always := $(hostprogs-y) always := $(hostprogs-y)
...@@ -47,6 +49,7 @@ always += tracex6_kern.o ...@@ -47,6 +49,7 @@ always += tracex6_kern.o
always += trace_output_kern.o always += trace_output_kern.o
always += tcbpf1_kern.o always += tcbpf1_kern.o
always += lathist_kern.o always += lathist_kern.o
always += offwaketime_kern.o
HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(objtree)/usr/include
...@@ -63,6 +66,7 @@ HOSTLOADLIBES_tracex5 += -lelf ...@@ -63,6 +66,7 @@ HOSTLOADLIBES_tracex5 += -lelf
HOSTLOADLIBES_tracex6 += -lelf HOSTLOADLIBES_tracex6 += -lelf
HOSTLOADLIBES_trace_output += -lelf -lrt HOSTLOADLIBES_trace_output += -lelf -lrt
HOSTLOADLIBES_lathist += -lelf HOSTLOADLIBES_lathist += -lelf
HOSTLOADLIBES_offwaketime += -lelf
# point this to your LLVM backend with bpf support # point this to your LLVM backend with bpf support
LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
......
...@@ -39,6 +39,8 @@ static int (*bpf_redirect)(int ifindex, int flags) = ...@@ -39,6 +39,8 @@ static int (*bpf_redirect)(int ifindex, int flags) =
(void *) BPF_FUNC_redirect; (void *) BPF_FUNC_redirect;
static int (*bpf_perf_event_output)(void *ctx, void *map, int index, void *data, int size) = static int (*bpf_perf_event_output)(void *ctx, void *map, int index, void *data, int size) =
(void *) BPF_FUNC_perf_event_output; (void *) BPF_FUNC_perf_event_output;
static int (*bpf_get_stackid)(void *ctx, void *map, int flags) =
(void *) BPF_FUNC_get_stackid;
/* llvm builtin functions that eBPF C program may use to /* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions * emit BPF_LD_ABS and BPF_LD_IND instructions
......
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"
#include <uapi/linux/ptrace.h>
#include <uapi/linux/perf_event.h>
#include <linux/version.h>
#include <linux/sched.h>
#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;})
#define MINBLOCK_US 1
struct key_t {
char waker[TASK_COMM_LEN];
char target[TASK_COMM_LEN];
u32 wret;
u32 tret;
};
struct bpf_map_def SEC("maps") counts = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(struct key_t),
.value_size = sizeof(u64),
.max_entries = 10000,
};
struct bpf_map_def SEC("maps") start = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(u32),
.value_size = sizeof(u64),
.max_entries = 10000,
};
struct wokeby_t {
char name[TASK_COMM_LEN];
u32 ret;
};
struct bpf_map_def SEC("maps") wokeby = {
.type = BPF_MAP_TYPE_HASH,
.key_size = sizeof(u32),
.value_size = sizeof(struct wokeby_t),
.max_entries = 10000,
};
struct bpf_map_def SEC("maps") stackmap = {
.type = BPF_MAP_TYPE_STACK_TRACE,
.key_size = sizeof(u32),
.value_size = PERF_MAX_STACK_DEPTH * sizeof(u64),
.max_entries = 10000,
};
#define STACKID_FLAGS (0 | BPF_F_FAST_STACK_CMP)
SEC("kprobe/try_to_wake_up")
int waker(struct pt_regs *ctx)
{
struct task_struct *p = (void *) PT_REGS_PARM1(ctx);
struct wokeby_t woke = {};
u32 pid;
pid = _(p->pid);
bpf_get_current_comm(&woke.name, sizeof(woke.name));
woke.ret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS);
bpf_map_update_elem(&wokeby, &pid, &woke, BPF_ANY);
return 0;
}
static inline int update_counts(struct pt_regs *ctx, u32 pid, u64 delta)
{
struct key_t key = {};
struct wokeby_t *woke;
u64 zero = 0, *val;
bpf_get_current_comm(&key.target, sizeof(key.target));
key.tret = bpf_get_stackid(ctx, &stackmap, STACKID_FLAGS);
woke = bpf_map_lookup_elem(&wokeby, &pid);
if (woke) {
key.wret = woke->ret;
__builtin_memcpy(&key.waker, woke->name, TASK_COMM_LEN);
bpf_map_delete_elem(&wokeby, &pid);
}
val = bpf_map_lookup_elem(&counts, &key);
if (!val) {
bpf_map_update_elem(&counts, &key, &zero, BPF_NOEXIST);
val = bpf_map_lookup_elem(&counts, &key);
if (!val)
return 0;
}
(*val) += delta;
return 0;
}
SEC("kprobe/finish_task_switch")
int oncpu(struct pt_regs *ctx)
{
struct task_struct *p = (void *) PT_REGS_PARM1(ctx);
u64 delta, ts, *tsp;
u32 pid;
/* record previous thread sleep time */
pid = _(p->pid);
ts = bpf_ktime_get_ns();
bpf_map_update_elem(&start, &pid, &ts, BPF_ANY);
/* calculate current thread's delta time */
pid = bpf_get_current_pid_tgid();
tsp = bpf_map_lookup_elem(&start, &pid);
if (!tsp)
/* missed start or filtered */
return 0;
delta = bpf_ktime_get_ns() - *tsp;
bpf_map_delete_elem(&start, &pid);
delta = delta / 1000;
if (delta < MINBLOCK_US)
return 0;
return update_counts(ctx, pid, delta);
}
char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;
/* Copyright (c) 2016 Facebook
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*/
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <signal.h>
#include <linux/bpf.h>
#include <string.h>
#include <linux/perf_event.h>
#include <errno.h>
#include <assert.h>
#include <stdbool.h>
#include <sys/resource.h>
#include "libbpf.h"
#include "bpf_load.h"
#define MAX_SYMS 300000
#define PRINT_RAW_ADDR 0
static struct ksym {
long addr;
char *name;
} syms[MAX_SYMS];
static int sym_cnt;
static int ksym_cmp(const void *p1, const void *p2)
{
return ((struct ksym *)p1)->addr - ((struct ksym *)p2)->addr;
}
static int load_kallsyms(void)
{
FILE *f = fopen("/proc/kallsyms", "r");
char func[256], buf[256];
char symbol;
void *addr;
int i = 0;
if (!f)
return -ENOENT;
while (!feof(f)) {
if (!fgets(buf, sizeof(buf), f))
break;
if (sscanf(buf, "%p %c %s", &addr, &symbol, func) != 3)
break;
if (!addr)
continue;
syms[i].addr = (long) addr;
syms[i].name = strdup(func);
i++;
}
sym_cnt = i;
qsort(syms, sym_cnt, sizeof(struct ksym), ksym_cmp);
return 0;
}
static void *search(long key)
{
int start = 0, end = sym_cnt;
int result;
while (start < end) {
size_t mid = start + (end - start) / 2;
result = key - syms[mid].addr;
if (result < 0)
end = mid;
else if (result > 0)
start = mid + 1;
else
return &syms[mid];
}
if (start >= 1 && syms[start - 1].addr < key &&
key < syms[start].addr)
/* valid ksym */
return &syms[start - 1];
/* out of range. return _stext */
return &syms[0];
}
static void print_ksym(__u64 addr)
{
struct ksym *sym;
if (!addr)
return;
sym = search(addr);
if (PRINT_RAW_ADDR)
printf("%s/%llx;", sym->name, addr);
else
printf("%s;", sym->name);
}
#define TASK_COMM_LEN 16
struct key_t {
char waker[TASK_COMM_LEN];
char target[TASK_COMM_LEN];
__u32 wret;
__u32 tret;
};
static void print_stack(struct key_t *key, __u64 count)
{
__u64 ip[PERF_MAX_STACK_DEPTH] = {};
static bool warned;
int i;
printf("%s;", key->target);
if (bpf_lookup_elem(map_fd[3], &key->tret, ip) != 0) {
printf("---;");
} else {
for (i = PERF_MAX_STACK_DEPTH - 1; i >= 0; i--)
print_ksym(ip[i]);
}
printf("-;");
if (bpf_lookup_elem(map_fd[3], &key->wret, ip) != 0) {
printf("---;");
} else {
for (i = 0; i < PERF_MAX_STACK_DEPTH; i++)
print_ksym(ip[i]);
}
printf(";%s %lld\n", key->waker, count);
if ((key->tret == -EEXIST || key->wret == -EEXIST) && !warned) {
printf("stackmap collisions seen. Consider increasing size\n");
warned = true;
} else if (((int)(key->tret) < 0 || (int)(key->wret) < 0)) {
printf("err stackid %d %d\n", key->tret, key->wret);
}
}
static void print_stacks(int fd)
{
struct key_t key = {}, next_key;
__u64 value;
while (bpf_get_next_key(fd, &key, &next_key) == 0) {
bpf_lookup_elem(fd, &next_key, &value);
print_stack(&next_key, value);
key = next_key;
}
}
static void int_exit(int sig)
{
print_stacks(map_fd[0]);
exit(0);
}
int main(int argc, char **argv)
{
struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
char filename[256];
int delay = 1;
snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
setrlimit(RLIMIT_MEMLOCK, &r);
signal(SIGINT, int_exit);
if (load_kallsyms()) {
printf("failed to process /proc/kallsyms\n");
return 2;
}
if (load_bpf_file(filename)) {
printf("%s", bpf_log_buf);
return 1;
}
if (argc > 1)
delay = atoi(argv[1]);
sleep(delay);
print_stacks(map_fd[0]);
return 0;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment