Commit cf5f5cea authored by Yonghong Song, committed by David S. Miller

bpf: add support for sys_enter_* and sys_exit_* tracepoints

Currently, bpf programs cannot be attached to sys_enter_* and sys_exit_*
style tracepoints. The iovisor/bcc issue #748
(https://github.com/iovisor/bcc/issues/748) documents this issue.
For example, if you try to attach a bpf program to the tracepoint
syscalls/sys_enter_newfstat, you will get the following error:
   # ./tools/trace.py t:syscalls:sys_enter_newfstat
   Ioctl(PERF_EVENT_IOC_SET_BPF): Invalid argument
   Failed to attach BPF to tracepoint

The main reason is that syscalls/sys_enter_* and syscalls/sys_exit_*
tracepoints are treated differently from other tracepoints and there
is no bpf hook for them.

This patch adds bpf support for these syscalls tracepoints by
  . permitting bpf attachment in ioctl PERF_EVENT_IOC_SET_BPF
  . calling bpf programs in perf_syscall_enter and perf_syscall_exit

The legality of bpf program ctx access is also checked.
The function trace_event_get_offsets returns the correct maximum offset for
each specific syscall tracepoint, which is compared against the maximum
offset accessed by the bpf program.
Signed-off-by: Yonghong Song <yhs@fb.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent d226a2b8
...@@ -172,8 +172,20 @@ extern struct trace_event_functions exit_syscall_print_funcs; ...@@ -172,8 +172,20 @@ extern struct trace_event_functions exit_syscall_print_funcs;
static struct syscall_metadata __used \ static struct syscall_metadata __used \
__attribute__((section("__syscalls_metadata"))) \ __attribute__((section("__syscalls_metadata"))) \
*__p_syscall_meta_##sname = &__syscall_meta_##sname; *__p_syscall_meta_##sname = &__syscall_meta_##sname;
/*
 * is_syscall_trace_event - test whether an event is a syscall tracepoint
 * @tp_event: trace event to classify
 *
 * Returns non-zero when @tp_event's class is either
 * event_class_syscall_enter or event_class_syscall_exit, zero otherwise.
 */
static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
{
	if (tp_event->class == &event_class_syscall_enter)
		return 1;

	return tp_event->class == &event_class_syscall_exit;
}
#else #else
#define SYSCALL_METADATA(sname, nb, ...) #define SYSCALL_METADATA(sname, nb, ...)
/*
 * Stub for the #else branch, where SYSCALL_METADATA() expands to nothing:
 * @tp_event is not inspected and the function always reports 0, i.e. no
 * event is treated as a syscall trace event.
 */
static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
{
return 0;
}
#endif #endif
#define SYSCALL_DEFINE0(sname) \ #define SYSCALL_DEFINE0(sname) \
......
...@@ -8050,7 +8050,7 @@ static void perf_event_free_bpf_handler(struct perf_event *event) ...@@ -8050,7 +8050,7 @@ static void perf_event_free_bpf_handler(struct perf_event *event)
static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
{ {
bool is_kprobe, is_tracepoint; bool is_kprobe, is_tracepoint, is_syscall_tp;
struct bpf_prog *prog; struct bpf_prog *prog;
if (event->attr.type != PERF_TYPE_TRACEPOINT) if (event->attr.type != PERF_TYPE_TRACEPOINT)
...@@ -8061,7 +8061,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) ...@@ -8061,7 +8061,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE; is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT; is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
if (!is_kprobe && !is_tracepoint) is_syscall_tp = is_syscall_trace_event(event->tp_event);
if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
/* bpf programs can only be attached to u/kprobe or tracepoint */ /* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL; return -EINVAL;
...@@ -8070,13 +8071,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) ...@@ -8070,13 +8071,14 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
return PTR_ERR(prog); return PTR_ERR(prog);
if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) || if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
(is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) { (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
(is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
/* valid fd, but invalid bpf program type */ /* valid fd, but invalid bpf program type */
bpf_prog_put(prog); bpf_prog_put(prog);
return -EINVAL; return -EINVAL;
} }
if (is_tracepoint) { if (is_tracepoint || is_syscall_tp) {
int off = trace_event_get_offsets(event->tp_event); int off = trace_event_get_offsets(event->tp_event);
if (prog->aux->max_ctx_offset > off) { if (prog->aux->max_ctx_offset > off) {
......
...@@ -559,11 +559,29 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); ...@@ -559,11 +559,29 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
static int sys_perf_refcount_enter; static int sys_perf_refcount_enter;
static int sys_perf_refcount_exit; static int sys_perf_refcount_exit;
/*
 * perf_call_bpf_enter - run a bpf program on a syscall-enter record
 * @prog:     bpf program to invoke
 * @regs:     register state; its pointer is placed at the start of the context
 * @sys_data: syscall metadata; nb_args gives the number of arguments copied
 * @rec:      syscall-enter record supplying the syscall number and arguments
 *
 * Builds an on-stack context laid out as { regs, syscall_nr, args[] } and
 * hands it to trace_call_bpf(). The value returned by trace_call_bpf() is
 * passed back unchanged.
 */
static int perf_call_bpf_enter(struct bpf_prog *prog, struct pt_regs *regs,
			       struct syscall_metadata *sys_data,
			       struct syscall_trace_enter *rec)
{
	/*
	 * args[] is sized at run time from sys_data->nb_args, i.e. this is a
	 * variable-length array inside a struct (a GNU C extension).
	 */
	struct syscall_tp_t {
		unsigned long long regs;
		unsigned long syscall_nr;
		unsigned long args[sys_data->nb_args];
	} ctx;
	int idx = 0;

	/* Store the pt_regs pointer in the leading bytes of the context. */
	*(struct pt_regs **)&ctx = regs;
	ctx.syscall_nr = rec->nr;

	while (idx < sys_data->nb_args) {
		ctx.args[idx] = rec->args[idx];
		idx++;
	}

	return trace_call_bpf(prog, &ctx);
}
static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
{ {
struct syscall_metadata *sys_data; struct syscall_metadata *sys_data;
struct syscall_trace_enter *rec; struct syscall_trace_enter *rec;
struct hlist_head *head; struct hlist_head *head;
struct bpf_prog *prog;
int syscall_nr; int syscall_nr;
int rctx; int rctx;
int size; int size;
...@@ -578,8 +596,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) ...@@ -578,8 +596,9 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (!sys_data) if (!sys_data)
return; return;
prog = READ_ONCE(sys_data->enter_event->prog);
head = this_cpu_ptr(sys_data->enter_event->perf_events); head = this_cpu_ptr(sys_data->enter_event->perf_events);
if (hlist_empty(head)) if (!prog && hlist_empty(head))
return; return;
/* get the size after alignment with the u32 buffer size field */ /* get the size after alignment with the u32 buffer size field */
...@@ -594,6 +613,13 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) ...@@ -594,6 +613,13 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
rec->nr = syscall_nr; rec->nr = syscall_nr;
syscall_get_arguments(current, regs, 0, sys_data->nb_args, syscall_get_arguments(current, regs, 0, sys_data->nb_args,
(unsigned long *)&rec->args); (unsigned long *)&rec->args);
if ((prog && !perf_call_bpf_enter(prog, regs, sys_data, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
}
perf_trace_buf_submit(rec, size, rctx, perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs, sys_data->enter_event->event.type, 1, regs,
head, NULL); head, NULL);
...@@ -633,11 +659,26 @@ static void perf_sysenter_disable(struct trace_event_call *call) ...@@ -633,11 +659,26 @@ static void perf_sysenter_disable(struct trace_event_call *call)
mutex_unlock(&syscall_trace_lock); mutex_unlock(&syscall_trace_lock);
} }
/*
 * perf_call_bpf_exit - run a bpf program on a syscall-exit record
 * @prog: bpf program to invoke
 * @regs: register state; its pointer is placed at the start of the context
 * @rec:  syscall-exit record supplying the syscall number and return value
 *
 * Builds an on-stack context laid out as { regs, syscall_nr, ret } and
 * hands it to trace_call_bpf(). The value returned by trace_call_bpf() is
 * passed back unchanged.
 */
static int perf_call_bpf_exit(struct bpf_prog *prog, struct pt_regs *regs,
			      struct syscall_trace_exit *rec)
{
	struct syscall_tp_t {
		unsigned long long regs;
		unsigned long syscall_nr;
		unsigned long ret;
	} ctx;

	ctx.ret = rec->ret;
	ctx.syscall_nr = rec->nr;
	/* Store the pt_regs pointer in the leading bytes of the context. */
	*(struct pt_regs **)&ctx = regs;

	return trace_call_bpf(prog, &ctx);
}
static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
{ {
struct syscall_metadata *sys_data; struct syscall_metadata *sys_data;
struct syscall_trace_exit *rec; struct syscall_trace_exit *rec;
struct hlist_head *head; struct hlist_head *head;
struct bpf_prog *prog;
int syscall_nr; int syscall_nr;
int rctx; int rctx;
int size; int size;
...@@ -652,8 +693,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) ...@@ -652,8 +693,9 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!sys_data) if (!sys_data)
return; return;
prog = READ_ONCE(sys_data->exit_event->prog);
head = this_cpu_ptr(sys_data->exit_event->perf_events); head = this_cpu_ptr(sys_data->exit_event->perf_events);
if (hlist_empty(head)) if (!prog && hlist_empty(head))
return; return;
/* We can probably do that at build time */ /* We can probably do that at build time */
...@@ -666,6 +708,13 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) ...@@ -666,6 +708,13 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->nr = syscall_nr; rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs); rec->ret = syscall_get_return_value(current, regs);
if ((prog && !perf_call_bpf_exit(prog, regs, rec)) ||
hlist_empty(head)) {
perf_swevent_put_recursion_context(rctx);
return;
}
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
1, regs, head, NULL); 1, regs, head, NULL);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment