Commit 06d90d3d authored by Sasha Goldshtein

cpudist: Use `finish_task_switch` kprobe instead of `sched_switch` tracepoint

The `sched_switch` tracepoint approach requires storing the previous
task's tgid in a map and fetching it from there, because it is not
available as a tracepoint argument. Instead, placing a kprobe on the
`finish_task_switch` function allows cleanly fetching the previous
task's pid and tgid from the task_struct.
parent 3c976bbd
...@@ -19,11 +19,6 @@ This tool uses in-kernel eBPF maps for storing timestamps and the histogram, ...@@ -19,11 +19,6 @@ This tool uses in-kernel eBPF maps for storing timestamps and the histogram,
for efficiency. Despite this, the overhead of this tool may become significant for efficiency. Despite this, the overhead of this tool may become significant
for some workloads: see the OVERHEAD section. for some workloads: see the OVERHEAD section.
This tool uses the sched:sched_switch kernel tracepoint to determine when a
task is scheduled and descheduled. If the tracepoint arguments change in the
future, this tool will have to be updated. Still, it is more reliable than
using kprobes on the respective kernel functions directly.
Since this uses BPF, only the root user can use this tool. Since this uses BPF, only the root user can use this tool.
.SH REQUIREMENTS .SH REQUIREMENTS
CONFIG_BPF and bcc. CONFIG_BPF and bcc.
......
...@@ -48,12 +48,9 @@ args = parser.parse_args() ...@@ -48,12 +48,9 @@ args = parser.parse_args()
countdown = int(args.count) countdown = int(args.count)
debug = 0 debug = 0
tp = Tracepoint.enable_tracepoint("sched", "sched_switch") bpf_text = """#include <uapi/linux/ptrace.h>
bpf_text = "#include <uapi/linux/ptrace.h>\n" #include <linux/sched.h>
bpf_text += "#include <linux/sched.h>\n" """
bpf_text += tp.generate_decl()
bpf_text += tp.generate_entry_probe()
bpf_text += tp.generate_struct()
if not args.offcpu: if not args.offcpu:
bpf_text += "#define ONCPU\n" bpf_text += "#define ONCPU\n"
...@@ -66,17 +63,8 @@ typedef struct pid_key { ...@@ -66,17 +63,8 @@ typedef struct pid_key {
BPF_HASH(start, u32, u64); BPF_HASH(start, u32, u64);
BPF_HASH(tgid_for_pid, u32, u32);
STORAGE STORAGE
static inline u32 get_tgid(u32 pid)
{
u32 *stored_tgid = tgid_for_pid.lookup(&pid);
if (stored_tgid != 0)
return *stored_tgid;
return 0xffffffff;
}
static inline void store_start(u32 tgid, u32 pid, u64 ts) static inline void store_start(u32 tgid, u32 pid, u64 ts)
{ {
if (FILTER) if (FILTER)
...@@ -99,32 +87,19 @@ static inline void update_hist(u32 tgid, u32 pid, u64 ts) ...@@ -99,32 +87,19 @@ static inline void update_hist(u32 tgid, u32 pid, u64 ts)
STORE STORE
} }
int sched_switch(struct pt_regs *ctx) int sched_switch(struct pt_regs *ctx, struct task_struct *prev)
{ {
u64 ts = bpf_ktime_get_ns(); u64 ts = bpf_ktime_get_ns();
u64 pid_tgid = bpf_get_current_pid_tgid(); u64 pid_tgid = bpf_get_current_pid_tgid();
u32 tgid = pid_tgid >> 32, pid = pid_tgid; u32 tgid = pid_tgid >> 32, pid = pid_tgid;
// Keep a mapping of tgid for pid because when sched_switch hits,
// we only have the tgid information for the *current* pid, but not
// for the previous one.
tgid_for_pid.update(&pid, &tgid);
u64 *di = __trace_di.lookup(&pid_tgid);
if (di == 0)
return 0;
struct sched_switch_trace_entry args = {};
bpf_probe_read(&args, sizeof(args), (void *)*di);
#ifdef ONCPU #ifdef ONCPU
if (args.prev_state == TASK_RUNNING) { if (prev->state == TASK_RUNNING) {
#else #else
if (1) { if (1) {
#endif #endif
u32 prev_pid = args.prev_pid; u32 prev_pid = prev->pid;
u32 prev_tgid = get_tgid(prev_pid); u32 prev_tgid = prev->tgid;
if (prev_tgid == 0xffffffff)
goto BAIL;
#ifdef ONCPU #ifdef ONCPU
update_hist(prev_tgid, prev_pid, ts); update_hist(prev_tgid, prev_pid, ts);
#else #else
...@@ -173,8 +148,7 @@ if debug: ...@@ -173,8 +148,7 @@ if debug:
print(bpf_text) print(bpf_text)
b = BPF(text=bpf_text) b = BPF(text=bpf_text)
Tracepoint.attach(b) b.attach_kprobe(event="finish_task_switch", fn_name="sched_switch")
b.attach_kprobe(event="perf_trace_sched_switch", fn_name="sched_switch")
print("Tracing %s-CPU time... Hit Ctrl-C to end." % print("Tracing %s-CPU time... Hit Ctrl-C to end." %
("off" if args.offcpu else "on")) ("off" if args.offcpu else "on"))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment