Commit c7206205 authored by Peter Zijlstra, committed by Ingo Molnar

perf: Fix mmap_page capabilities and docs

Complete the syscall-less self-profiling feature and address
all complaints, namely:

 - capabilities, so we can detect what is actually available at runtime

     Add a capabilities field to perf_event_mmap_page to indicate
     what is actually available for use.

 - on x86: RDPMC weirdness due to being 40/48 bits and not sign-extending
   properly.

 - ABI documentation as to how all this stuff works.

Also improve the documentation for the new features.
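
For illustration only (not part of the patch): a minimal user-space read
sketch along the lines of the ABI documented below. It assumes x86 and
headers that already carry the new fields; mmap_read_self() and the
rdpmc()/barrier() helpers are names made up for this sketch.

    #include <stdint.h>
    #include <linux/perf_event.h>

    #define barrier() asm volatile("" ::: "memory")

    static inline uint64_t rdpmc(uint32_t counter)
    {
            uint32_t low, high;

            asm volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
            return low | ((uint64_t)high) << 32;
    }

    /* pc points at the mmap()ed first page of a perf_event_open() fd. */
    static uint64_t mmap_read_self(volatile struct perf_event_mmap_page *pc)
    {
            uint32_t seq, idx, width;
            uint64_t count;
            int64_t pmc;

            do {
                    seq = pc->lock;
                    barrier();

                    idx = pc->index;
                    count = pc->offset;
                    if (pc->cap_usr_rdpmc && idx) {
                            width = pc->pmc_width;
                            pmc = rdpmc(idx - 1);
                            pmc <<= 64 - width;     /* sign-extend the   */
                            pmc >>= 64 - width;     /* 40/48 bit counter */
                            count += pmc;
                    }

                    barrier();
            } while (pc->lock != seq);

            return count;
    }
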
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@google.com>
Cc: Vince Weaver <vweaver1@eecs.utk.edu>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Link: http://lkml.kernel.org/r/1332433596.2487.33.camel@twins
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent c5bc4377
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1622,6 +1622,9 @@ static int x86_pmu_event_idx(struct perf_event *event)
 {
         int idx = event->hw.idx;
 
+        if (!x86_pmu.attr_rdpmc)
+                return 0;
+
         if (x86_pmu.num_counters_fixed && idx >= X86_PMC_IDX_FIXED) {
                 idx -= X86_PMC_IDX_FIXED;
                 idx |= 1 << 30;
@@ -1706,14 +1709,19 @@ static struct pmu pmu = {
         .flush_branch_stack     = x86_pmu_flush_branch_stack,
 };
 
-void perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
 {
+        userpg->cap_usr_time = 0;
+        userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc;
+        userpg->pmc_width = x86_pmu.cntval_bits;
+
         if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                 return;
 
         if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
                 return;
 
+        userpg->cap_usr_time = 1;
         userpg->time_mult = this_cpu_read(cyc2ns);
         userpg->time_shift = CYC2NS_SCALE_FACTOR;
         userpg->time_offset = this_cpu_read(cyc2ns_offset) - now;
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -299,18 +299,31 @@ struct perf_event_mmap_page {
         /*
          * Bits needed to read the hw events in user-space.
          *
-         *   u32 seq;
-         *   s64 count;
+         *   u32 seq, time_mult, time_shift, idx, width;
+         *   u64 count, enabled, running;
+         *   u64 cyc, time_offset;
+         *   s64 pmc = 0;
          *
          *   do {
          *     seq = pc->lock;
-         *
          *     barrier()
-         *     if (pc->index) {
-         *       count = pmc_read(pc->index - 1);
-         *       count += pc->offset;
-         *     } else
-         *       goto regular_read;
+         *
+         *     enabled = pc->time_enabled;
+         *     running = pc->time_running;
+         *
+         *     if (pc->cap_usr_time && enabled != running) {
+         *       cyc = rdtsc();
+         *       time_offset = pc->time_offset;
+         *       time_mult   = pc->time_mult;
+         *       time_shift  = pc->time_shift;
+         *     }
+         *
+         *     idx = pc->index;
+         *     count = pc->offset;
+         *     if (pc->cap_usr_rdpmc && idx) {
+         *       width = pc->pmc_width;
+         *       pmc = rdpmc(idx - 1);
+         *     }
          *
          *     barrier();
          *   } while (pc->lock != seq);
@@ -323,14 +336,57 @@ struct perf_event_mmap_page {
         __s64   offset;                 /* add to hardware event value */
         __u64   time_enabled;           /* time event active */
         __u64   time_running;           /* time event on cpu */
-        __u32   time_mult, time_shift;
+        union {
+                __u64   capabilities;
+                __u64   cap_usr_time  : 1,
+                        cap_usr_rdpmc : 1,
+                        cap_____res   : 62;
+        };
+
+        /*
+         * If cap_usr_rdpmc this field provides the bit-width of the value
+         * read using the rdpmc() or equivalent instruction. This can be used
+         * to sign extend the result like:
+         *
+         *   pmc <<= 64 - width;
+         *   pmc >>= 64 - width; // signed shift right
+         *   count += pmc;
+         */
+        __u16   pmc_width;
+
+        /*
+         * If cap_usr_time the below fields can be used to compute the time
+         * delta since time_enabled (in ns) using rdtsc or similar.
+         *
+         *   u64 quot, rem;
+         *   u64 delta;
+         *
+         *   quot = (cyc >> time_shift);
+         *   rem = cyc & ((1 << time_shift) - 1);
+         *   delta = time_offset + quot * time_mult +
+         *              ((rem * time_mult) >> time_shift);
+         *
+         * Where time_offset,time_mult,time_shift and cyc are read in the
+         * seqcount loop described above. This delta can then be added to
+         * enabled and possible running (if idx), improving the scaling:
+         *
+         *   enabled += delta;
+         *   if (idx)
+         *     running += delta;
+         *
+         *   quot = count / running;
+         *   rem  = count % running;
+         *   count = quot * enabled + (rem * enabled) / running;
+         */
+        __u16   time_shift;
+        __u32   time_mult;
         __u64   time_offset;
 
         /*
          * Hole for extension of the self monitor capabilities
          */
 
-        __u64   __reserved[121];        /* align to 1k */
+        __u64   __reserved[120];        /* align to 1k */
 
         /*
          * Control data for the mmap() data buffer.
@@ -347,6 +403,13 @@ struct perf_event_mmap_page {
         __u64   data_tail;              /* user-space written tail */
 };
 
+/*
+ * Build time assertion that we keep the data_head at the intended location.
+ * IOW, validation we got the __reserved[] size right.
+ */
+extern char __assert_mmap_data_head_offset
+        [1 - 2*!!(offsetof(struct perf_event_mmap_page, data_head) != 1024)];
+
 #define PERF_RECORD_MISC_CPUMODE_MASK           (7 << 0)
 #define PERF_RECORD_MISC_CPUMODE_UNKNOWN        (0 << 0)
 #define PERF_RECORD_MISC_KERNEL                 (1 << 0)
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3348,7 +3348,7 @@ static void calc_timer_values(struct perf_event *event,
         *running = ctx_time - event->tstamp_running;
 }
 
-void __weak perf_update_user_clock(struct perf_event_mmap_page *userpg, u64 now)
+void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
 {
 }
@@ -3398,7 +3398,7 @@ void perf_event_update_userpage(struct perf_event *event)
         userpg->time_running = running +
                         atomic64_read(&event->child_total_time_running);
 
-        perf_update_user_clock(userpg, now);
+        arch_perf_update_userpage(userpg, now);
 
         barrier();
         ++userpg->lock;
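
Again for illustration (not part of the patch), continuing the sketch
from the commit message above: the time scaling documented in the
perf_event_mmap_page comment could look like this in user space.
scale_count() and rdtsc() are made-up names, and for brevity the time_*
fields are read directly here; a careful reader would capture them
inside the seqlock loop as the documentation describes.

    static inline uint64_t rdtsc(void)
    {
            uint32_t low, high;

            asm volatile("rdtsc" : "=a" (low), "=d" (high));
            return low | ((uint64_t)high) << 32;
    }

    /* Scale count by the enabled/running ratio, refined with a TSC delta. */
    static uint64_t scale_count(volatile struct perf_event_mmap_page *pc,
                                uint64_t count, uint64_t enabled,
                                uint64_t running, uint32_t idx)
    {
            if (pc->cap_usr_time && enabled != running) {
                    uint64_t cyc = rdtsc();
                    uint64_t quot = cyc >> pc->time_shift;
                    uint64_t rem = cyc & (((uint64_t)1 << pc->time_shift) - 1);
                    uint64_t delta = pc->time_offset + quot * pc->time_mult +
                                     ((rem * pc->time_mult) >> pc->time_shift);

                    enabled += delta;
                    if (idx)
                            running += delta;
            }

            if (running) {
                    uint64_t quot = count / running;
                    uint64_t rem = count % running;

                    count = quot * enabled + (rem * enabled) / running;
            }

            return count;
    }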