Commit 7c9903c9, authored by Peter Zijlstra, committed by Ingo Molnar

x86/perf, static_call: Optimize x86_pmu methods

Replace many of the indirect calls with static_call().

The average PMI time, as measured by perf_sample_event_took()*:

  PRE:    3283.03 [ns]
  POST:   3145.12 [ns]

Which is a ~138 [ns] win per PMI, or a ~4.2% decrease.

[*] on an IVB-EP, using: 'perf record -a -e cycles -- make O=defconfig-build/ -j80'
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: https://lore.kernel.org/r/20200818135805.338001015@infradead.org
parent d25e37d8
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include <linux/bitops.h> #include <linux/bitops.h>
#include <linux/device.h> #include <linux/device.h>
#include <linux/nospec.h> #include <linux/nospec.h>
#include <linux/static_call.h>
#include <asm/apic.h> #include <asm/apic.h>
#include <asm/stacktrace.h> #include <asm/stacktrace.h>
...@@ -52,6 +53,34 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { ...@@ -52,6 +53,34 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key); DEFINE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key); DEFINE_STATIC_KEY_FALSE(rdpmc_always_available_key);
/*
* This here uses DEFINE_STATIC_CALL_NULL() to get a static_call defined
* from just a typename, as opposed to an actual function.
*
* The second argument of each definition dereferences the matching x86_pmu
* function-pointer member, so every static_call takes its type from that
* member. x86_pmu_static_call_update() later points each static_call at the
* function stored in the corresponding member.
*/
DEFINE_STATIC_CALL_NULL(x86_pmu_handle_irq, *x86_pmu.handle_irq);
DEFINE_STATIC_CALL_NULL(x86_pmu_disable_all, *x86_pmu.disable_all);
DEFINE_STATIC_CALL_NULL(x86_pmu_enable_all, *x86_pmu.enable_all);
DEFINE_STATIC_CALL_NULL(x86_pmu_enable, *x86_pmu.enable);
DEFINE_STATIC_CALL_NULL(x86_pmu_disable, *x86_pmu.disable);
DEFINE_STATIC_CALL_NULL(x86_pmu_add, *x86_pmu.add);
DEFINE_STATIC_CALL_NULL(x86_pmu_del, *x86_pmu.del);
DEFINE_STATIC_CALL_NULL(x86_pmu_read, *x86_pmu.read);
DEFINE_STATIC_CALL_NULL(x86_pmu_schedule_events, *x86_pmu.schedule_events);
DEFINE_STATIC_CALL_NULL(x86_pmu_get_event_constraints, *x86_pmu.get_event_constraints);
DEFINE_STATIC_CALL_NULL(x86_pmu_put_event_constraints, *x86_pmu.put_event_constraints);
DEFINE_STATIC_CALL_NULL(x86_pmu_start_scheduling, *x86_pmu.start_scheduling);
DEFINE_STATIC_CALL_NULL(x86_pmu_commit_scheduling, *x86_pmu.commit_scheduling);
DEFINE_STATIC_CALL_NULL(x86_pmu_stop_scheduling, *x86_pmu.stop_scheduling);
DEFINE_STATIC_CALL_NULL(x86_pmu_sched_task, *x86_pmu.sched_task);
DEFINE_STATIC_CALL_NULL(x86_pmu_swap_task_ctx, *x86_pmu.swap_task_ctx);
DEFINE_STATIC_CALL_NULL(x86_pmu_drain_pebs, *x86_pmu.drain_pebs);
DEFINE_STATIC_CALL_NULL(x86_pmu_pebs_aliases, *x86_pmu.pebs_aliases);
u64 __read_mostly hw_cache_event_ids u64 __read_mostly hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_OP_MAX]
...@@ -660,7 +689,7 @@ static void x86_pmu_disable(struct pmu *pmu) ...@@ -660,7 +689,7 @@ static void x86_pmu_disable(struct pmu *pmu)
cpuc->enabled = 0; cpuc->enabled = 0;
barrier(); barrier();
x86_pmu.disable_all(); static_call(x86_pmu_disable_all)();
} }
void x86_pmu_enable_all(int added) void x86_pmu_enable_all(int added)
...@@ -907,8 +936,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) ...@@ -907,8 +936,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
if (cpuc->txn_flags & PERF_PMU_TXN_ADD) if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
n0 -= cpuc->n_txn; n0 -= cpuc->n_txn;
if (x86_pmu.start_scheduling) static_call_cond(x86_pmu_start_scheduling)(cpuc);
x86_pmu.start_scheduling(cpuc);
for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
c = cpuc->event_constraint[i]; c = cpuc->event_constraint[i];
...@@ -925,7 +953,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) ...@@ -925,7 +953,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
* change due to external factors (sibling state, allow_tfa). * change due to external factors (sibling state, allow_tfa).
*/ */
if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) { if (!c || (c->flags & PERF_X86_EVENT_DYNAMIC)) {
c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]); c = static_call(x86_pmu_get_event_constraints)(cpuc, i, cpuc->event_list[i]);
cpuc->event_constraint[i] = c; cpuc->event_constraint[i] = c;
} }
...@@ -1008,8 +1036,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) ...@@ -1008,8 +1036,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
if (!unsched && assign) { if (!unsched && assign) {
for (i = 0; i < n; i++) { for (i = 0; i < n; i++) {
e = cpuc->event_list[i]; e = cpuc->event_list[i];
if (x86_pmu.commit_scheduling) static_call_cond(x86_pmu_commit_scheduling)(cpuc, i, assign[i]);
x86_pmu.commit_scheduling(cpuc, i, assign[i]);
} }
} else { } else {
for (i = n0; i < n; i++) { for (i = n0; i < n; i++) {
...@@ -1018,15 +1045,13 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) ...@@ -1018,15 +1045,13 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
/* /*
* release events that failed scheduling * release events that failed scheduling
*/ */
if (x86_pmu.put_event_constraints) static_call_cond(x86_pmu_put_event_constraints)(cpuc, e);
x86_pmu.put_event_constraints(cpuc, e);
cpuc->event_constraint[i] = NULL; cpuc->event_constraint[i] = NULL;
} }
} }
if (x86_pmu.stop_scheduling) static_call_cond(x86_pmu_stop_scheduling)(cpuc);
x86_pmu.stop_scheduling(cpuc);
return unsched ? -EINVAL : 0; return unsched ? -EINVAL : 0;
} }
...@@ -1226,7 +1251,7 @@ static void x86_pmu_enable(struct pmu *pmu) ...@@ -1226,7 +1251,7 @@ static void x86_pmu_enable(struct pmu *pmu)
cpuc->enabled = 1; cpuc->enabled = 1;
barrier(); barrier();
x86_pmu.enable_all(added); static_call(x86_pmu_enable_all)(added);
} }
static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
...@@ -1347,7 +1372,7 @@ static int x86_pmu_add(struct perf_event *event, int flags) ...@@ -1347,7 +1372,7 @@ static int x86_pmu_add(struct perf_event *event, int flags)
if (cpuc->txn_flags & PERF_PMU_TXN_ADD) if (cpuc->txn_flags & PERF_PMU_TXN_ADD)
goto done_collect; goto done_collect;
ret = x86_pmu.schedule_events(cpuc, n, assign); ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
if (ret) if (ret)
goto out; goto out;
/* /*
...@@ -1365,13 +1390,11 @@ static int x86_pmu_add(struct perf_event *event, int flags) ...@@ -1365,13 +1390,11 @@ static int x86_pmu_add(struct perf_event *event, int flags)
cpuc->n_added += n - n0; cpuc->n_added += n - n0;
cpuc->n_txn += n - n0; cpuc->n_txn += n - n0;
if (x86_pmu.add) {
/* /*
* This is before x86_pmu_enable() will call x86_pmu_start(), * This is before x86_pmu_enable() will call x86_pmu_start(),
* so we enable LBRs before an event needs them etc.. * so we enable LBRs before an event needs them etc..
*/ */
x86_pmu.add(event); static_call_cond(x86_pmu_add)(event);
}
ret = 0; ret = 0;
out: out:
...@@ -1399,7 +1422,7 @@ static void x86_pmu_start(struct perf_event *event, int flags) ...@@ -1399,7 +1422,7 @@ static void x86_pmu_start(struct perf_event *event, int flags)
cpuc->events[idx] = event; cpuc->events[idx] = event;
__set_bit(idx, cpuc->active_mask); __set_bit(idx, cpuc->active_mask);
__set_bit(idx, cpuc->running); __set_bit(idx, cpuc->running);
x86_pmu.enable(event); static_call(x86_pmu_enable)(event);
perf_event_update_userpage(event); perf_event_update_userpage(event);
} }
...@@ -1469,7 +1492,7 @@ void x86_pmu_stop(struct perf_event *event, int flags) ...@@ -1469,7 +1492,7 @@ void x86_pmu_stop(struct perf_event *event, int flags)
struct hw_perf_event *hwc = &event->hw; struct hw_perf_event *hwc = &event->hw;
if (test_bit(hwc->idx, cpuc->active_mask)) { if (test_bit(hwc->idx, cpuc->active_mask)) {
x86_pmu.disable(event); static_call(x86_pmu_disable)(event);
__clear_bit(hwc->idx, cpuc->active_mask); __clear_bit(hwc->idx, cpuc->active_mask);
cpuc->events[hwc->idx] = NULL; cpuc->events[hwc->idx] = NULL;
WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED); WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
...@@ -1519,8 +1542,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) ...@@ -1519,8 +1542,7 @@ static void x86_pmu_del(struct perf_event *event, int flags)
if (i >= cpuc->n_events - cpuc->n_added) if (i >= cpuc->n_events - cpuc->n_added)
--cpuc->n_added; --cpuc->n_added;
if (x86_pmu.put_event_constraints) static_call_cond(x86_pmu_put_event_constraints)(cpuc, event);
x86_pmu.put_event_constraints(cpuc, event);
/* Delete the array entry. */ /* Delete the array entry. */
while (++i < cpuc->n_events) { while (++i < cpuc->n_events) {
...@@ -1533,13 +1555,12 @@ static void x86_pmu_del(struct perf_event *event, int flags) ...@@ -1533,13 +1555,12 @@ static void x86_pmu_del(struct perf_event *event, int flags)
perf_event_update_userpage(event); perf_event_update_userpage(event);
do_del: do_del:
if (x86_pmu.del) {
/* /*
* This is after x86_pmu_stop(); so we disable LBRs after any * This is after x86_pmu_stop(); so we disable LBRs after any
* event can need them etc.. * event can need them etc..
*/ */
x86_pmu.del(event); static_call_cond(x86_pmu_del)(event);
}
} }
int x86_pmu_handle_irq(struct pt_regs *regs) int x86_pmu_handle_irq(struct pt_regs *regs)
...@@ -1617,7 +1638,7 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs) ...@@ -1617,7 +1638,7 @@ perf_event_nmi_handler(unsigned int cmd, struct pt_regs *regs)
return NMI_DONE; return NMI_DONE;
start_clock = sched_clock(); start_clock = sched_clock();
ret = x86_pmu.handle_irq(regs); ret = static_call(x86_pmu_handle_irq)(regs);
finish_clock = sched_clock(); finish_clock = sched_clock();
perf_sample_event_took(finish_clock - start_clock); perf_sample_event_took(finish_clock - start_clock);
...@@ -1830,6 +1851,38 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event) ...@@ -1830,6 +1851,38 @@ ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event)
static struct attribute_group x86_pmu_attr_group; static struct attribute_group x86_pmu_attr_group;
static struct attribute_group x86_pmu_caps_group; static struct attribute_group x86_pmu_caps_group;
/*
* Re-target every x86_pmu_* static_call to the function pointer that the
* matching x86_pmu member holds at the time of the call.
*
* init_hw_perf_events() calls this after it has installed a default for
* x86_pmu.read. Other members are passed through unchecked; call sites of
* methods that used to be guarded by a NULL test (e.g. start_scheduling,
* put_event_constraints, add, del) use static_call_cond().
*/
static void x86_pmu_static_call_update(void)
{
static_call_update(x86_pmu_handle_irq, x86_pmu.handle_irq);
static_call_update(x86_pmu_disable_all, x86_pmu.disable_all);
static_call_update(x86_pmu_enable_all, x86_pmu.enable_all);
static_call_update(x86_pmu_enable, x86_pmu.enable);
static_call_update(x86_pmu_disable, x86_pmu.disable);
static_call_update(x86_pmu_add, x86_pmu.add);
static_call_update(x86_pmu_del, x86_pmu.del);
static_call_update(x86_pmu_read, x86_pmu.read);
static_call_update(x86_pmu_schedule_events, x86_pmu.schedule_events);
static_call_update(x86_pmu_get_event_constraints, x86_pmu.get_event_constraints);
static_call_update(x86_pmu_put_event_constraints, x86_pmu.put_event_constraints);
static_call_update(x86_pmu_start_scheduling, x86_pmu.start_scheduling);
static_call_update(x86_pmu_commit_scheduling, x86_pmu.commit_scheduling);
static_call_update(x86_pmu_stop_scheduling, x86_pmu.stop_scheduling);
static_call_update(x86_pmu_sched_task, x86_pmu.sched_task);
static_call_update(x86_pmu_swap_task_ctx, x86_pmu.swap_task_ctx);
static_call_update(x86_pmu_drain_pebs, x86_pmu.drain_pebs);
static_call_update(x86_pmu_pebs_aliases, x86_pmu.pebs_aliases);
}
/*
* Default x86_pmu.read implementation: just calls x86_perf_event_update().
*
* init_hw_perf_events() installs it when x86_pmu.read is unset, so that
* x86_pmu.read is non-NULL before x86_pmu_static_call_update() runs and
* x86_pmu_read() can use an unconditional static_call().
*/
static void _x86_pmu_read(struct perf_event *event)
{
x86_perf_event_update(event);
}
static int __init init_hw_perf_events(void) static int __init init_hw_perf_events(void)
{ {
struct x86_pmu_quirk *quirk; struct x86_pmu_quirk *quirk;
...@@ -1898,6 +1951,11 @@ static int __init init_hw_perf_events(void) ...@@ -1898,6 +1951,11 @@ static int __init init_hw_perf_events(void)
pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
if (!x86_pmu.read)
x86_pmu.read = _x86_pmu_read;
x86_pmu_static_call_update();
/* /*
* Install callbacks. Core will call them for each online * Install callbacks. Core will call them for each online
* cpu. * cpu.
...@@ -1934,11 +1992,9 @@ static int __init init_hw_perf_events(void) ...@@ -1934,11 +1992,9 @@ static int __init init_hw_perf_events(void)
} }
early_initcall(init_hw_perf_events); early_initcall(init_hw_perf_events);
static inline void x86_pmu_read(struct perf_event *event) static void x86_pmu_read(struct perf_event *event)
{ {
if (x86_pmu.read) static_call(x86_pmu_read)(event);
return x86_pmu.read(event);
x86_perf_event_update(event);
} }
/* /*
...@@ -2015,7 +2071,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu) ...@@ -2015,7 +2071,7 @@ static int x86_pmu_commit_txn(struct pmu *pmu)
if (!x86_pmu_initialized()) if (!x86_pmu_initialized())
return -EAGAIN; return -EAGAIN;
ret = x86_pmu.schedule_events(cpuc, n, assign); ret = static_call(x86_pmu_schedule_events)(cpuc, n, assign);
if (ret) if (ret)
return ret; return ret;
...@@ -2308,15 +2364,13 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { ...@@ -2308,15 +2364,13 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
{ {
if (x86_pmu.sched_task) static_call_cond(x86_pmu_sched_task)(ctx, sched_in);
x86_pmu.sched_task(ctx, sched_in);
} }
static void x86_pmu_swap_task_ctx(struct perf_event_context *prev, static void x86_pmu_swap_task_ctx(struct perf_event_context *prev,
struct perf_event_context *next) struct perf_event_context *next)
{ {
if (x86_pmu.swap_task_ctx) static_call_cond(x86_pmu_swap_task_ctx)(prev, next);
x86_pmu.swap_task_ctx(prev, next);
} }
void perf_check_microcode(void) void perf_check_microcode(void)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment