/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licensing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/cpu.h>
#include <linux/bitops.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/compat.h>
#include <asm/smp.h>
#include <asm/alternative.h>

#if 0
#undef wrmsrl
#define wrmsrl(msr, val) 					\
do {								\
	trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
			(unsigned long)(val));			\
	native_write_msr((msr), (u32)((u64)(val)), 		\
			(u32)((u64)(val) >> 32));		\
} while (0)
#endif

/*
 * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
 */
static unsigned long
copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
{
	unsigned long offset, addr = (unsigned long)from;
	unsigned long size, len = 0;
	struct page *page;
	void *map;
	int ret;

	do {
		ret = __get_user_pages_fast(addr, 1, 0, &page);
		if (!ret)
			break;

		offset = addr & (PAGE_SIZE - 1);
		size = min(PAGE_SIZE - offset, n - len);

		map = kmap_atomic(page);
		memcpy(to, map+offset, size);
		kunmap_atomic(map);
		put_page(page);

		len  += size;
		to   += size;
		addr += size;

	} while (len < n);

	return len;
}
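
/*
 * A short return value from copy_from_user_nmi() means the walk hit an
 * unmapped page; callers such as perf_callchain_user() below simply stop
 * at that point, roughly:
 *
 *	if (copy_from_user_nmi(&frame, fp, sizeof(frame)) != sizeof(frame))
 *		break;
 */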

struct event_constraint {
	union {
		unsigned long	idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
		u64		idxmsk64;
	};
	u64	code;
	u64	cmask;
	int	weight;
};

struct amd_nb {
	int nb_id;  /* NorthBridge id */
	int refcnt; /* reference count */
	struct perf_event *owners[X86_PMC_IDX_MAX];
	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
};

struct intel_percore;

#define MAX_LBR_ENTRIES		16

struct cpu_hw_events {
	/*
	 * Generic x86 PMC bits
	 */
	struct perf_event	*events[X86_PMC_IDX_MAX]; /* in counter order */
	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	unsigned long		running[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	int			enabled;

	int			n_events;
	int			n_added;
	int			n_txn;
	int			assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
	u64			tags[X86_PMC_IDX_MAX];
	struct perf_event	*event_list[X86_PMC_IDX_MAX]; /* in enabled order */

	unsigned int		group_flag;

	/*
	 * Intel DebugStore bits
	 */
	struct debug_store	*ds;
	u64			pebs_enabled;

	/*
	 * Intel LBR bits
	 */
	int				lbr_users;
	void				*lbr_context;
	struct perf_branch_stack	lbr_stack;
	struct perf_branch_entry	lbr_entries[MAX_LBR_ENTRIES];

	/*
	 * Intel percore register state.
	 * Coordinate shared resources between HT threads.
	 */
	int				percore_used; /* Used by this CPU? */
	struct intel_percore		*per_core;

	/*
	 * AMD specific bits
	 */
	struct amd_nb		*amd_nb;
};

#define __EVENT_CONSTRAINT(c, n, m, w) {\
	{ .idxmsk64 = (n) },		\
	.code = (c),			\
	.cmask = (m),			\
	.weight = (w),			\
}

#define EVENT_CONSTRAINT(c, n, m)	\
	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))

/*
 * Constraint on the Event code.
 */
#define INTEL_EVENT_CONSTRAINT(c, n)	\
	EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)

/*
 * Constraint on the Event code + UMask + fixed-mask
 *
 * filter mask to validate fixed counter events.
 * the following filters disqualify for fixed counters:
 *  - inv
 *  - edge
 *  - cnt-mask
 *  The other filters are supported by fixed counters.
 *  The any-thread option is supported starting with v3.
 */
#define FIXED_EVENT_CONSTRAINT(c, n)	\
	EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
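
/*
 * Illustrative expansion (fixed counter n lives at index
 * X86_PMC_IDX_FIXED + n = 32 + n in this scheme): e.g.
 * FIXED_EVENT_CONSTRAINT(0x00c0, 0) yields a constraint whose idxmsk64
 * has only bit 32 set and whose weight is 1, i.e. the event may be
 * scheduled on fixed counter 0 and nowhere else.
 */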

/*
 * Constraint on the Event code + UMask
 */
#define INTEL_UEVENT_CONSTRAINT(c, n)	\
	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)

#define EVENT_CONSTRAINT_END		\
	EVENT_CONSTRAINT(0, 0, 0)

#define for_each_event_constraint(e, c)	\
	for ((e) = (c); (e)->weight; (e)++)

/*
 * Extra registers for specific events.
 * Some events need large masks and require external MSRs.
 * Define a mapping to these extra registers.
 */
struct extra_reg {
	unsigned int		event;
	unsigned int		msr;
	u64			config_mask;
	u64			valid_mask;
};

#define EVENT_EXTRA_REG(e, ms, m, vm) {	\
	.event = (e),		\
	.msr = (ms),		\
	.config_mask = (m),	\
	.valid_mask = (vm),	\
	}
#define INTEL_EVENT_EXTRA_REG(event, msr, vm)	\
	EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
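
/*
 * Sketch of how a table built from these macros is meant to look (the
 * event code and MSR below are illustrative, not taken from this file):
 *
 *	static struct extra_reg intel_foo_extra_regs[] = {
 *		INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
 *		EVENT_EXTRA_END
 *	};
 *
 * x86_pmu_extra_regs() below walks such a table and, on a matching event
 * code, copies attr->config1 into hw.extra_config after checking it
 * against valid_mask.
 */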

union perf_capabilities {
	struct {
		u64	lbr_format    : 6;
		u64	pebs_trap     : 1;
		u64	pebs_arch_reg : 1;
		u64	pebs_format   : 4;
		u64	smm_freeze    : 1;
	};
	u64	capabilities;
};
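/*
 * The bit-field view above is meant to mirror the layout of the
 * IA32_PERF_CAPABILITIES MSR on Intel CPUs; the u64 member is intended to
 * be filled with the raw MSR value by the Intel setup code included below.
 */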

/*
 * struct x86_pmu - generic x86 pmu
 */
struct x86_pmu {
	/*
	 * Generic x86 PMC bits
	 */
	const char	*name;
	int		version;
	int		(*handle_irq)(struct pt_regs *);
	void		(*disable_all)(void);
	void		(*enable_all)(int added);
	void		(*enable)(struct perf_event *);
	void		(*disable)(struct perf_event *);
	int		(*hw_config)(struct perf_event *event);
	int		(*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
	unsigned	eventsel;
	unsigned	perfctr;
	u64		(*event_map)(int);
	int		max_events;
	int		num_counters;
	int		num_counters_fixed;
	int		cntval_bits;
	u64		cntval_mask;
	int		apic;
	u64		max_period;
	struct event_constraint *
			(*get_event_constraints)(struct cpu_hw_events *cpuc,
						 struct perf_event *event);

	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
						 struct perf_event *event);
	struct event_constraint *event_constraints;
	struct event_constraint *percore_constraints;
	void		(*quirks)(void);
	int		perfctr_second_write;

	int		(*cpu_prepare)(int cpu);
	void		(*cpu_starting)(int cpu);
	void		(*cpu_dying)(int cpu);
	void		(*cpu_dead)(int cpu);

	/*
	 * Intel Arch Perfmon v2+
	 */
	u64			intel_ctrl;
	union perf_capabilities intel_cap;

	/*
	 * Intel DebugStore bits
	 */
	int		bts, pebs;
	int		bts_active, pebs_active;
	int		pebs_record_size;
	void		(*drain_pebs)(struct pt_regs *regs);
	struct event_constraint *pebs_constraints;

	/*
	 * Intel LBR
	 */
	unsigned long	lbr_tos, lbr_from, lbr_to; /* MSR base regs       */
	int		lbr_nr;			   /* hardware stack size */

	/*
	 * Extra registers for events
	 */
	struct extra_reg *extra_regs;
};

static struct x86_pmu x86_pmu __read_mostly;

static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

static int x86_perf_event_set_period(struct perf_event *event);

/*
 * Generalized hw caching related hw_event table, filled
 * in on a per model basis. A value of 0 means
 * 'not supported', -1 means 'hw_event makes no sense on
 * this CPU', any other value means the raw hw_event
 * ID.
 */

#define C(x) PERF_COUNT_HW_CACHE_##x

static u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
static u64 __read_mostly hw_cache_extra_regs
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
static u64
x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	s64 delta;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(hwc->event_base, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}
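
/*
 * Worked example of the shift trick above, assuming 48-bit counters
 * (shift = 16): if prev_raw_count was 0xffffffffffff (about to wrap) and
 * new_raw_count reads back as 0x5 after the wrap, then
 * (new << 16) - (prev << 16) evaluates to 0x60000 and the arithmetic right
 * shift by 16 gives a delta of 6, exactly the number of increments across
 * the wrap, with no special-casing needed.
 */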

static inline int x86_pmu_addr_offset(int index)
{
	int offset;

	/* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
	alternative_io(ASM_NOP2,
		       "shll $1, %%eax",
		       X86_FEATURE_PERFCTR_CORE,
		       "=a" (offset),
		       "a"  (index));

	return offset;
}

static inline unsigned int x86_pmu_config_addr(int index)
{
	return x86_pmu.eventsel + x86_pmu_addr_offset(index);
}

static inline unsigned int x86_pmu_event_addr(int index)
{
	return x86_pmu.perfctr + x86_pmu_addr_offset(index);
}

/*
 * Find and validate any extra registers to set up.
 */
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
	struct extra_reg *er;

	event->hw.extra_reg = 0;
	event->hw.extra_config = 0;

	if (!x86_pmu.extra_regs)
		return 0;

	for (er = x86_pmu.extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (event->attr.config1 & ~er->valid_mask)
			return -EINVAL;
		event->hw.extra_reg = er->msr;
		event->hw.extra_config = event->attr.config1;
		break;
	}
	return 0;
}

static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

static bool reserve_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu_config_addr(i));

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu_event_addr(i));

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu_event_addr(i));
		release_evntsel_nmi(x86_pmu_config_addr(i));
	}
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif

static bool check_hw_exists(void)
{
	u64 val, val_new = 0;
	int i, reg, ret = 0;

	/*
	 * Check to see if the BIOS enabled any of the counters, if so
	 * complain and bail.
	 */
	for (i = 0; i < x86_pmu.num_counters; i++) {
		reg = x86_pmu_config_addr(i);
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
			goto bios_fail;
	}

	if (x86_pmu.num_counters_fixed) {
		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
			if (val & (0x03 << i*4))
				goto bios_fail;
		}
	}

	/*
	 * Now write a value and read it back to see if it matches,
	 * this is needed to detect certain hardware emulators (qemu/kvm)
	 * that don't trap on the MSR access and always return 0s.
	 */
	val = 0xabcdUL;
	ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
	ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
	if (ret || val != val_new)
		goto msr_fail;

	return true;

bios_fail:
	/*
	 * We still allow the PMU driver to operate:
	 */
	printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
	printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);

	return true;

msr_fail:
	printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");

	return false;
}

static void reserve_ds_buffers(void);
static void release_ds_buffers(void);

static void hw_perf_event_destroy(struct perf_event *event)
{
	if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;
	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
	return x86_pmu_extra_regs(val, event);
}
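
/*
 * Example of the attr->config layout decoded above: an L1D read-miss cache
 * event is encoded as
 *
 *	PERF_COUNT_HW_CACHE_L1D |
 *	(PERF_COUNT_HW_CACHE_OP_READ << 8) |
 *	(PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
 *
 * and is mapped through hw_cache_event_ids[] to the model-specific raw
 * event programmed into the counter.
 */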

static int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!is_sampling_event(event)) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	} else {
		/*
		 * If we have a PMU initialized but no APIC
		 * interrupts, we cannot sample hardware
		 * events (user-space has to fall back and
		 * sample via a hrtimer based software event):
		 */
		if (!x86_pmu.apic)
			return -EOPNOTSUPP;
	}

	/*
	 * Do not allow config1 (extended registers) to propagate,
	 * there's no sane user-space generalization yet:
	 */
	if (attr->type == PERF_TYPE_RAW)
		return 0;

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, event);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
	if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
	    !attr->freq && hwc->sample_period == 1) {
		/* BTS is not supported by this architecture. */
		if (!x86_pmu.bts_active)
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (!attr->exclude_kernel)
			return -EOPNOTSUPP;
	}

	hwc->config |= config;

	return 0;
}

static int x86_pmu_hw_config(struct perf_event *event)
{
	if (event->attr.precise_ip) {
		int precise = 0;

		/* Support for constant skid */
		if (x86_pmu.pebs_active) {
			precise++;

			/* Support for IP fixup */
			if (x86_pmu.lbr_nr)
				precise++;
		}

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;
	}

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == PERF_TYPE_RAW)
		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;

	return x86_setup_perfctr(event);
}
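
/*
 * In other words (a sketch of the policy above): attr.precise_ip == 1 needs
 * PEBS, attr.precise_ip == 2 additionally needs LBR so the skid on the
 * reported IP can be fixed up, and anything beyond that is rejected here
 * with -EOPNOTSUPP.
 */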

/*
 * Setup the hardware configuration for a given attr_type
 */
static int __x86_pmu_event_init(struct perf_event *event)
{
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = 0;
	if (!atomic_inc_not_zero(&active_events)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&active_events) == 0) {
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else
				reserve_ds_buffers();
		}
		if (!err)
			atomic_inc(&active_events);
		mutex_unlock(&pmc_reserve_mutex);
	}
	if (err)
		return err;

	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	return x86_pmu.hw_config(event);
}

static void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu_config_addr(idx), val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu_config_addr(idx), val);
	}
}

static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}

static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
					  u64 enable_mask)
{
	if (hwc->extra_reg)
		wrmsrl(hwc->extra_reg, hwc->extra_config);
	wrmsrl(hwc->config_base, hwc->config | enable_mask);
}

static void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
	}
}

static struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	int i, j, w, wmax, num = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	for (i = 0; i < n; i++) {
		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
		constraints[i] = c;
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = constraints[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}
	if (i == n)
		goto done;

	/*
	 * begin slow path
	 */

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	/*
	 * weight = number of possible counters
	 *
	 * 1    = most constrained, only works on one counter
	 * wmax = least constrained, works on any counter
	 *
	 * assign events to counters starting with most
	 * constrained events.
	 */
	wmax = x86_pmu.num_counters;

	/*
	 * when fixed event counters are present,
	 * wmax is incremented by 1 to account
	 * for one more choice
	 */
	if (x86_pmu.num_counters_fixed)
		wmax++;

	for (w = 1, num = n; num && w <= wmax; w++) {
		/* for each event */
		for (i = 0; num && i < n; i++) {
			c = constraints[i];
			hwc = &cpuc->event_list[i]->hw;

			if (c->weight != w)
				continue;

			for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
				if (!test_bit(j, used_mask))
					break;
			}

			if (j == X86_PMC_IDX_MAX)
				break;

			__set_bit(j, used_mask);

			if (assign)
				assign[i] = j;
			num--;
		}
	}
done:
	/*
	 * scheduling failed or is just a simulation,
	 * free resources if necessary
	 */
	if (!assign || num) {
		for (i = 0; i < n; i++) {
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
		}
	}
	return num ? -ENOSPC : 0;
}
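
/*
 * Worked example of the weight ordering above: with two collected events,
 * one constrained to counter 0 only (weight 1) and one schedulable on any
 * of four generic counters (weight 4), the w == 1 pass places the
 * constrained event on counter 0 before the w == 4 pass lets the flexible
 * event take any remaining counter, so the constrained event can never be
 * crowded out by the flexible one.
 */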

/*
 * dogrp: true if must collect siblings events (group)
 * returns total number of events and error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -ENOSPC;
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -ENOSPC;

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];

	if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
		hwc->config_base = 0;
		hwc->event_base	= 0;
	} else if (hwc->idx >= X86_PMC_IDX_FIXED) {
		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
	} else {
		hwc->config_base = x86_pmu_config_addr(hwc->idx);
		hwc->event_base  = x86_pmu_event_addr(hwc->idx);
	}
}

static inline int match_prev_assignment(struct hw_perf_event *hwc,
					struct cpu_hw_events *cpuc,
					int i)
{
	return hwc->idx == cpuc->assign[i] &&
		hwc->last_cpu == smp_processor_id() &&
		hwc->last_tag == cpuc->tags[i];
}

static void x86_pmu_start(struct perf_event *event, int flags);
static void x86_pmu_stop(struct perf_event *event, int flags);

static void x86_pmu_enable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct perf_event *event;
	struct hw_perf_event *hwc;
	int i, added = cpuc->n_added;

	if (!x86_pmu_initialized())
		return;

	if (cpuc->enabled)
		return;

	if (cpuc->n_added) {
		int n_running = cpuc->n_events - cpuc->n_added;
		/*
		 * apply assignment obtained either from
		 * hw_perf_group_sched_in() or x86_pmu_enable()
		 *
		 * step1: save events moving to new counters
		 * step2: reprogram moved events into new counters
		 */
		for (i = 0; i < n_running; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			/*
			 * we can avoid reprogramming counter if:
			 * - assigned same counter as last time
			 * - running on same CPU as last time
			 * - no other event has used the counter since
			 */
			if (hwc->idx == -1 ||
			    match_prev_assignment(hwc, cpuc, i))
				continue;

			/*
			 * Ensure we don't accidentally enable a stopped
			 * counter simply because we rescheduled.
			 */
			if (hwc->state & PERF_HES_STOPPED)
				hwc->state |= PERF_HES_ARCH;

			x86_pmu_stop(event, PERF_EF_UPDATE);
		}

		for (i = 0; i < cpuc->n_events; i++) {
			event = cpuc->event_list[i];
			hwc = &event->hw;

			if (!match_prev_assignment(hwc, cpuc, i))
				x86_assign_hw_event(event, cpuc, i);
			else if (i < n_running)
				continue;

			if (hwc->state & PERF_HES_ARCH)
				continue;

			x86_pmu_start(event, PERF_EF_RELOAD);
		}
		cpuc->n_added = 0;
		perf_events_lapic_init();
	}

	cpuc->enabled = 1;
	barrier();

	x86_pmu.enable_all(added);
}

static inline void x86_pmu_disable_event(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;

	wrmsrl(hwc->config_base, hwc->config);
}

static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);

/*
 * Set the next IRQ period, based on the hwc->period_left value.
 * To be called with the event disabled in hw:
 */
static int
x86_perf_event_set_period(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	s64 left = local64_read(&hwc->period_left);
	s64 period = hwc->sample_period;
	int ret = 0, idx = hwc->idx;

	if (idx == X86_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * If we are way outside a reasonable range then just skip forward:
	 */
	if (unlikely(left <= -period)) {
		left = period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}

	if (unlikely(left <= 0)) {
		left += period;
		local64_set(&hwc->period_left, left);
		hwc->last_period = period;
		ret = 1;
	}
	/*
	 * Quirk: certain CPUs don't like it if just 1 hw_event is left:
	 */
	if (unlikely(left < 2))
		left = 2;

	if (left > x86_pmu.max_period)
		left = x86_pmu.max_period;

	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;

	/*
	 * The hw event starts counting from this event offset,
	 * mark it to be able to extract future deltas:
	 */
	local64_set(&hwc->prev_count, (u64)-left);

	wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);

	/*
	 * Due to an erratum on certain CPUs we need
	 * a second write to be sure the register
	 * is updated properly
	 */
	if (x86_pmu.perfctr_second_write) {
		wrmsrl(hwc->event_base,
			(u64)(-left) & x86_pmu.cntval_mask);
	}

	perf_event_update_userpage(event);

	return ret;
}
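
/*
 * Example: with hwc->sample_period = 100000, the counter is programmed to
 * (-100000 & cntval_mask), so it overflows and raises the PMI after
 * 100000 increments; the overflow handler then calls back in here to
 * rearm it for the next period.
 */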

static void x86_pmu_enable_event(struct perf_event *event)
{
	if (__this_cpu_read(cpu_hw_events.enabled))
		__x86_pmu_enable_event(&event->hw,
				       ARCH_PERFMON_EVENTSEL_ENABLE);
}

/*
 * Add a single event to the PMU.
 *
 * The event is added to the group of enabled events
 * but only if it can be scheduled with existing events.
 */
static int x86_pmu_add(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc;
	int assign[X86_PMC_IDX_MAX];
	int n, n0, ret;

	hwc = &event->hw;

	perf_pmu_disable(event->pmu);
	n0 = cpuc->n_events;
	ret = n = collect_events(cpuc, event, false);
	if (ret < 0)
		goto out;

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
	if (!(flags & PERF_EF_START))
		hwc->state |= PERF_HES_ARCH;

	/*
	 * If group events scheduling transaction was started,
	 * skip the schedulability test here, it will be performed
	 * at commit time (->commit_txn) as a whole
	 */
	if (cpuc->group_flag & PERF_EVENT_TXN)
		goto done_collect;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		goto out;
	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

done_collect:
	cpuc->n_events = n;
	cpuc->n_added += n - n0;
	cpuc->n_txn += n - n0;

	ret = 0;
out:
	perf_pmu_enable(event->pmu);
	return ret;
}

static void x86_pmu_start(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int idx = event->hw.idx;

	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	if (WARN_ON_ONCE(idx == -1))
		return;

	if (flags & PERF_EF_RELOAD) {
		WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
		x86_perf_event_set_period(event);
	}

	event->hw.state = 0;

	cpuc->events[idx] = event;
	__set_bit(idx, cpuc->active_mask);
	__set_bit(idx, cpuc->running);
	x86_pmu.enable(event);
	perf_event_update_userpage(event);
}

void perf_event_print_debug(void)
{
	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
	u64 pebs;
	struct cpu_hw_events *cpuc;
	unsigned long flags;
	int cpu, idx;

	if (!x86_pmu.num_counters)
		return;

	local_irq_save(flags);

	cpu = smp_processor_id();
	cpuc = &per_cpu(cpu_hw_events, cpu);

	if (x86_pmu.version >= 2) {
		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);

		pr_info("\n");
		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
	}
	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
		rdmsrl(x86_pmu_event_addr(idx), pmc_count);

		prev_left = per_cpu(pmc_prev_left[idx], cpu);

		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
			cpu, idx, pmc_ctrl);
		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
			cpu, idx, prev_left);
	}
	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);

		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
			cpu, idx, pmc_count);
	}
	local_irq_restore(flags);
}

static void x86_pmu_stop(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;

	if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
		x86_pmu.disable(event);
		cpuc->events[hwc->idx] = NULL;
		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		x86_perf_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}
}

static void x86_pmu_del(struct perf_event *event, int flags)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int i;

	/*
	 * If we're called during a txn, we don't need to do anything.
	 * The events never got scheduled and ->cancel_txn will truncate
	 * the event_list.
	 */
	if (cpuc->group_flag & PERF_EVENT_TXN)
		return;

	x86_pmu_stop(event, PERF_EF_UPDATE);

	for (i = 0; i < cpuc->n_events; i++) {
		if (event == cpuc->event_list[i]) {

			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, event);

			while (++i < cpuc->n_events)
				cpuc->event_list[i-1] = cpuc->event_list[i];

			--cpuc->n_events;
			break;
		}
	}
	perf_event_update_userpage(event);
}

static int x86_pmu_handle_irq(struct pt_regs *regs)
{
	struct perf_sample_data data;
	struct cpu_hw_events *cpuc;
	struct perf_event *event;
	int idx, handled = 0;
	u64 val;

	perf_sample_data_init(&data, 0);

	cpuc = &__get_cpu_var(cpu_hw_events);

	/*
	 * Some chipsets need to unmask the LVTPC in a particular spot
	 * inside the nmi handler.  As a result, the unmasking was pushed
	 * into all the nmi handlers.
	 *
	 * This generic handler doesn't seem to have any issues where the
	 * unmasking occurs so it was left at the top.
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		if (!test_bit(idx, cpuc->active_mask)) {
			/*
			 * Though we deactivated the counter, some CPUs might
			 * still have interrupts from it in flight. Catch them:
			 */
			if (__test_and_clear_bit(idx, cpuc->running))
				handled++;
			continue;
		}

		event = cpuc->events[idx];

		val = x86_perf_event_update(event);
		if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
			continue;

		/*
		 * event overflow
		 */
		handled++;
		data.period	= event->hw.last_period;

		if (!x86_perf_event_set_period(event))
			continue;

		if (perf_event_overflow(event, 1, &data, regs))
			x86_pmu_stop(event, 0);
	}

	if (handled)
		inc_irq_stat(apic_perf_irqs);

	return handled;
}

void perf_events_lapic_init(void)
{
	if (!x86_pmu.apic || !x86_pmu_initialized())
		return;

	/*
	 * Always use NMI for PMU
	 */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
}

struct pmu_nmi_state {
	unsigned int	marked;
	int		handled;
};

static DEFINE_PER_CPU(struct pmu_nmi_state, pmu_nmi);

static int __kprobes
perf_event_nmi_handler(struct notifier_block *self,
			 unsigned long cmd, void *__args)
{
	struct die_args *args = __args;
	unsigned int this_nmi;
	int handled;

	if (!atomic_read(&active_events))
		return NOTIFY_DONE;

	switch (cmd) {
	case DIE_NMI:
		break;
	case DIE_NMIUNKNOWN:
		this_nmi = percpu_read(irq_stat.__nmi_count);
		if (this_nmi != __this_cpu_read(pmu_nmi.marked))
			/* let the kernel handle the unknown nmi */
			return NOTIFY_DONE;
		/*
		 * This one is a PMU back-to-back nmi. Two events
		 * trigger 'simultaneously' raising two back-to-back
		 * NMIs. If the first NMI handles both, the latter
		 * will be empty and daze the CPU. So, we drop it to
		 * avoid false-positive 'unknown nmi' messages.
		 */
		return NOTIFY_STOP;
	default:
		return NOTIFY_DONE;
	}

	handled = x86_pmu.handle_irq(args->regs);
	if (!handled)
		return NOTIFY_DONE;

	this_nmi = percpu_read(irq_stat.__nmi_count);
	if ((handled > 1) ||
		/* the next nmi could be a back-to-back nmi */
	    ((__this_cpu_read(pmu_nmi.marked) == this_nmi) &&
	     (__this_cpu_read(pmu_nmi.handled) > 1))) {
		/*
		 * We could have two subsequent back-to-back nmis: The
		 * first handles more than one counter, the 2nd
		 * handles only one counter and the 3rd handles no
		 * counter.
		 *
		 * This is the 2nd nmi because the previous was
		 * handling more than one counter. We will mark the
		 * next (3rd) and then drop it if unhandled.
		 */
		__this_cpu_write(pmu_nmi.marked, this_nmi + 1);
		__this_cpu_write(pmu_nmi.handled, handled);
	}

	return NOTIFY_STOP;
}

static __read_mostly struct notifier_block perf_event_nmi_notifier = {
	.notifier_call		= perf_event_nmi_handler,
	.next			= NULL,
	.priority		= NMI_LOCAL_LOW_PRIOR,
};

static struct event_constraint unconstrained;
static struct event_constraint emptyconstraint;

static struct event_constraint *
x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
{
	struct event_constraint *c;

	if (x86_pmu.event_constraints) {
		for_each_event_constraint(c, x86_pmu.event_constraints) {
			if ((event->hw.config & c->cmask) == c->code)
				return c;
		}
	}

	return &unconstrained;
}

#include "perf_event_amd.c"
#include "perf_event_p6.c"
#include "perf_event_p4.c"
#include "perf_event_intel_lbr.c"
#include "perf_event_intel_ds.c"
#include "perf_event_intel.c"

static int __cpuinit
x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
{
	unsigned int cpu = (long)hcpu;
	int ret = NOTIFY_OK;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		if (x86_pmu.cpu_prepare)
			ret = x86_pmu.cpu_prepare(cpu);
		break;

	case CPU_STARTING:
		if (x86_pmu.cpu_starting)
			x86_pmu.cpu_starting(cpu);
		break;

	case CPU_DYING:
		if (x86_pmu.cpu_dying)
			x86_pmu.cpu_dying(cpu);
		break;

	case CPU_UP_CANCELED:
	case CPU_DEAD:
		if (x86_pmu.cpu_dead)
			x86_pmu.cpu_dead(cpu);
		break;

	default:
		break;
	}

	return ret;
}

static void __init pmu_check_apic(void)
{
	if (cpu_has_apic)
		return;

	x86_pmu.apic = 0;
	pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
	pr_info("no hardware sampling interrupt available.\n");
}

static int __init init_hw_perf_events(void)
{
	struct event_constraint *c;
	int err;

	pr_info("Performance Events: ");

	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_INTEL:
		err = intel_pmu_init();
		break;
	case X86_VENDOR_AMD:
		err = amd_pmu_init();
		break;
	default:
		return 0;
	}
	if (err != 0) {
		pr_cont("no PMU driver, software events only.\n");
		return 0;
	}

	pmu_check_apic();

	/* sanity check that the hardware exists or is emulated */
	if (!check_hw_exists())
		return 0;

	pr_cont("%s PMU driver.\n", x86_pmu.name);

	if (x86_pmu.quirks)
		x86_pmu.quirks();

	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
		WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
	}
	x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;

	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
		WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
	}

	x86_pmu.intel_ctrl |=
		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;

	perf_events_lapic_init();
	register_die_notifier(&perf_event_nmi_notifier);

	unconstrained = (struct event_constraint)
		__EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
				   0, x86_pmu.num_counters);

	if (x86_pmu.event_constraints) {
		for_each_event_constraint(c, x86_pmu.event_constraints) {
			if (c->cmask != X86_RAW_EVENT_MASK)
				continue;

			c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
			c->weight += x86_pmu.num_counters;
		}
	}

	pr_info("... version:                %d\n",     x86_pmu.version);
	pr_info("... bit width:              %d\n",     x86_pmu.cntval_bits);
	pr_info("... generic registers:      %d\n",     x86_pmu.num_counters);
	pr_info("... value mask:             %016Lx\n", x86_pmu.cntval_mask);
	pr_info("... max period:             %016Lx\n", x86_pmu.max_period);
	pr_info("... fixed-purpose events:   %d\n",     x86_pmu.num_counters_fixed);
	pr_info("... event mask:             %016Lx\n", x86_pmu.intel_ctrl);

	perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
	perf_cpu_notifier(x86_pmu_notifier);

	return 0;
}
early_initcall(init_hw_perf_events);

static inline void x86_pmu_read(struct perf_event *event)
{
	x86_perf_event_update(event);
}

/*
 * Start group events scheduling transaction
 * Set the flag to make pmu::enable() not perform the
 * schedulability test, it will be performed at commit time
 */
static void x86_pmu_start_txn(struct pmu *pmu)
{
	perf_pmu_disable(pmu);
	__this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
	__this_cpu_write(cpu_hw_events.n_txn, 0);
}

/*
 * Stop group events scheduling transaction
 * Clear the flag and pmu::enable() will perform the
 * schedulability test.
 */
static void x86_pmu_cancel_txn(struct pmu *pmu)
{
	__this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
	/*
	 * Truncate the collected events.
	 */
	__this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
	__this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
	perf_pmu_enable(pmu);
}

/*
 * Commit group events scheduling transaction
 * Perform the group schedulability test as a whole
 * Return 0 if success
 */
static int x86_pmu_commit_txn(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
	int assign[X86_PMC_IDX_MAX];
	int n, ret;

	n = cpuc->n_events;

	if (!x86_pmu_initialized())
		return -EAGAIN;

	ret = x86_pmu.schedule_events(cpuc, n, assign);
	if (ret)
		return ret;

	/*
	 * copy new assignment, now we know it is possible
	 * will be used by hw_perf_enable()
	 */
	memcpy(cpuc->assign, assign, n*sizeof(int));

	cpuc->group_flag &= ~PERF_EVENT_TXN;
	perf_pmu_enable(pmu);
	return 0;
}
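
/*
 * Rough sketch of how the core is expected to use this transaction API when
 * scheduling an event group (not a verbatim quote of the core perf code):
 *
 *	pmu->start_txn(pmu);
 *	for each event in the group:
 *		if (pmu->add(event, flags) fails)
 *			goto fail;
 *	if (pmu->commit_txn(pmu) == 0)
 *		return success;
 * fail:
 *	pmu->cancel_txn(pmu);
 *
 * x86_pmu_add() skips the per-event schedulability test while the TXN flag
 * is set, and x86_pmu_commit_txn() runs it once for the whole group.
 */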

/*
 * validate that we can schedule this event
 */
static int validate_event(struct perf_event *event)
{
	struct cpu_hw_events *fake_cpuc;
	struct event_constraint *c;
	int ret = 0;

	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
	if (!fake_cpuc)
		return -ENOMEM;

	c = x86_pmu.get_event_constraints(fake_cpuc, event);

	if (!c || !c->weight)
		ret = -ENOSPC;

	if (x86_pmu.put_event_constraints)
		x86_pmu.put_event_constraints(fake_cpuc, event);

	kfree(fake_cpuc);

	return ret;
}

/*
 * validate a single event group
 *
 * validation includes:
 *	- check events are compatible with each other
 *	- events do not compete for the same counter
 *	- number of events <= number of counters
 *
 * validation ensures the group can be loaded onto the
 * PMU if it was the only group available.
 */
static int validate_group(struct perf_event *event)
{
	struct perf_event *leader = event->group_leader;
	struct cpu_hw_events *fake_cpuc;
	int ret, n;

	ret = -ENOMEM;
	fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
	if (!fake_cpuc)
		goto out;

	/*
	 * the event is not yet connected with its
	 * siblings therefore we must first collect
	 * existing siblings, then add the new event
	 * before we can simulate the scheduling
	 */
	ret = -ENOSPC;
	n = collect_events(fake_cpuc, leader, true);
	if (n < 0)
		goto out_free;

	fake_cpuc->n_events = n;
	n = collect_events(fake_cpuc, event, false);
	if (n < 0)
		goto out_free;

	fake_cpuc->n_events = n;

	ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);

out_free:
	kfree(fake_cpuc);
out:
	return ret;
}

static int x86_pmu_event_init(struct perf_event *event)
{
	struct pmu *tmp;
	int err;

	switch (event->attr.type) {
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
	case PERF_TYPE_HW_CACHE:
		break;

	default:
		return -ENOENT;
	}

	err = __x86_pmu_event_init(event);
	if (!err) {
		/*
		 * we temporarily connect event to its pmu
		 * such that validate_group() can classify
		 * it as an x86 event using is_x86_event()
		 */
		tmp = event->pmu;
		event->pmu = &pmu;

		if (event->group_leader != event)
			err = validate_group(event);
		else
			err = validate_event(event);

		event->pmu = tmp;
	}
	if (err) {
		if (event->destroy)
			event->destroy(event);
	}

	return err;
}

static struct pmu pmu = {
	.pmu_enable	= x86_pmu_enable,
	.pmu_disable	= x86_pmu_disable,

	.event_init	= x86_pmu_event_init,

	.add		= x86_pmu_add,
	.del		= x86_pmu_del,
	.start		= x86_pmu_start,
	.stop		= x86_pmu_stop,
	.read		= x86_pmu_read,

	.start_txn	= x86_pmu_start_txn,
	.cancel_txn	= x86_pmu_cancel_txn,
	.commit_txn	= x86_pmu_commit_txn,
};

/*
 * callchain support
 */

static int backtrace_stack(void *data, char *name)
{
	return 0;
}

static void backtrace_address(void *data, unsigned long addr, int reliable)
{
	struct perf_callchain_entry *entry = data;

	perf_callchain_store(entry, addr);
}

static const struct stacktrace_ops backtrace_ops = {
	.stack			= backtrace_stack,
	.address		= backtrace_address,
	.walk_stack		= print_context_stack_bp,
};

void
perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return;
	}

	perf_callchain_store(entry, regs->ip);

	dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
}

#ifdef CONFIG_COMPAT
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
	/* 32-bit process in 64-bit kernel. */
	struct stack_frame_ia32 frame;
	const void __user *fp;

	if (!test_thread_flag(TIF_IA32))
		return 0;

	fp = compat_ptr(regs->bp);
	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame     = 0;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != sizeof(frame))
			break;

		if (fp < compat_ptr(regs->sp))
			break;

		perf_callchain_store(entry, frame.return_address);
		fp = compat_ptr(frame.next_frame);
	}
	return 1;
}
#else
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
    return 0;
}
#endif

void
perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
{
	struct stack_frame frame;
	const void __user *fp;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		/* TODO: We don't support guest os callchain now */
		return;
	}

	fp = (void __user *)regs->bp;

	perf_callchain_store(entry, regs->ip);

	if (perf_callchain_user32(regs, entry))
		return;

	while (entry->nr < PERF_MAX_STACK_DEPTH) {
		unsigned long bytes;
		frame.next_frame	     = NULL;
		frame.return_address = 0;

		bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
		if (bytes != sizeof(frame))
			break;

		if ((unsigned long)fp < regs->sp)
			break;

		perf_callchain_store(entry, frame.return_address);
		fp = frame.next_frame;
	}
}

unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
	unsigned long ip;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
		ip = perf_guest_cbs->get_guest_ip();
	else
		ip = instruction_pointer(regs);

	return ip;
}

unsigned long perf_misc_flags(struct pt_regs *regs)
{
	int misc = 0;

	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
		if (perf_guest_cbs->is_user_mode())
			misc |= PERF_RECORD_MISC_GUEST_USER;
		else
			misc |= PERF_RECORD_MISC_GUEST_KERNEL;
	} else {
		if (user_mode(regs))
			misc |= PERF_RECORD_MISC_USER;
		else
			misc |= PERF_RECORD_MISC_KERNEL;
	}

	if (regs->flags & PERF_EFLAGS_EXACT)
		misc |= PERF_RECORD_MISC_EXACT_IP;

	return misc;
}