Commit c47d122c authored by Linus Torvalds

Merge tag 'perf-tools-fixes-for-v6.4-1-2023-05-20' of...

Merge tag 'perf-tools-fixes-for-v6.4-1-2023-05-20' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux

Pull perf tools fixes from Arnaldo Carvalho de Melo:

 - Fail gracefully if BUILD_BPF_SKEL=1 is specified and clang isn't
   available

 - Add empty 'struct rq' to 'perf lock contention' to satisfy libbpf
   'runqueue' type verification. This feature is built only with
   BUILD_BPF_SKEL=1

 - Make vmlinux.h use bpf.h and perf_event.h in source directory, not
   system ones that may be old and not have things like 'union
   perf_sample_weight'

 - Add system include paths to BPF builds to pick things missing in the
   headers included by clang -target bpf

 - Update various header copies with the kernel sources

 - Change divide by zero and not supported events behavior to show
   'nan'/'not counted' in 'perf stat' output.

   This happens when using things like 'perf stat -M TopdownL2 true',
   involving JSON metrics

 - Update no event/metric expectations affected by using JSON metrics in
   'perf stat -ddd' perf test

 - Avoid segv with 'perf stat --topdown' for metrics without a group

 - Do not assume which events may have a PMU name, allowing the logic to
   keep an AUX event group together. Makes this use case work again:

     $ perf record --no-bpf-event -c 10 -e '{intel_pt//,tlb_flush.stlb_any/aux-sample-size=8192/pp}:u' -- sleep 0.1
     [ perf record: Woken up 1 times to write data ]
     [ perf record: Captured and wrote 0.078 MB perf.data ]
     $ perf script -F-dso,+addr | grep -C5 tlb_flush.stlb_any | head -11
     sleep 20444 [003]  7939.510243:  1  branches:uH:  7f5350cc82a2 dl_main+0x9a2 => 7f5350cb38f0 _dl_add_to_namespace_list+0x0
     sleep 20444 [003]  7939.510243:  1  branches:uH:  7f5350cb3908 _dl_add_to_namespace_list+0x18 => 7f5350cbb080 rtld_mutex_dummy+0x0
     sleep 20444 [003]  7939.510243:  1  branches:uH:  7f5350cc8350 dl_main+0xa50 => 0 [unknown]
     sleep 20444 [003]  7939.510244:  1  branches:uH:  7f5350cc83ca dl_main+0xaca => 7f5350caeb60 _dl_process_pt_gnu_property+0x0
     sleep 20444 [003]  7939.510245:  1  branches:uH:  7f5350caeb60 _dl_process_pt_gnu_property+0x0 => 0 [unknown]
     sleep 20444  7939.510245:       10 tlb_flush.stlb_any/aux-sample-size=8192/pp: 0 7f5350caeb60 _dl_process_pt_gnu_property+0x0
     sleep 20444 [003]  7939.510254:  1  branches:uH:  7f5350cc87fe dl_main+0xefe => 7f5350ccd240 strcmp+0x0
     sleep 20444 [003]  7939.510254:  1  branches:uH:  7f5350cc8862 dl_main+0xf62 => 0 [unknown]

 - Add a check for the above use case in 'perf test test_intel_pt'

 - Fix build with refcount checking on arm64: it was still accessing
   fields that need to be wrapped so that the refcounted struct gets
   checked

 - Fix contextid validation in ARM's CS-ETM, so that older kernels
   without that field can still be supported

 - Skip unsupported aggregation for stat events found in perf.data files
   in 'perf script'

 - Add stat test for record and script to check the previous problem

 - Remove needless debuginfod queries from 'perf test java symbol'; they
   were just making the test take a long time to complete

 - Address python SafeConfigParser() deprecation warning in 'perf test
   attr'

 - Fix __NR_execve undeclared on i386 'perf bench syscall' build error

* tag 'perf-tools-fixes-for-v6.4-1-2023-05-20' of git://git.kernel.org/pub/scm/linux/kernel/git/acme/linux: (33 commits)
  perf bench syscall: Fix __NR_execve undeclared build error
  perf test attr: Fix python SafeConfigParser() deprecation warning
  perf test attr: Update no event/metric expectations
  tools headers disabled-features: Sync with the kernel sources
  tools headers UAPI: Sync arch prctl headers with the kernel sources
  tools headers: Update the copy of x86's mem{cpy,set}_64.S used in 'perf bench'
  tools headers x86 cpufeatures: Sync with the kernel sources
  tools headers UAPI: Sync s390 syscall table file that wires up the memfd_secret syscall
  tools headers UAPI: Sync linux/prctl.h with the kernel sources
  perf metrics: Avoid segv with --topdown for metrics without a group
  perf lock contention: Add empty 'struct rq' to satisfy libbpf 'runqueue' type verification
  perf cs-etm: Fix contextid validation
  perf arm64: Fix build with refcount checking
  perf test: Add stat test for record and script
  perf script: Skip aggregation for stat events
  perf build: Add system include paths to BPF builds
  perf bpf skels: Make vmlinux.h use bpf.h and perf_event.h in source directory
  perf parse-events: Do not break up AUX event group
  perf test test_intel_pt.sh: Test sample mode with event with PMU name
  perf evsel: Modify group pmu name for software events
  ...
parents 4927cb98 4e111f0c
...@@ -198,6 +198,15 @@ struct kvm_arm_copy_mte_tags { ...@@ -198,6 +198,15 @@ struct kvm_arm_copy_mte_tags {
__u64 reserved[2]; __u64 reserved[2];
}; };
/*
* Counter/Timer offset structure. Describes the virtual/physical offset.
* To be used with KVM_ARM_SET_COUNTER_OFFSET.
*
* Both members are 64-bit, so the structure is 16 bytes with no implicit
* padding.
*/
struct kvm_arm_counter_offset {
__u64 counter_offset; /* the offset described above */
__u64 reserved; /* NOTE(review): presumably must be zero for now -- confirm against the KVM API docs */
};
#define KVM_ARM_TAGS_TO_GUEST 0 #define KVM_ARM_TAGS_TO_GUEST 0
#define KVM_ARM_TAGS_FROM_GUEST 1 #define KVM_ARM_TAGS_FROM_GUEST 1
...@@ -372,6 +381,10 @@ enum { ...@@ -372,6 +381,10 @@ enum {
#endif #endif
}; };
/* Device Control API on vm fd */
#define KVM_ARM_VM_SMCCC_CTRL 0
#define KVM_ARM_VM_SMCCC_FILTER 0
/* Device Control API: ARM VGIC */ /* Device Control API: ARM VGIC */
#define KVM_DEV_ARM_VGIC_GRP_ADDR 0 #define KVM_DEV_ARM_VGIC_GRP_ADDR 0
#define KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1 #define KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1
...@@ -411,6 +424,8 @@ enum { ...@@ -411,6 +424,8 @@ enum {
#define KVM_ARM_VCPU_TIMER_CTRL 1 #define KVM_ARM_VCPU_TIMER_CTRL 1
#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 #define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0
#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 #define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1
#define KVM_ARM_VCPU_TIMER_IRQ_HVTIMER 2
#define KVM_ARM_VCPU_TIMER_IRQ_HPTIMER 3
#define KVM_ARM_VCPU_PVTIME_CTRL 2 #define KVM_ARM_VCPU_PVTIME_CTRL 2
#define KVM_ARM_VCPU_PVTIME_IPA 0 #define KVM_ARM_VCPU_PVTIME_IPA 0
...@@ -469,6 +484,27 @@ enum { ...@@ -469,6 +484,27 @@ enum {
/* run->fail_entry.hardware_entry_failure_reason codes. */ /* run->fail_entry.hardware_entry_failure_reason codes. */
#define KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED (1ULL << 0) #define KVM_EXIT_FAIL_ENTRY_CPU_UNSUPPORTED (1ULL << 0)
/*
* Actions that can be named for an SMCCC filter.
*
* NR_SMCCC_FILTER_ACTIONS is compiled in only when __KERNEL__ is defined.
* Being the last enumerator, it evaluates to the number of actions listed
* before it (3); it is not itself an action.
*
* NOTE(review): judging by the names only, HANDLE/DENY/FWD_TO_USER select
* in-kernel handling, rejection, and forwarding to user space respectively --
* confirm against the KVM documentation.
*/
enum kvm_smccc_filter_action {
KVM_SMCCC_FILTER_HANDLE = 0,
KVM_SMCCC_FILTER_DENY,
KVM_SMCCC_FILTER_FWD_TO_USER,
#ifdef __KERNEL__
NR_SMCCC_FILTER_ACTIONS
#endif
};
/*
* SMCCC filter description. The declared members add up to 24 bytes
* (4 + 4 + 1 + 15).
*/
struct kvm_smccc_filter {
__u32 base; /* NOTE(review): presumably the first SMCCC function ID covered -- confirm */
__u32 nr_functions; /* NOTE(review): presumably the number of function IDs covered, starting at base -- confirm */
__u8 action; /* NOTE(review): presumably an enum kvm_smccc_filter_action value stored in one byte -- confirm */
__u8 pad[15]; /* explicit padding up to 24 bytes; NOTE(review): presumably must be zero -- confirm */
};
/* arm64-specific KVM_EXIT_HYPERCALL flags */
#define KVM_HYPERCALL_EXIT_SMC (1U << 0)
#define KVM_HYPERCALL_EXIT_16BIT (1U << 1)
#endif #endif
#endif /* __ARM_KVM_H__ */ #endif /* __ARM_KVM_H__ */
...@@ -97,7 +97,7 @@ ...@@ -97,7 +97,7 @@
#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ #define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */
#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ #define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */
#define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */ #define X86_FEATURE_AMD_LBR_V2 ( 3*32+17) /* AMD Last Branch Record Extension Version 2 */
#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ /* FREE, was #define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) "" LFENCE synchronizes RDTSC */
#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ #define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */
#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ #define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */
#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ #define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */
...@@ -226,10 +226,9 @@ ...@@ -226,10 +226,9 @@
/* Virtualization flags: Linux defined, word 8 */ /* Virtualization flags: Linux defined, word 8 */
#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ #define X86_FEATURE_FLEXPRIORITY ( 8*32+ 1) /* Intel FlexPriority */
#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ #define X86_FEATURE_EPT ( 8*32+ 2) /* Intel Extended Page Table */
#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ #define X86_FEATURE_VPID ( 8*32+ 3) /* Intel Virtual Processor ID */
#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */ #define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */
#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ #define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
...@@ -307,14 +306,21 @@ ...@@ -307,14 +306,21 @@
#define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */ #define X86_FEATURE_SGX_EDECCSSA (11*32+18) /* "" SGX EDECCSSA user leaf function */
#define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */ #define X86_FEATURE_CALL_DEPTH (11*32+19) /* "" Call depth tracking for RSB stuffing */
#define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */ #define X86_FEATURE_MSR_TSX_CTRL (11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */
#define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */
#define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
/* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
#define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
#define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */ #define X86_FEATURE_AVX512_BF16 (12*32+ 5) /* AVX512 BFLOAT16 instructions */
#define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */ #define X86_FEATURE_CMPCCXADD (12*32+ 7) /* "" CMPccXADD instructions */
#define X86_FEATURE_ARCH_PERFMON_EXT (12*32+ 8) /* "" Intel Architectural PerfMon Extension */
#define X86_FEATURE_FZRM (12*32+10) /* "" Fast zero-length REP MOVSB */
#define X86_FEATURE_FSRS (12*32+11) /* "" Fast short REP STOSB */
#define X86_FEATURE_FSRC (12*32+12) /* "" Fast short REP {CMPSB,SCASB} */
#define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */ #define X86_FEATURE_LKGS (12*32+18) /* "" Load "kernel" (userspace) GS */
#define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */ #define X86_FEATURE_AMX_FP16 (12*32+21) /* "" AMX fp16 Support */
#define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */ #define X86_FEATURE_AVX_IFMA (12*32+23) /* "" Support for VPMADD52[H,L]UQ */
#define X86_FEATURE_LAM (12*32+26) /* Linear Address Masking */
/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ /* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ #define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */
...@@ -331,6 +337,7 @@ ...@@ -331,6 +337,7 @@
#define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */ #define X86_FEATURE_VIRT_SSBD (13*32+25) /* Virtualized Speculative Store Bypass Disable */
#define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */ #define X86_FEATURE_AMD_SSB_NO (13*32+26) /* "" Speculative Store Bypass is fixed in hardware. */
#define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */ #define X86_FEATURE_CPPC (13*32+27) /* Collaborative Processor Performance Control */
#define X86_FEATURE_AMD_PSFD (13*32+28) /* "" Predictive Store Forwarding Disable */
#define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */ #define X86_FEATURE_BTC_NO (13*32+29) /* "" Not vulnerable to Branch Type Confusion */
#define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */ #define X86_FEATURE_BRS (13*32+31) /* Branch Sampling available */
...@@ -363,6 +370,7 @@ ...@@ -363,6 +370,7 @@
#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */ #define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
#define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */ #define X86_FEATURE_X2AVIC (15*32+18) /* Virtual x2apic */
#define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */ #define X86_FEATURE_V_SPEC_CTRL (15*32+20) /* Virtual SPEC_CTRL */
#define X86_FEATURE_VNMI (15*32+25) /* Virtual NMI */
#define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */ #define X86_FEATURE_SVME_ADDR_CHK (15*32+28) /* "" SVME addr check */
/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
...@@ -427,6 +435,13 @@ ...@@ -427,6 +435,13 @@
#define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */ #define X86_FEATURE_V_TSC_AUX (19*32+ 9) /* "" Virtual TSC_AUX */
#define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */ #define X86_FEATURE_SME_COHERENT (19*32+10) /* "" AMD hardware-enforced cache coherency */
/* AMD-defined Extended Feature 2 EAX, CPUID level 0x80000021 (EAX), word 20 */
#define X86_FEATURE_NO_NESTED_DATA_BP (20*32+ 0) /* "" No Nested Data Breakpoints */
#define X86_FEATURE_LFENCE_RDTSC (20*32+ 2) /* "" LFENCE always serializing / synchronizes RDTSC */
#define X86_FEATURE_NULL_SEL_CLR_BASE (20*32+ 6) /* "" Null Selector Clears Base */
#define X86_FEATURE_AUTOIBRS (20*32+ 8) /* "" Automatic IBRS */
#define X86_FEATURE_NO_SMM_CTL_MSR (20*32+ 9) /* "" SMM_CTL MSR is not present */
/* /*
* BUG word(s) * BUG word(s)
*/ */
...@@ -467,5 +482,6 @@ ...@@ -467,5 +482,6 @@
#define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */ #define X86_BUG_MMIO_UNKNOWN X86_BUG(26) /* CPU is too old and its MMIO Stale Data status is unknown */
#define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */ #define X86_BUG_RETBLEED X86_BUG(27) /* CPU is affected by RETBleed */
#define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ #define X86_BUG_EIBRS_PBRSB X86_BUG(28) /* EIBRS is vulnerable to Post Barrier RSB Predictions */
#define X86_BUG_SMT_RSB X86_BUG(29) /* CPU is vulnerable to Cross-Thread Return Address Predictions */
#endif /* _ASM_X86_CPUFEATURES_H */ #endif /* _ASM_X86_CPUFEATURES_H */
...@@ -75,6 +75,12 @@ ...@@ -75,6 +75,12 @@
# define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31)) # define DISABLE_CALL_DEPTH_TRACKING (1 << (X86_FEATURE_CALL_DEPTH & 31))
#endif #endif
#ifdef CONFIG_ADDRESS_MASKING
# define DISABLE_LAM 0
#else
# define DISABLE_LAM (1 << (X86_FEATURE_LAM & 31))
#endif
#ifdef CONFIG_INTEL_IOMMU_SVM #ifdef CONFIG_INTEL_IOMMU_SVM
# define DISABLE_ENQCMD 0 # define DISABLE_ENQCMD 0
#else #else
...@@ -115,7 +121,7 @@ ...@@ -115,7 +121,7 @@
#define DISABLED_MASK10 0 #define DISABLED_MASK10 0
#define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \
DISABLE_CALL_DEPTH_TRACKING) DISABLE_CALL_DEPTH_TRACKING)
#define DISABLED_MASK12 0 #define DISABLED_MASK12 (DISABLE_LAM)
#define DISABLED_MASK13 0 #define DISABLED_MASK13 0
#define DISABLED_MASK14 0 #define DISABLED_MASK14 0
#define DISABLED_MASK15 0 #define DISABLED_MASK15 0
......
...@@ -206,6 +206,8 @@ ...@@ -206,6 +206,8 @@
/* Abbreviated from Intel SDM name IA32_INTEGRITY_CAPABILITIES */ /* Abbreviated from Intel SDM name IA32_INTEGRITY_CAPABILITIES */
#define MSR_INTEGRITY_CAPS 0x000002d9 #define MSR_INTEGRITY_CAPS 0x000002d9
#define MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT 2
#define MSR_INTEGRITY_CAPS_ARRAY_BIST BIT(MSR_INTEGRITY_CAPS_ARRAY_BIST_BIT)
#define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT 4 #define MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT 4
#define MSR_INTEGRITY_CAPS_PERIODIC_BIST BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT) #define MSR_INTEGRITY_CAPS_PERIODIC_BIST BIT(MSR_INTEGRITY_CAPS_PERIODIC_BIST_BIT)
......
...@@ -559,4 +559,7 @@ struct kvm_pmu_event_filter { ...@@ -559,4 +559,7 @@ struct kvm_pmu_event_filter {
#define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */ #define KVM_VCPU_TSC_CTRL 0 /* control group for the timestamp counter (TSC) */
#define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */ #define KVM_VCPU_TSC_OFFSET 0 /* attribute for the TSC offset */
/* x86-specific KVM_EXIT_HYPERCALL flags. */
#define KVM_EXIT_HYPERCALL_LONG_MODE BIT(0)
#endif /* _ASM_X86_KVM_H */ #endif /* _ASM_X86_KVM_H */
...@@ -16,8 +16,16 @@ ...@@ -16,8 +16,16 @@
#define ARCH_GET_XCOMP_GUEST_PERM 0x1024 #define ARCH_GET_XCOMP_GUEST_PERM 0x1024
#define ARCH_REQ_XCOMP_GUEST_PERM 0x1025 #define ARCH_REQ_XCOMP_GUEST_PERM 0x1025
#define ARCH_XCOMP_TILECFG 17
#define ARCH_XCOMP_TILEDATA 18
#define ARCH_MAP_VDSO_X32 0x2001 #define ARCH_MAP_VDSO_X32 0x2001
#define ARCH_MAP_VDSO_32 0x2002 #define ARCH_MAP_VDSO_32 0x2002
#define ARCH_MAP_VDSO_64 0x2003 #define ARCH_MAP_VDSO_64 0x2003
#define ARCH_GET_UNTAG_MASK 0x4001
#define ARCH_ENABLE_TAGGED_ADDR 0x4002
#define ARCH_GET_MAX_TAG_BITS 0x4003
#define ARCH_FORCE_TAGGED_SVA 0x4004
#endif /* _ASM_X86_PRCTL_H */ #endif /* _ASM_X86_PRCTL_H */
...@@ -2,6 +2,9 @@ ...@@ -2,6 +2,9 @@
#ifndef __NR_fork #ifndef __NR_fork
#define __NR_fork 2 #define __NR_fork 2
#endif #endif
#ifndef __NR_execve
#define __NR_execve 11
#endif
#ifndef __NR_getppid #ifndef __NR_getppid
#define __NR_getppid 64 #define __NR_getppid 64
#endif #endif
......
...@@ -9,13 +9,6 @@ ...@@ -9,13 +9,6 @@
.section .noinstr.text, "ax" .section .noinstr.text, "ax"
/*
* We build a jump to memcpy_orig by default which gets NOPped out on
* the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
* have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
* to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
*/
/* /*
* memcpy - Copy a memory block. * memcpy - Copy a memory block.
* *
...@@ -26,17 +19,21 @@ ...@@ -26,17 +19,21 @@
* *
* Output: * Output:
* rax original destination * rax original destination
*
* The FSRM alternative should be done inline (avoiding the call and
* the disgusting return handling), but that would require some help
* from the compiler for better calling conventions.
*
* The 'rep movsb' itself is small enough to replace the call, but the
* two register moves blow up the code. And one of them is "needed"
* only for the return value that is the same as the source input,
* which the compiler could/should do much better anyway.
*/ */
SYM_TYPED_FUNC_START(__memcpy) SYM_TYPED_FUNC_START(__memcpy)
ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ ALTERNATIVE "jmp memcpy_orig", "", X86_FEATURE_FSRM
"jmp memcpy_erms", X86_FEATURE_ERMS
movq %rdi, %rax movq %rdi, %rax
movq %rdx, %rcx movq %rdx, %rcx
shrq $3, %rcx
andl $7, %edx
rep movsq
movl %edx, %ecx
rep movsb rep movsb
RET RET
SYM_FUNC_END(__memcpy) SYM_FUNC_END(__memcpy)
...@@ -45,17 +42,6 @@ EXPORT_SYMBOL(__memcpy) ...@@ -45,17 +42,6 @@ EXPORT_SYMBOL(__memcpy)
SYM_FUNC_ALIAS(memcpy, __memcpy) SYM_FUNC_ALIAS(memcpy, __memcpy)
EXPORT_SYMBOL(memcpy) EXPORT_SYMBOL(memcpy)
/*
* memcpy_erms() - enhanced fast string memcpy. This is faster and
* simpler than memcpy. Use memcpy_erms when possible.
*/
SYM_FUNC_START_LOCAL(memcpy_erms)
movq %rdi, %rax
movq %rdx, %rcx
rep movsb
RET
SYM_FUNC_END(memcpy_erms)
SYM_FUNC_START_LOCAL(memcpy_orig) SYM_FUNC_START_LOCAL(memcpy_orig)
movq %rdi, %rax movq %rdi, %rax
......
...@@ -18,27 +18,22 @@ ...@@ -18,27 +18,22 @@
* rdx count (bytes) * rdx count (bytes)
* *
* rax original destination * rax original destination
*
* The FSRS alternative should be done inline (avoiding the call and
* the disgusting return handling), but that would require some help
* from the compiler for better calling conventions.
*
* The 'rep stosb' itself is small enough to replace the call, but all
* the register moves blow up the code. And two of them are "needed"
* only for the return value that is the same as the source input,
* which the compiler could/should do much better anyway.
*/ */
SYM_FUNC_START(__memset) SYM_FUNC_START(__memset)
/* ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS
* Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
* to use it when possible. If not available, use fast string instructions.
*
* Otherwise, use original memset function.
*/
ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
"jmp memset_erms", X86_FEATURE_ERMS
movq %rdi,%r9 movq %rdi,%r9
movb %sil,%al
movq %rdx,%rcx movq %rdx,%rcx
andl $7,%edx
shrq $3,%rcx
/* expand byte value */
movzbl %sil,%esi
movabs $0x0101010101010101,%rax
imulq %rsi,%rax
rep stosq
movl %edx,%ecx
rep stosb rep stosb
movq %r9,%rax movq %r9,%rax
RET RET
...@@ -48,26 +43,6 @@ EXPORT_SYMBOL(__memset) ...@@ -48,26 +43,6 @@ EXPORT_SYMBOL(__memset)
SYM_FUNC_ALIAS(memset, __memset) SYM_FUNC_ALIAS(memset, __memset)
EXPORT_SYMBOL(memset) EXPORT_SYMBOL(memset)
/*
* ISO C memset - set a memory block to a byte value. This function uses
* enhanced rep stosb to override the fast string function.
* The code is simpler and shorter than the fast string function as well.
*
* rdi destination
* rsi value (char)
* rdx count (bytes)
*
* rax original destination
*/
SYM_FUNC_START_LOCAL(memset_erms)
movq %rdi,%r9
movb %sil,%al
movq %rdx,%rcx
rep stosb
movq %r9,%rax
RET
SYM_FUNC_END(memset_erms)
SYM_FUNC_START_LOCAL(memset_orig) SYM_FUNC_START_LOCAL(memset_orig)
movq %rdi,%r10 movq %rdi,%r10
......
...@@ -4,7 +4,6 @@ ...@@ -4,7 +4,6 @@
/* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */ /* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */
#define altinstruction_entry # #define ALTERNATIVE #
#define ALTERNATIVE_2 #
#endif #endif
...@@ -972,6 +972,19 @@ extern "C" { ...@@ -972,6 +972,19 @@ extern "C" {
#define DRM_IOCTL_GET_STATS DRM_IOR( 0x06, struct drm_stats) #define DRM_IOCTL_GET_STATS DRM_IOR( 0x06, struct drm_stats)
#define DRM_IOCTL_SET_VERSION DRM_IOWR(0x07, struct drm_set_version) #define DRM_IOCTL_SET_VERSION DRM_IOWR(0x07, struct drm_set_version)
#define DRM_IOCTL_MODESET_CTL DRM_IOW(0x08, struct drm_modeset_ctl) #define DRM_IOCTL_MODESET_CTL DRM_IOW(0x08, struct drm_modeset_ctl)
/**
* DRM_IOCTL_GEM_CLOSE - Close a GEM handle.
*
* GEM handles are not reference-counted by the kernel. User-space is
* responsible for managing their lifetime. For example, if user-space imports
* the same memory object twice on the same DRM file description, the same GEM
* handle is returned by both imports, and user-space needs to ensure
* &DRM_IOCTL_GEM_CLOSE is performed once only. The same situation can happen
* when a memory object is allocated, then exported and imported again on the
* same DRM file description. The &DRM_IOCTL_MODE_GETFB2 IOCTL is an exception
* and always returns fresh new GEM handles even if an existing GEM handle
* already refers to the same memory object before the IOCTL is performed.
*/
#define DRM_IOCTL_GEM_CLOSE DRM_IOW (0x09, struct drm_gem_close) #define DRM_IOCTL_GEM_CLOSE DRM_IOW (0x09, struct drm_gem_close)
#define DRM_IOCTL_GEM_FLINK DRM_IOWR(0x0a, struct drm_gem_flink) #define DRM_IOCTL_GEM_FLINK DRM_IOWR(0x0a, struct drm_gem_flink)
#define DRM_IOCTL_GEM_OPEN DRM_IOWR(0x0b, struct drm_gem_open) #define DRM_IOCTL_GEM_OPEN DRM_IOWR(0x0b, struct drm_gem_open)
...@@ -1012,7 +1025,37 @@ extern "C" { ...@@ -1012,7 +1025,37 @@ extern "C" {
#define DRM_IOCTL_UNLOCK DRM_IOW( 0x2b, struct drm_lock) #define DRM_IOCTL_UNLOCK DRM_IOW( 0x2b, struct drm_lock)
#define DRM_IOCTL_FINISH DRM_IOW( 0x2c, struct drm_lock) #define DRM_IOCTL_FINISH DRM_IOW( 0x2c, struct drm_lock)
/**
* DRM_IOCTL_PRIME_HANDLE_TO_FD - Convert a GEM handle to a DMA-BUF FD.
*
* User-space sets &drm_prime_handle.handle with the GEM handle to export and
* &drm_prime_handle.flags, and gets back a DMA-BUF file descriptor in
* &drm_prime_handle.fd.
*
* The export can fail for any driver-specific reason, e.g. because export is
* not supported for this specific GEM handle (but might be for others).
*
* Support for exporting DMA-BUFs is advertised via &DRM_PRIME_CAP_EXPORT.
*/
#define DRM_IOCTL_PRIME_HANDLE_TO_FD DRM_IOWR(0x2d, struct drm_prime_handle) #define DRM_IOCTL_PRIME_HANDLE_TO_FD DRM_IOWR(0x2d, struct drm_prime_handle)
/**
* DRM_IOCTL_PRIME_FD_TO_HANDLE - Convert a DMA-BUF FD to a GEM handle.
*
* User-space sets &drm_prime_handle.fd with a DMA-BUF file descriptor to
* import, and gets back a GEM handle in &drm_prime_handle.handle.
* &drm_prime_handle.flags is unused.
*
* If an existing GEM handle refers to the memory object backing the DMA-BUF,
* that GEM handle is returned. Therefore user-space which needs to handle
* arbitrary DMA-BUFs must have a user-space lookup data structure to manually
* reference-count duplicated GEM handles. For more information see
* &DRM_IOCTL_GEM_CLOSE.
*
* The import can fail for any driver-specific reason, e.g. because import is
* only supported for DMA-BUFs allocated on this DRM device.
*
* Support for importing DMA-BUFs is advertised via &DRM_PRIME_CAP_IMPORT.
*/
#define DRM_IOCTL_PRIME_FD_TO_HANDLE DRM_IOWR(0x2e, struct drm_prime_handle) #define DRM_IOCTL_PRIME_FD_TO_HANDLE DRM_IOWR(0x2e, struct drm_prime_handle)
#define DRM_IOCTL_AGP_ACQUIRE DRM_IO( 0x30) #define DRM_IOCTL_AGP_ACQUIRE DRM_IO( 0x30)
...@@ -1104,8 +1147,13 @@ extern "C" { ...@@ -1104,8 +1147,13 @@ extern "C" {
* struct as the output. * struct as the output.
* *
* If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles * If the client is DRM master or has &CAP_SYS_ADMIN, &drm_mode_fb_cmd2.handles
* will be filled with GEM buffer handles. Planes are valid until one has a * will be filled with GEM buffer handles. Fresh new GEM handles are always
* zero handle -- this can be used to compute the number of planes. * returned, even if another GEM handle referring to the same memory object
* already exists on the DRM file description. The caller is responsible for
* removing the new handles, e.g. via the &DRM_IOCTL_GEM_CLOSE IOCTL. The same
* new handle will be returned for multiple planes in case they use the same
* memory object. Planes are valid until one has a zero handle -- this can be
* used to compute the number of planes.
* *
* Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid * Otherwise, &drm_mode_fb_cmd2.handles will be zeroed and planes are valid
* until one has a zero &drm_mode_fb_cmd2.pitches. * until one has a zero &drm_mode_fb_cmd2.pitches.
...@@ -1113,6 +1161,11 @@ extern "C" { ...@@ -1113,6 +1161,11 @@ extern "C" {
* If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set * If the framebuffer has a format modifier, &DRM_MODE_FB_MODIFIERS will be set
* in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the * in &drm_mode_fb_cmd2.flags and &drm_mode_fb_cmd2.modifier will contain the
* modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier. * modifier. Otherwise, user-space must ignore &drm_mode_fb_cmd2.modifier.
*
* To obtain DMA-BUF FDs for each plane without leaking GEM handles, user-space
* can export each handle via &DRM_IOCTL_PRIME_HANDLE_TO_FD, then immediately
* close each unique handle via &DRM_IOCTL_GEM_CLOSE, making sure to not
* double-close handles which are specified multiple times in the array.
*/ */
#define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2) #define DRM_IOCTL_MODE_GETFB2 DRM_IOWR(0xCE, struct drm_mode_fb_cmd2)
......
...@@ -2491,7 +2491,7 @@ struct i915_context_param_engines { ...@@ -2491,7 +2491,7 @@ struct i915_context_param_engines {
#define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */ #define I915_CONTEXT_ENGINES_EXT_LOAD_BALANCE 0 /* see i915_context_engines_load_balance */
#define I915_CONTEXT_ENGINES_EXT_BOND 1 /* see i915_context_engines_bond */ #define I915_CONTEXT_ENGINES_EXT_BOND 1 /* see i915_context_engines_bond */
#define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see i915_context_engines_parallel_submit */ #define I915_CONTEXT_ENGINES_EXT_PARALLEL_SUBMIT 2 /* see i915_context_engines_parallel_submit */
struct i915_engine_class_instance engines[0]; struct i915_engine_class_instance engines[];
} __attribute__((packed)); } __attribute__((packed));
#define I915_DEFINE_CONTEXT_PARAM_ENGINES(name__, N__) struct { \ #define I915_DEFINE_CONTEXT_PARAM_ENGINES(name__, N__) struct { \
...@@ -2676,6 +2676,10 @@ enum drm_i915_oa_format { ...@@ -2676,6 +2676,10 @@ enum drm_i915_oa_format {
I915_OAR_FORMAT_A32u40_A4u32_B8_C8, I915_OAR_FORMAT_A32u40_A4u32_B8_C8,
I915_OA_FORMAT_A24u40_A14u32_B8_C8, I915_OA_FORMAT_A24u40_A14u32_B8_C8,
/* MTL OAM */
I915_OAM_FORMAT_MPEC8u64_B8_C8,
I915_OAM_FORMAT_MPEC8u32_B8_C8,
I915_OA_FORMAT_MAX /* non-ABI */ I915_OA_FORMAT_MAX /* non-ABI */
}; };
...@@ -2758,6 +2762,25 @@ enum drm_i915_perf_property_id { ...@@ -2758,6 +2762,25 @@ enum drm_i915_perf_property_id {
*/ */
DRM_I915_PERF_PROP_POLL_OA_PERIOD, DRM_I915_PERF_PROP_POLL_OA_PERIOD,
/**
* Multiple engines may be mapped to the same OA unit. The OA unit is
* identified by class:instance of any engine mapped to it.
*
* This parameter specifies the engine class and must be passed along
* with DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE.
*
* This property is available in perf revision 6.
*/
DRM_I915_PERF_PROP_OA_ENGINE_CLASS,
/**
* This parameter specifies the engine instance and must be passed along
* with DRM_I915_PERF_PROP_OA_ENGINE_CLASS.
*
* This property is available in perf revision 6.
*/
DRM_I915_PERF_PROP_OA_ENGINE_INSTANCE,
DRM_I915_PERF_PROP_MAX /* non-ABI */ DRM_I915_PERF_PROP_MAX /* non-ABI */
}; };
......
...@@ -28,7 +28,7 @@ ...@@ -28,7 +28,7 @@
#define _BITUL(x) (_UL(1) << (x)) #define _BITUL(x) (_UL(1) << (x))
#define _BITULL(x) (_ULL(1) << (x)) #define _BITULL(x) (_ULL(1) << (x))
#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (typeof(x))(a) - 1) #define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1)
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask)) #define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) #define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
......
...@@ -162,6 +162,7 @@ struct in_addr { ...@@ -162,6 +162,7 @@ struct in_addr {
#define MCAST_MSFILTER 48 #define MCAST_MSFILTER 48
#define IP_MULTICAST_ALL 49 #define IP_MULTICAST_ALL 49
#define IP_UNICAST_IF 50 #define IP_UNICAST_IF 50
#define IP_LOCAL_PORT_RANGE 51
#define MCAST_EXCLUDE 0 #define MCAST_EXCLUDE 0
#define MCAST_INCLUDE 1 #define MCAST_INCLUDE 1
......
...@@ -341,8 +341,13 @@ struct kvm_run { ...@@ -341,8 +341,13 @@ struct kvm_run {
__u64 nr; __u64 nr;
__u64 args[6]; __u64 args[6];
__u64 ret; __u64 ret;
__u32 longmode;
__u32 pad; union {
#ifndef __KERNEL__
__u32 longmode;
#endif
__u64 flags;
};
} hypercall; } hypercall;
/* KVM_EXIT_TPR_ACCESS */ /* KVM_EXIT_TPR_ACCESS */
struct { struct {
...@@ -1184,6 +1189,7 @@ struct kvm_ppc_resize_hpt { ...@@ -1184,6 +1189,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224 #define KVM_CAP_S390_PROTECTED_ASYNC_DISABLE 224
#define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225 #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225
#define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226 #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226
#define KVM_CAP_COUNTER_OFFSET 227
#ifdef KVM_CAP_IRQ_ROUTING #ifdef KVM_CAP_IRQ_ROUTING
...@@ -1543,6 +1549,8 @@ struct kvm_s390_ucas_mapping { ...@@ -1543,6 +1549,8 @@ struct kvm_s390_ucas_mapping {
#define KVM_SET_PMU_EVENT_FILTER _IOW(KVMIO, 0xb2, struct kvm_pmu_event_filter) #define KVM_SET_PMU_EVENT_FILTER _IOW(KVMIO, 0xb2, struct kvm_pmu_event_filter)
#define KVM_PPC_SVM_OFF _IO(KVMIO, 0xb3) #define KVM_PPC_SVM_OFF _IO(KVMIO, 0xb3)
#define KVM_ARM_MTE_COPY_TAGS _IOR(KVMIO, 0xb4, struct kvm_arm_copy_mte_tags) #define KVM_ARM_MTE_COPY_TAGS _IOR(KVMIO, 0xb4, struct kvm_arm_copy_mte_tags)
/* Available with KVM_CAP_COUNTER_OFFSET */
#define KVM_ARM_SET_COUNTER_OFFSET _IOW(KVMIO, 0xb5, struct kvm_arm_counter_offset)
/* ioctl for vm fd */ /* ioctl for vm fd */
#define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device)
......
...@@ -290,6 +290,8 @@ struct prctl_mm_map { ...@@ -290,6 +290,8 @@ struct prctl_mm_map {
#define PR_SET_VMA 0x53564d41 #define PR_SET_VMA 0x53564d41
# define PR_SET_VMA_ANON_NAME 0 # define PR_SET_VMA_ANON_NAME 0
#define PR_GET_AUXV 0x41555856
#define PR_SET_MEMORY_MERGE 67 #define PR_SET_MEMORY_MERGE 67
#define PR_GET_MEMORY_MERGE 68 #define PR_GET_MEMORY_MERGE 68
#endif /* _LINUX_PRCTL_H */ #endif /* _LINUX_PRCTL_H */
...@@ -429,9 +429,14 @@ struct snd_pcm_sw_params { ...@@ -429,9 +429,14 @@ struct snd_pcm_sw_params {
snd_pcm_uframes_t avail_min; /* min avail frames for wakeup */ snd_pcm_uframes_t avail_min; /* min avail frames for wakeup */
snd_pcm_uframes_t xfer_align; /* obsolete: xfer size need to be a multiple */ snd_pcm_uframes_t xfer_align; /* obsolete: xfer size need to be a multiple */
snd_pcm_uframes_t start_threshold; /* min hw_avail frames for automatic start */ snd_pcm_uframes_t start_threshold; /* min hw_avail frames for automatic start */
snd_pcm_uframes_t stop_threshold; /* min avail frames for automatic stop */ /*
snd_pcm_uframes_t silence_threshold; /* min distance from noise for silence filling */ * The following two thresholds alleviate playback buffer underruns; when
snd_pcm_uframes_t silence_size; /* silence block size */ * hw_avail drops below the threshold, the respective action is triggered:
*/
snd_pcm_uframes_t stop_threshold; /* - stop playback */
snd_pcm_uframes_t silence_threshold; /* - pre-fill buffer with silence */
snd_pcm_uframes_t silence_size; /* max size of silence pre-fill; when >= boundary,
* fill played area with silence immediately */
snd_pcm_uframes_t boundary; /* pointers wrap point */ snd_pcm_uframes_t boundary; /* pointers wrap point */
unsigned int proto; /* protocol version */ unsigned int proto; /* protocol version */
unsigned int tstamp_type; /* timestamp type (req. proto >= 2.0.12) */ unsigned int tstamp_type; /* timestamp type (req. proto >= 2.0.12) */
...@@ -570,7 +575,8 @@ struct __snd_pcm_mmap_status64 { ...@@ -570,7 +575,8 @@ struct __snd_pcm_mmap_status64 {
struct __snd_pcm_mmap_control64 { struct __snd_pcm_mmap_control64 {
__pad_before_uframe __pad1; __pad_before_uframe __pad1;
snd_pcm_uframes_t appl_ptr; /* RW: appl ptr (0...boundary-1) */ snd_pcm_uframes_t appl_ptr; /* RW: appl ptr (0...boundary-1) */
__pad_before_uframe __pad2; __pad_before_uframe __pad2; // This should be __pad_after_uframe, but binary
// backwards compatibility constraints prevent a fix.
__pad_before_uframe __pad3; __pad_before_uframe __pad3;
snd_pcm_uframes_t avail_min; /* RW: min available frames for wakeup */ snd_pcm_uframes_t avail_min; /* RW: min available frames for wakeup */
......
...@@ -216,6 +216,12 @@ ifeq ($(call get-executable,$(BISON)),) ...@@ -216,6 +216,12 @@ ifeq ($(call get-executable,$(BISON)),)
dummy := $(error Error: $(BISON) is missing on this system, please install it) dummy := $(error Error: $(BISON) is missing on this system, please install it)
endif endif
ifeq ($(BUILD_BPF_SKEL),1)
ifeq ($(call get-executable,$(CLANG)),)
dummy := $(error $(CLANG) is missing on this system, please install it to be able to build with BUILD_BPF_SKEL=1)
endif
endif
ifneq ($(OUTPUT),) ifneq ($(OUTPUT),)
ifeq ($(shell expr $(shell $(BISON) --version | grep bison | sed -e 's/.\+ \([0-9]\+\).\([0-9]\+\).\([0-9]\+\)/\1\2\3/g') \>\= 371), 1) ifeq ($(shell expr $(shell $(BISON) --version | grep bison | sed -e 's/.\+ \([0-9]\+\).\([0-9]\+\).\([0-9]\+\)/\1\2\3/g') \>\= 371), 1)
BISON_FILE_PREFIX_MAP := --file-prefix-map=$(OUTPUT)= BISON_FILE_PREFIX_MAP := --file-prefix-map=$(OUTPUT)=
......
...@@ -1057,14 +1057,32 @@ $(SKEL_TMP_OUT) $(LIBAPI_OUTPUT) $(LIBBPF_OUTPUT) $(LIBPERF_OUTPUT) $(LIBSUBCMD_ ...@@ -1057,14 +1057,32 @@ $(SKEL_TMP_OUT) $(LIBAPI_OUTPUT) $(LIBBPF_OUTPUT) $(LIBPERF_OUTPUT) $(LIBSUBCMD_
ifdef BUILD_BPF_SKEL ifdef BUILD_BPF_SKEL
BPFTOOL := $(SKEL_TMP_OUT)/bootstrap/bpftool BPFTOOL := $(SKEL_TMP_OUT)/bootstrap/bpftool
BPF_INCLUDE := -I$(SKEL_TMP_OUT)/.. -I$(LIBBPF_INCLUDE) # Get Clang's default includes on this system, as opposed to those seen by
# '-target bpf'. This fixes "missing" files on some architectures/distros,
# such as asm/byteorder.h, asm/socket.h, asm/sockios.h, sys/cdefs.h etc.
#
# Use '-idirafter': Don't interfere with include mechanics except where the
# build would have failed anyways.
define get_sys_includes
$(shell $(1) $(2) -v -E - </dev/null 2>&1 \
| sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \
$(shell $(1) $(2) -dM -E - </dev/null | grep '__riscv_xlen ' | awk '{printf("-D__riscv_xlen=%d -D__BITS_PER_LONG=%d", $$3, $$3)}')
endef
ifneq ($(CROSS_COMPILE),)
CLANG_TARGET_ARCH = --target=$(notdir $(CROSS_COMPILE:%-=%))
endif
CLANG_SYS_INCLUDES = $(call get_sys_includes,$(CLANG),$(CLANG_TARGET_ARCH))
BPF_INCLUDE := -I$(SKEL_TMP_OUT)/.. -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES)
TOOLS_UAPI_INCLUDE := -I$(srctree)/tools/include/uapi
$(BPFTOOL): | $(SKEL_TMP_OUT) $(BPFTOOL): | $(SKEL_TMP_OUT)
$(Q)CFLAGS= $(MAKE) -C ../bpf/bpftool \ $(Q)CFLAGS= $(MAKE) -C ../bpf/bpftool \
OUTPUT=$(SKEL_TMP_OUT)/ bootstrap OUTPUT=$(SKEL_TMP_OUT)/ bootstrap
$(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) | $(SKEL_TMP_OUT) $(SKEL_TMP_OUT)/%.bpf.o: util/bpf_skel/%.bpf.c $(LIBBPF) | $(SKEL_TMP_OUT)
$(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -Wall -Werror $(BPF_INCLUDE) \ $(QUIET_CLANG)$(CLANG) -g -O2 -target bpf -Wall -Werror $(BPF_INCLUDE) $(TOOLS_UAPI_INCLUDE) \
-c $(filter util/bpf_skel/%.bpf.c,$^) -o $@ && $(LLVM_STRIP) -g $@ -c $(filter util/bpf_skel/%.bpf.c,$^) -o $@ && $(LLVM_STRIP) -g $@
$(SKEL_OUT)/%.skel.h: $(SKEL_TMP_OUT)/%.bpf.o | $(BPFTOOL) $(SKEL_OUT)/%.skel.h: $(SKEL_TMP_OUT)/%.bpf.o | $(BPFTOOL)
......
...@@ -78,9 +78,9 @@ static int cs_etm_validate_context_id(struct auxtrace_record *itr, ...@@ -78,9 +78,9 @@ static int cs_etm_validate_context_id(struct auxtrace_record *itr,
char path[PATH_MAX]; char path[PATH_MAX];
int err; int err;
u32 val; u32 val;
u64 contextid = u64 contextid = evsel->core.attr.config &
evsel->core.attr.config & (perf_pmu__format_bits(&cs_etm_pmu->format, "contextid") |
(perf_pmu__format_bits(&cs_etm_pmu->format, "contextid1") | perf_pmu__format_bits(&cs_etm_pmu->format, "contextid1") |
perf_pmu__format_bits(&cs_etm_pmu->format, "contextid2")); perf_pmu__format_bits(&cs_etm_pmu->format, "contextid2"));
if (!contextid) if (!contextid)
...@@ -114,8 +114,7 @@ static int cs_etm_validate_context_id(struct auxtrace_record *itr, ...@@ -114,8 +114,7 @@ static int cs_etm_validate_context_id(struct auxtrace_record *itr,
* 0b00100 Maximum of 32-bit Context ID size. * 0b00100 Maximum of 32-bit Context ID size.
* All other values are reserved. * All other values are reserved.
*/ */
val = BMVAL(val, 5, 9); if (BMVAL(val, 5, 9) != 0x4) {
if (!val || val != 0x4) {
pr_err("%s: CONTEXTIDR_EL1 isn't supported, disable with %s/contextid1=0/\n", pr_err("%s: CONTEXTIDR_EL1 isn't supported, disable with %s/contextid1=0/\n",
CORESIGHT_ETM_PMU_NAME, CORESIGHT_ETM_PMU_NAME); CORESIGHT_ETM_PMU_NAME, CORESIGHT_ETM_PMU_NAME);
return -EINVAL; return -EINVAL;
......
...@@ -29,8 +29,8 @@ static int _get_cpuid(char *buf, size_t sz, struct perf_cpu_map *cpus) ...@@ -29,8 +29,8 @@ static int _get_cpuid(char *buf, size_t sz, struct perf_cpu_map *cpus)
char path[PATH_MAX]; char path[PATH_MAX];
FILE *file; FILE *file;
scnprintf(path, PATH_MAX, "%s/devices/system/cpu/cpu%d"MIDR, scnprintf(path, PATH_MAX, "%s/devices/system/cpu/cpu%d" MIDR,
sysfs, cpus->map[cpu]); sysfs, RC_CHK_ACCESS(cpus)->map[cpu].cpu);
file = fopen(path, "r"); file = fopen(path, "r");
if (!file) { if (!file) {
......
...@@ -18,7 +18,7 @@ static struct perf_pmu *pmu__find_core_pmu(void) ...@@ -18,7 +18,7 @@ static struct perf_pmu *pmu__find_core_pmu(void)
* The cpumap should cover all CPUs. Otherwise, some CPUs may * The cpumap should cover all CPUs. Otherwise, some CPUs may
* not support some events or have different event IDs. * not support some events or have different event IDs.
*/ */
if (pmu->cpus->nr != cpu__max_cpu().cpu) if (RC_CHK_ACCESS(pmu->cpus)->nr != cpu__max_cpu().cpu)
return NULL; return NULL;
return pmu; return pmu;
......
...@@ -449,7 +449,7 @@ ...@@ -449,7 +449,7 @@
444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset 444 common landlock_create_ruleset sys_landlock_create_ruleset sys_landlock_create_ruleset
445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule 445 common landlock_add_rule sys_landlock_add_rule sys_landlock_add_rule
446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self 446 common landlock_restrict_self sys_landlock_restrict_self sys_landlock_restrict_self
# 447 reserved for memfd_secret 447 common memfd_secret sys_memfd_secret sys_memfd_secret
448 common process_mrelease sys_process_mrelease sys_process_mrelease 448 common process_mrelease sys_process_mrelease sys_process_mrelease
449 common futex_waitv sys_futex_waitv sys_futex_waitv 449 common futex_waitv sys_futex_waitv sys_futex_waitv
450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node 450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node
...@@ -7,7 +7,3 @@ MEMCPY_FN(memcpy_orig, ...@@ -7,7 +7,3 @@ MEMCPY_FN(memcpy_orig,
MEMCPY_FN(__memcpy, MEMCPY_FN(__memcpy,
"x86-64-movsq", "x86-64-movsq",
"movsq-based memcpy() in arch/x86/lib/memcpy_64.S") "movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
MEMCPY_FN(memcpy_erms,
"x86-64-movsb",
"movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
/* Various wrappers to make the kernel .S file build in user-space: */ /* Various wrappers to make the kernel .S file build in user-space: */
// memcpy_orig and memcpy_erms are being defined as SYM_L_LOCAL but we need it // memcpy_orig is being defined as SYM_L_LOCAL but we need it
#define SYM_FUNC_START_LOCAL(name) \ #define SYM_FUNC_START_LOCAL(name) \
SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)
#define memcpy MEMCPY /* don't hide glibc's memcpy() */ #define memcpy MEMCPY /* don't hide glibc's memcpy() */
......
...@@ -7,7 +7,3 @@ MEMSET_FN(memset_orig, ...@@ -7,7 +7,3 @@ MEMSET_FN(memset_orig,
MEMSET_FN(__memset, MEMSET_FN(__memset,
"x86-64-stosq", "x86-64-stosq",
"movsq-based memset() in arch/x86/lib/memset_64.S") "movsq-based memset() in arch/x86/lib/memset_64.S")
MEMSET_FN(memset_erms,
"x86-64-stosb",
"movsb-based memset() in arch/x86/lib/memset_64.S")
/* SPDX-License-Identifier: GPL-2.0 */ /* SPDX-License-Identifier: GPL-2.0 */
// memset_orig and memset_erms are being defined as SYM_L_LOCAL but we need it // memset_orig is being defined as SYM_L_LOCAL but we need it
#define SYM_FUNC_START_LOCAL(name) \ #define SYM_FUNC_START_LOCAL(name) \
SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN) SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)
#define memset MEMSET /* don't hide glibc's memset() */ #define memset MEMSET /* don't hide glibc's memset() */
......
...@@ -3647,6 +3647,13 @@ static int process_stat_config_event(struct perf_session *session __maybe_unused ...@@ -3647,6 +3647,13 @@ static int process_stat_config_event(struct perf_session *session __maybe_unused
union perf_event *event) union perf_event *event)
{ {
perf_event__read_stat_config(&stat_config, &event->stat_config); perf_event__read_stat_config(&stat_config, &event->stat_config);
/*
* Aggregation modes are not used since post-processing scripts are
* supposed to take care of such requirements
*/
stat_config.aggr_mode = AGGR_NONE;
return 0; return 0;
} }
......
...@@ -667,6 +667,13 @@ static enum counter_recovery stat_handle_error(struct evsel *counter) ...@@ -667,6 +667,13 @@ static enum counter_recovery stat_handle_error(struct evsel *counter)
evsel_list->core.threads->err_thread = -1; evsel_list->core.threads->err_thread = -1;
return COUNTER_RETRY; return COUNTER_RETRY;
} }
} else if (counter->skippable) {
if (verbose > 0)
ui__warning("skipping event %s that kernel failed to open .\n",
evsel__name(counter));
counter->supported = false;
counter->errored = true;
return COUNTER_SKIP;
} }
evsel__open_strerror(counter, &target, errno, msg, sizeof(msg)); evsel__open_strerror(counter, &target, errno, msg, sizeof(msg));
...@@ -1890,15 +1897,28 @@ static int add_default_attributes(void) ...@@ -1890,15 +1897,28 @@ static int add_default_attributes(void)
* caused by exposing latent bugs. This is fixed properly in: * caused by exposing latent bugs. This is fixed properly in:
* https://lore.kernel.org/lkml/bff481ba-e60a-763f-0aa0-3ee53302c480@linux.intel.com/ * https://lore.kernel.org/lkml/bff481ba-e60a-763f-0aa0-3ee53302c480@linux.intel.com/
*/ */
if (metricgroup__has_metric("TopdownL1") && !perf_pmu__has_hybrid() && if (metricgroup__has_metric("TopdownL1") && !perf_pmu__has_hybrid()) {
metricgroup__parse_groups(evsel_list, "TopdownL1", struct evlist *metric_evlist = evlist__new();
/*metric_no_group=*/false, struct evsel *metric_evsel;
/*metric_no_merge=*/false,
/*metric_no_threshold=*/true, if (!metric_evlist)
stat_config.user_requested_cpu_list, return -1;
stat_config.system_wide,
&stat_config.metric_events) < 0) if (metricgroup__parse_groups(metric_evlist, "TopdownL1",
return -1; /*metric_no_group=*/false,
/*metric_no_merge=*/false,
/*metric_no_threshold=*/true,
stat_config.user_requested_cpu_list,
stat_config.system_wide,
&stat_config.metric_events) < 0)
return -1;
evlist__for_each_entry(metric_evlist, metric_evsel) {
metric_evsel->skippable = true;
}
evlist__splice_list_tail(evsel_list, &metric_evlist->core.entries);
evlist__delete(metric_evlist);
}
/* Platform specific attrs */ /* Platform specific attrs */
if (evlist__add_default_attrs(evsel_list, default_null_attrs) < 0) if (evlist__add_default_attrs(evsel_list, default_null_attrs) < 0)
......
...@@ -98,6 +98,7 @@ ...@@ -98,6 +98,7 @@
"MetricGroup": "TopdownL1;tma_L1_group", "MetricGroup": "TopdownL1;tma_L1_group",
"MetricName": "tma_backend_bound", "MetricName": "tma_backend_bound",
"MetricThreshold": "tma_backend_bound > 0.1", "MetricThreshold": "tma_backend_bound > 0.1",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. The rest of these subevents count backend stalls, in cycles, due to an outstanding request which is memory bound vs core bound. The subevents are not slot based events and therefore can not be precisely added or subtracted from the Backend_Bound_Aux subevents which are slot based.", "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. The rest of these subevents count backend stalls, in cycles, due to an outstanding request which is memory bound vs core bound. The subevents are not slot based events and therefore can not be precisely added or subtracted from the Backend_Bound_Aux subevents which are slot based.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -107,6 +108,7 @@ ...@@ -107,6 +108,7 @@
"MetricGroup": "TopdownL1;tma_L1_group", "MetricGroup": "TopdownL1;tma_L1_group",
"MetricName": "tma_backend_bound_aux", "MetricName": "tma_backend_bound_aux",
"MetricThreshold": "tma_backend_bound_aux > 0.2", "MetricThreshold": "tma_backend_bound_aux > 0.2",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that UOPS must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. All of these subevents count backend stalls, in slots, due to a resource limitation. These are not cycle based events and therefore can not be precisely added or subtracted from the Backend_Bound subevents which are cycle based. These subevents are supplementary to Backend_Bound and can be used to analyze results from a resource perspective at allocation.", "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that UOPS must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count. All of these subevents count backend stalls, in slots, due to a resource limitation. These are not cycle based events and therefore can not be precisely added or subtracted from the Backend_Bound subevents which are cycle based. These subevents are supplementary to Backend_Bound and can be used to analyze results from a resource perspective at allocation.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -116,6 +118,7 @@ ...@@ -116,6 +118,7 @@
"MetricGroup": "TopdownL1;tma_L1_group", "MetricGroup": "TopdownL1;tma_L1_group",
"MetricName": "tma_bad_speculation", "MetricName": "tma_bad_speculation",
"MetricThreshold": "tma_bad_speculation > 0.15", "MetricThreshold": "tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the instruction queue (IQ). Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.", "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear. Only issue slots wasted due to fast nukes such as memory ordering nukes are counted. Other nukes are not accounted for. Counts all issue slots blocked during this recovery window including relevant microcode flows and while uops are not yet available in the instruction queue (IQ). Also includes the issue slots that were consumed by the backend but were thrown away because they were younger than the mispredict or machine clear.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -125,6 +128,7 @@ ...@@ -125,6 +128,7 @@
"MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_base", "MetricName": "tma_base",
"MetricThreshold": "tma_base > 0.6", "MetricThreshold": "tma_base > 0.6",
"MetricgroupNoGroup": "TopdownL2",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
{ {
...@@ -142,6 +146,7 @@ ...@@ -142,6 +146,7 @@
"MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group", "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group",
"MetricName": "tma_branch_mispredicts", "MetricName": "tma_branch_mispredicts",
"MetricThreshold": "tma_branch_mispredicts > 0.05", "MetricThreshold": "tma_branch_mispredicts > 0.05",
"MetricgroupNoGroup": "TopdownL2",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
{ {
...@@ -166,6 +171,7 @@ ...@@ -166,6 +171,7 @@
"MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_core_bound", "MetricName": "tma_core_bound",
"MetricThreshold": "tma_core_bound > 0.1", "MetricThreshold": "tma_core_bound > 0.1",
"MetricgroupNoGroup": "TopdownL2",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
{ {
...@@ -207,6 +213,7 @@ ...@@ -207,6 +213,7 @@
"MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group",
"MetricName": "tma_fetch_bandwidth", "MetricName": "tma_fetch_bandwidth",
"MetricThreshold": "tma_fetch_bandwidth > 0.1", "MetricThreshold": "tma_fetch_bandwidth > 0.1",
"MetricgroupNoGroup": "TopdownL2",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
{ {
...@@ -215,6 +222,7 @@ ...@@ -215,6 +222,7 @@
"MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricGroup": "TopdownL2;tma_L2_group;tma_frontend_bound_group",
"MetricName": "tma_fetch_latency", "MetricName": "tma_fetch_latency",
"MetricThreshold": "tma_fetch_latency > 0.15", "MetricThreshold": "tma_fetch_latency > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
{ {
...@@ -239,6 +247,7 @@ ...@@ -239,6 +247,7 @@
"MetricGroup": "TopdownL1;tma_L1_group", "MetricGroup": "TopdownL1;tma_L1_group",
"MetricName": "tma_frontend_bound", "MetricName": "tma_frontend_bound",
"MetricThreshold": "tma_frontend_bound > 0.2", "MetricThreshold": "tma_frontend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL1",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
{ {
...@@ -499,6 +508,7 @@ ...@@ -499,6 +508,7 @@
"MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group", "MetricGroup": "TopdownL2;tma_L2_group;tma_bad_speculation_group",
"MetricName": "tma_machine_clears", "MetricName": "tma_machine_clears",
"MetricThreshold": "tma_machine_clears > 0.05", "MetricThreshold": "tma_machine_clears > 0.05",
"MetricgroupNoGroup": "TopdownL2",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
{ {
...@@ -515,6 +525,7 @@ ...@@ -515,6 +525,7 @@
"MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_memory_bound", "MetricName": "tma_memory_bound",
"MetricThreshold": "tma_memory_bound > 0.2", "MetricThreshold": "tma_memory_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
{ {
...@@ -531,6 +542,7 @@ ...@@ -531,6 +542,7 @@
"MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_ms_uops", "MetricName": "tma_ms_uops",
"MetricThreshold": "tma_ms_uops > 0.05", "MetricThreshold": "tma_ms_uops > 0.05",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS). This includes uops from flows due to complex instructions, faults, assists, and inserted flows.", "PublicDescription": "Counts the number of uops that are from the complex flows issued by the micro-sequencer (MS). This includes uops from flows due to complex instructions, faults, assists, and inserted flows.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -620,6 +632,7 @@ ...@@ -620,6 +632,7 @@
"MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_aux_group", "MetricGroup": "TopdownL2;tma_L2_group;tma_backend_bound_aux_group",
"MetricName": "tma_resource_bound", "MetricName": "tma_resource_bound",
"MetricThreshold": "tma_resource_bound > 0.2", "MetricThreshold": "tma_resource_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count.", "PublicDescription": "Counts the total number of issue slots that were not consumed by the backend due to backend stalls. Note that uops must be available for consumption in order for this event to count. If a uop is not available (IQ is empty), this event will not count.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -629,6 +642,7 @@ ...@@ -629,6 +642,7 @@
"MetricGroup": "TopdownL1;tma_L1_group", "MetricGroup": "TopdownL1;tma_L1_group",
"MetricName": "tma_retiring", "MetricName": "tma_retiring",
"MetricThreshold": "tma_retiring > 0.75", "MetricThreshold": "tma_retiring > 0.75",
"MetricgroupNoGroup": "TopdownL1",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
{ {
......
...@@ -103,6 +103,7 @@ ...@@ -103,6 +103,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_backend_bound", "MetricName": "tma_backend_bound",
"MetricThreshold": "tma_backend_bound > 0.2", "MetricThreshold": "tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -112,6 +113,7 @@ ...@@ -112,6 +113,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_bad_speculation", "MetricName": "tma_bad_speculation",
"MetricThreshold": "tma_bad_speculation > 0.15", "MetricThreshold": "tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -122,6 +124,7 @@ ...@@ -122,6 +124,7 @@
"MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
"MetricName": "tma_branch_mispredicts", "MetricName": "tma_branch_mispredicts",
"MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -170,6 +173,7 @@ ...@@ -170,6 +173,7 @@
"MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_core_bound", "MetricName": "tma_core_bound",
"MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2", "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -263,6 +267,7 @@ ...@@ -263,6 +267,7 @@
"MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
"MetricName": "tma_fetch_bandwidth", "MetricName": "tma_fetch_bandwidth",
"MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -272,6 +277,7 @@ ...@@ -272,6 +277,7 @@
"MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
"MetricName": "tma_fetch_latency", "MetricName": "tma_fetch_latency",
"MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -326,6 +332,7 @@ ...@@ -326,6 +332,7 @@
"MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_frontend_bound", "MetricName": "tma_frontend_bound",
"MetricThreshold": "tma_frontend_bound > 0.15", "MetricThreshold": "tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -335,6 +342,7 @@ ...@@ -335,6 +342,7 @@
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_heavy_operations", "MetricName": "tma_heavy_operations",
"MetricThreshold": "tma_heavy_operations > 0.1", "MetricThreshold": "tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -828,6 +836,7 @@ ...@@ -828,6 +836,7 @@
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_light_operations", "MetricName": "tma_light_operations",
"MetricThreshold": "tma_light_operations > 0.6", "MetricThreshold": "tma_light_operations > 0.6",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -858,6 +867,7 @@ ...@@ -858,6 +867,7 @@
"MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn", "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
"MetricName": "tma_machine_clears", "MetricName": "tma_machine_clears",
"MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15", "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -886,6 +896,7 @@ ...@@ -886,6 +896,7 @@
"MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_memory_bound", "MetricName": "tma_memory_bound",
"MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -1048,6 +1059,7 @@ ...@@ -1048,6 +1059,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_retiring", "MetricName": "tma_retiring",
"MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
......
...@@ -103,6 +103,7 @@ ...@@ -103,6 +103,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_backend_bound", "MetricName": "tma_backend_bound",
"MetricThreshold": "tma_backend_bound > 0.2", "MetricThreshold": "tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -112,6 +113,7 @@ ...@@ -112,6 +113,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_bad_speculation", "MetricName": "tma_bad_speculation",
"MetricThreshold": "tma_bad_speculation > 0.15", "MetricThreshold": "tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -122,6 +124,7 @@ ...@@ -122,6 +124,7 @@
"MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
"MetricName": "tma_branch_mispredicts", "MetricName": "tma_branch_mispredicts",
"MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -170,6 +173,7 @@ ...@@ -170,6 +173,7 @@
"MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_core_bound", "MetricName": "tma_core_bound",
"MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2", "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -263,6 +267,7 @@ ...@@ -263,6 +267,7 @@
"MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
"MetricName": "tma_fetch_bandwidth", "MetricName": "tma_fetch_bandwidth",
"MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -272,6 +277,7 @@ ...@@ -272,6 +277,7 @@
"MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
"MetricName": "tma_fetch_latency", "MetricName": "tma_fetch_latency",
"MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -326,6 +332,7 @@ ...@@ -326,6 +332,7 @@
"MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_frontend_bound", "MetricName": "tma_frontend_bound",
"MetricThreshold": "tma_frontend_bound > 0.15", "MetricThreshold": "tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -335,6 +342,7 @@ ...@@ -335,6 +342,7 @@
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_heavy_operations", "MetricName": "tma_heavy_operations",
"MetricThreshold": "tma_heavy_operations > 0.1", "MetricThreshold": "tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -829,6 +837,7 @@ ...@@ -829,6 +837,7 @@
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_light_operations", "MetricName": "tma_light_operations",
"MetricThreshold": "tma_light_operations > 0.6", "MetricThreshold": "tma_light_operations > 0.6",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -869,6 +878,7 @@ ...@@ -869,6 +878,7 @@
"MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn", "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
"MetricName": "tma_machine_clears", "MetricName": "tma_machine_clears",
"MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15", "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -897,6 +907,7 @@ ...@@ -897,6 +907,7 @@
"MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_memory_bound", "MetricName": "tma_memory_bound",
"MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -1079,6 +1090,7 @@ ...@@ -1079,6 +1090,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_retiring", "MetricName": "tma_retiring",
"MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
......
...@@ -103,6 +103,7 @@ ...@@ -103,6 +103,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_backend_bound", "MetricName": "tma_backend_bound",
"MetricThreshold": "tma_backend_bound > 0.2", "MetricThreshold": "tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -112,6 +113,7 @@ ...@@ -112,6 +113,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_bad_speculation", "MetricName": "tma_bad_speculation",
"MetricThreshold": "tma_bad_speculation > 0.15", "MetricThreshold": "tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -122,6 +124,7 @@ ...@@ -122,6 +124,7 @@
"MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
"MetricName": "tma_branch_mispredicts", "MetricName": "tma_branch_mispredicts",
"MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -161,6 +164,7 @@ ...@@ -161,6 +164,7 @@
"MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_core_bound", "MetricName": "tma_core_bound",
"MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2", "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -254,6 +258,7 @@ ...@@ -254,6 +258,7 @@
"MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
"MetricName": "tma_fetch_bandwidth", "MetricName": "tma_fetch_bandwidth",
"MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -263,6 +268,7 @@ ...@@ -263,6 +268,7 @@
"MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
"MetricName": "tma_fetch_latency", "MetricName": "tma_fetch_latency",
"MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -272,6 +278,7 @@ ...@@ -272,6 +278,7 @@
"MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_frontend_bound", "MetricName": "tma_frontend_bound",
"MetricThreshold": "tma_frontend_bound > 0.15", "MetricThreshold": "tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -281,6 +288,7 @@ ...@@ -281,6 +288,7 @@
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_heavy_operations", "MetricName": "tma_heavy_operations",
"MetricThreshold": "tma_heavy_operations > 0.1", "MetricThreshold": "tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -663,6 +671,7 @@ ...@@ -663,6 +671,7 @@
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_light_operations", "MetricName": "tma_light_operations",
"MetricThreshold": "tma_light_operations > 0.6", "MetricThreshold": "tma_light_operations > 0.6",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -693,6 +702,7 @@ ...@@ -693,6 +702,7 @@
"MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn", "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
"MetricName": "tma_machine_clears", "MetricName": "tma_machine_clears",
"MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15", "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -721,6 +731,7 @@ ...@@ -721,6 +731,7 @@
"MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_memory_bound", "MetricName": "tma_memory_bound",
"MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -874,6 +885,7 @@ ...@@ -874,6 +885,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_retiring", "MetricName": "tma_retiring",
"MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
......
...@@ -103,6 +103,7 @@ ...@@ -103,6 +103,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_backend_bound", "MetricName": "tma_backend_bound",
"MetricThreshold": "tma_backend_bound > 0.2", "MetricThreshold": "tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -112,6 +113,7 @@ ...@@ -112,6 +113,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_bad_speculation", "MetricName": "tma_bad_speculation",
"MetricThreshold": "tma_bad_speculation > 0.15", "MetricThreshold": "tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -122,6 +124,7 @@ ...@@ -122,6 +124,7 @@
"MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
"MetricName": "tma_branch_mispredicts", "MetricName": "tma_branch_mispredicts",
"MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -161,6 +164,7 @@ ...@@ -161,6 +164,7 @@
"MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_core_bound", "MetricName": "tma_core_bound",
"MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2", "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -254,6 +258,7 @@ ...@@ -254,6 +258,7 @@
"MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
"MetricName": "tma_fetch_bandwidth", "MetricName": "tma_fetch_bandwidth",
"MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -263,6 +268,7 @@ ...@@ -263,6 +268,7 @@
"MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
"MetricName": "tma_fetch_latency", "MetricName": "tma_fetch_latency",
"MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -272,6 +278,7 @@ ...@@ -272,6 +278,7 @@
"MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_frontend_bound", "MetricName": "tma_frontend_bound",
"MetricThreshold": "tma_frontend_bound > 0.15", "MetricThreshold": "tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -281,6 +288,7 @@ ...@@ -281,6 +288,7 @@
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_heavy_operations", "MetricName": "tma_heavy_operations",
"MetricThreshold": "tma_heavy_operations > 0.1", "MetricThreshold": "tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -664,6 +672,7 @@ ...@@ -664,6 +672,7 @@
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_light_operations", "MetricName": "tma_light_operations",
"MetricThreshold": "tma_light_operations > 0.6", "MetricThreshold": "tma_light_operations > 0.6",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -704,6 +713,7 @@ ...@@ -704,6 +713,7 @@
"MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn", "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
"MetricName": "tma_machine_clears", "MetricName": "tma_machine_clears",
"MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15", "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -732,6 +742,7 @@ ...@@ -732,6 +742,7 @@
"MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_memory_bound", "MetricName": "tma_memory_bound",
"MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -905,6 +916,7 @@ ...@@ -905,6 +916,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_retiring", "MetricName": "tma_retiring",
"MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
......
...@@ -103,6 +103,7 @@ ...@@ -103,6 +103,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_backend_bound", "MetricName": "tma_backend_bound",
"MetricThreshold": "tma_backend_bound > 0.2", "MetricThreshold": "tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -112,6 +113,7 @@ ...@@ -112,6 +113,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_bad_speculation", "MetricName": "tma_bad_speculation",
"MetricThreshold": "tma_bad_speculation > 0.15", "MetricThreshold": "tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -122,6 +124,7 @@ ...@@ -122,6 +124,7 @@
"MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
"MetricName": "tma_branch_mispredicts", "MetricName": "tma_branch_mispredicts",
"MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -161,6 +164,7 @@ ...@@ -161,6 +164,7 @@
"MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_core_bound", "MetricName": "tma_core_bound",
"MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2", "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -254,6 +258,7 @@ ...@@ -254,6 +258,7 @@
"MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
"MetricName": "tma_fetch_bandwidth", "MetricName": "tma_fetch_bandwidth",
"MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -263,6 +268,7 @@ ...@@ -263,6 +268,7 @@
"MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
"MetricName": "tma_fetch_latency", "MetricName": "tma_fetch_latency",
"MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -299,6 +305,7 @@ ...@@ -299,6 +305,7 @@
"MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_frontend_bound", "MetricName": "tma_frontend_bound",
"MetricThreshold": "tma_frontend_bound > 0.15", "MetricThreshold": "tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -308,6 +315,7 @@ ...@@ -308,6 +315,7 @@
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_heavy_operations", "MetricName": "tma_heavy_operations",
"MetricThreshold": "tma_heavy_operations > 0.1", "MetricThreshold": "tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -724,6 +732,7 @@ ...@@ -724,6 +732,7 @@
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_light_operations", "MetricName": "tma_light_operations",
"MetricThreshold": "tma_light_operations > 0.6", "MetricThreshold": "tma_light_operations > 0.6",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -754,6 +763,7 @@ ...@@ -754,6 +763,7 @@
"MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn", "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
"MetricName": "tma_machine_clears", "MetricName": "tma_machine_clears",
"MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15", "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -782,6 +792,7 @@ ...@@ -782,6 +792,7 @@
"MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_memory_bound", "MetricName": "tma_memory_bound",
"MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -917,6 +928,7 @@ ...@@ -917,6 +928,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_retiring", "MetricName": "tma_retiring",
"MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
......
...@@ -103,6 +103,7 @@ ...@@ -103,6 +103,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_backend_bound", "MetricName": "tma_backend_bound",
"MetricThreshold": "tma_backend_bound > 0.2", "MetricThreshold": "tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -112,6 +113,7 @@ ...@@ -112,6 +113,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_bad_speculation", "MetricName": "tma_bad_speculation",
"MetricThreshold": "tma_bad_speculation > 0.15", "MetricThreshold": "tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -122,6 +124,7 @@ ...@@ -122,6 +124,7 @@
"MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
"MetricName": "tma_branch_mispredicts", "MetricName": "tma_branch_mispredicts",
"MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -161,6 +164,7 @@ ...@@ -161,6 +164,7 @@
"MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_core_bound", "MetricName": "tma_core_bound",
"MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2", "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -254,6 +258,7 @@ ...@@ -254,6 +258,7 @@
"MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
"MetricName": "tma_fetch_bandwidth", "MetricName": "tma_fetch_bandwidth",
"MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_info_iptb, tma_lcp",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -263,6 +268,7 @@ ...@@ -263,6 +268,7 @@
"MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
"MetricName": "tma_fetch_latency", "MetricName": "tma_fetch_latency",
"MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -299,6 +305,7 @@ ...@@ -299,6 +305,7 @@
"MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_frontend_bound", "MetricName": "tma_frontend_bound",
"MetricThreshold": "tma_frontend_bound > 0.15", "MetricThreshold": "tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -308,6 +315,7 @@ ...@@ -308,6 +315,7 @@
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_heavy_operations", "MetricName": "tma_heavy_operations",
"MetricThreshold": "tma_heavy_operations > 0.1", "MetricThreshold": "tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -725,6 +733,7 @@ ...@@ -725,6 +733,7 @@
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_light_operations", "MetricName": "tma_light_operations",
"MetricThreshold": "tma_light_operations > 0.6", "MetricThreshold": "tma_light_operations > 0.6",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -765,6 +774,7 @@ ...@@ -765,6 +774,7 @@
"MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn", "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
"MetricName": "tma_machine_clears", "MetricName": "tma_machine_clears",
"MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15", "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_contested_accesses, tma_data_sharing, tma_false_sharing, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -793,6 +803,7 @@ ...@@ -793,6 +803,7 @@
"MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_memory_bound", "MetricName": "tma_memory_bound",
"MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -948,6 +959,7 @@ ...@@ -948,6 +959,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_retiring", "MetricName": "tma_retiring",
"MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
......
...@@ -76,6 +76,7 @@ ...@@ -76,6 +76,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_backend_bound", "MetricName": "tma_backend_bound",
"MetricThreshold": "tma_backend_bound > 0.2", "MetricThreshold": "tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -85,6 +86,7 @@ ...@@ -85,6 +86,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_bad_speculation", "MetricName": "tma_bad_speculation",
"MetricThreshold": "tma_bad_speculation > 0.15", "MetricThreshold": "tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -95,6 +97,7 @@ ...@@ -95,6 +97,7 @@
"MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
"MetricName": "tma_branch_mispredicts", "MetricName": "tma_branch_mispredicts",
"MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -114,6 +117,7 @@ ...@@ -114,6 +117,7 @@
"MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_core_bound", "MetricName": "tma_core_bound",
"MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2", "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -160,6 +164,7 @@ ...@@ -160,6 +164,7 @@
"MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
"MetricName": "tma_fetch_bandwidth", "MetricName": "tma_fetch_bandwidth",
"MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_lcp", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_lcp",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -169,6 +174,7 @@ ...@@ -169,6 +174,7 @@
"MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
"MetricName": "tma_fetch_latency", "MetricName": "tma_fetch_latency",
"MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -205,6 +211,7 @@ ...@@ -205,6 +211,7 @@
"MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_frontend_bound", "MetricName": "tma_frontend_bound",
"MetricThreshold": "tma_frontend_bound > 0.15", "MetricThreshold": "tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -214,6 +221,7 @@ ...@@ -214,6 +221,7 @@
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_heavy_operations", "MetricName": "tma_heavy_operations",
"MetricThreshold": "tma_heavy_operations > 0.1", "MetricThreshold": "tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -412,6 +420,7 @@ ...@@ -412,6 +420,7 @@
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_light_operations", "MetricName": "tma_light_operations",
"MetricThreshold": "tma_light_operations > 0.6", "MetricThreshold": "tma_light_operations > 0.6",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -422,6 +431,7 @@ ...@@ -422,6 +431,7 @@
"MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn", "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
"MetricName": "tma_machine_clears", "MetricName": "tma_machine_clears",
"MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15", "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -450,6 +460,7 @@ ...@@ -450,6 +460,7 @@
"MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_memory_bound", "MetricName": "tma_memory_bound",
"MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -487,6 +498,7 @@ ...@@ -487,6 +498,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_retiring", "MetricName": "tma_retiring",
"MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
......
...@@ -76,6 +76,7 @@ ...@@ -76,6 +76,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_backend_bound", "MetricName": "tma_backend_bound",
"MetricThreshold": "tma_backend_bound > 0.2", "MetricThreshold": "tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.", "PublicDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend. Backend is the portion of the processor core where the out-of-order scheduler dispatches ready uops into their respective execution units; and once completed these uops get retired according to program order. For example; stalls due to data-cache misses or stalls due to the divider unit being overloaded are both categorized under Backend Bound. Backend Bound is further divided into two main categories: Memory Bound and Core Bound.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -85,6 +86,7 @@ ...@@ -85,6 +86,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_bad_speculation", "MetricName": "tma_bad_speculation",
"MetricThreshold": "tma_bad_speculation > 0.15", "MetricThreshold": "tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.", "PublicDescription": "This category represents fraction of slots wasted due to incorrect speculations. This include slots used to issue uops that do not eventually get retired and slots for which the issue-pipeline was blocked due to recovery from earlier incorrect speculation. For example; wasted work due to miss-predicted branches are categorized under Bad Speculation category. Incorrect data speculation followed by Memory Ordering Nukes is another example.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -95,6 +97,7 @@ ...@@ -95,6 +97,7 @@
"MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricGroup": "BadSpec;BrMispredicts;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM",
"MetricName": "tma_branch_mispredicts", "MetricName": "tma_branch_mispredicts",
"MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction. These slots are either wasted by uops fetched from an incorrectly speculated program path; or stalls when the out-of-order part of the machine needs to recover its state from a speculative path. Sample with: BR_MISP_RETIRED.ALL_BRANCHES. Related metrics: tma_info_branch_misprediction_cost, tma_mispredicts_resteers",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -114,6 +117,7 @@ ...@@ -114,6 +117,7 @@
"MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "Backend;Compute;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_core_bound", "MetricName": "tma_core_bound",
"MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2", "MetricThreshold": "tma_core_bound > 0.1 & tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).", "PublicDescription": "This metric represents fraction of slots where Core non-memory issues were of a bottleneck. Shortage in hardware compute resources; or dependencies in software's instructions are both categorized under Core Bound. Hence it may indicate the machine ran out of an out-of-order resource; certain execution units are overloaded or dependencies in program's data- or instruction-flow are limiting the performance (e.g. FP-chained long-latency arithmetic operations).",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -160,6 +164,7 @@ ...@@ -160,6 +164,7 @@
"MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB", "MetricGroup": "FetchBW;Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group;tma_issueFB",
"MetricName": "tma_fetch_bandwidth", "MetricName": "tma_fetch_bandwidth",
"MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35", "MetricThreshold": "tma_fetch_bandwidth > 0.1 & tma_frontend_bound > 0.15 & tma_info_ipc / 4 > 0.35",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_lcp", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend bandwidth issues. For example; inefficiencies at the instruction decoders; or restrictions for caching in the DSB (decoded uops cache) are categorized under Fetch Bandwidth. In such cases; the Frontend typically delivers suboptimal amount of uops to the Backend. Related metrics: tma_dsb_switches, tma_info_dsb_coverage, tma_lcp",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -169,6 +174,7 @@ ...@@ -169,6 +174,7 @@
"MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group",
"MetricName": "tma_fetch_latency", "MetricName": "tma_fetch_latency",
"MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END", "PublicDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues. For example; instruction-cache misses; iTLB misses or fetch stalls after a branch misprediction are categorized under Frontend Latency. In such cases; the Frontend eventually delivers no uops for some period. Sample with: RS_EVENTS.EMPTY_END",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -205,6 +211,7 @@ ...@@ -205,6 +211,7 @@
"MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "PGO;TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_frontend_bound", "MetricName": "tma_frontend_bound",
"MetricThreshold": "tma_frontend_bound > 0.15", "MetricThreshold": "tma_frontend_bound > 0.15",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.", "PublicDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend. Frontend denotes the first part of the processor core responsible to fetch operations that are executed later on by the Backend part. Within the Frontend; a branch predictor predicts the next address to fetch; cache-lines are fetched from the memory subsystem; parsed into instructions; and lastly decoded into micro-operations (uops). Ideally the Frontend can issue Pipeline_Width uops every cycle to the Backend. Frontend Bound denotes unutilized issue-slots when there is no Backend stall; i.e. bubbles where Frontend delivered no uops while Backend could have accepted them. For example; stalls due to instruction-cache misses would be categorized under Frontend Bound.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -214,6 +221,7 @@ ...@@ -214,6 +221,7 @@
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_heavy_operations", "MetricName": "tma_heavy_operations",
"MetricThreshold": "tma_heavy_operations > 0.1", "MetricThreshold": "tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences. This highly-correlates with the uop length of these instructions/sequences.",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -411,6 +419,7 @@ ...@@ -411,6 +419,7 @@
"MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group",
"MetricName": "tma_light_operations", "MetricName": "tma_light_operations",
"MetricThreshold": "tma_light_operations > 0.6", "MetricThreshold": "tma_light_operations > 0.6",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST", "PublicDescription": "This metric represents fraction of slots where the CPU was retiring light-weight operations -- instructions that require no more than one uop (micro-operation). This correlates with total number of instructions used by the program. A uops-per-instruction (see UopPI metric) ratio of 1 or less should be expected for decently optimized software running on Intel Core/Xeon products. While this often indicates efficient X86 instructions were executed; high value does not necessarily mean better performance cannot be achieved. Sample with: INST_RETIRED.PREC_DIST",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -421,6 +430,7 @@ ...@@ -421,6 +430,7 @@
"MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn", "MetricGroup": "BadSpec;MachineClears;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueMC;tma_issueSyncxn",
"MetricName": "tma_machine_clears", "MetricName": "tma_machine_clears",
"MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15", "MetricThreshold": "tma_machine_clears > 0.1 & tma_bad_speculation > 0.15",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache", "PublicDescription": "This metric represents fraction of slots the CPU has wasted due to Machine Clears. These slots are either wasted by uops fetched prior to the clear; or stalls the out-of-order portion of the machine needs to recover its state after the clear. For example; this can happen due to memory ordering Nukes (e.g. Memory Disambiguation) or Self-Modifying-Code (SMC) nukes. Sample with: MACHINE_CLEARS.COUNT. Related metrics: tma_clears_resteers, tma_l1_bound, tma_microcode_sequencer, tma_ms_switches, tma_remote_cache",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -449,6 +459,7 @@ ...@@ -449,6 +459,7 @@
"MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group",
"MetricName": "tma_memory_bound", "MetricName": "tma_memory_bound",
"MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2",
"MetricgroupNoGroup": "TopdownL2",
"PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).", "PublicDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck. Memory Bound estimates fraction of slots where pipeline is likely stalled due to demand load or store instructions. This accounts mainly for (1) non-completed in-flight memory demand loads which coincides with execution units starvation; in addition to (2) cases where stores could impose backpressure on the pipeline when many of them get buffered at the same time (less common out of the two).",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
...@@ -486,6 +497,7 @@ ...@@ -486,6 +497,7 @@
"MetricGroup": "TmaL1;TopdownL1;tma_L1_group", "MetricGroup": "TmaL1;TopdownL1;tma_L1_group",
"MetricName": "tma_retiring", "MetricName": "tma_retiring",
"MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1",
"MetricgroupNoGroup": "TopdownL1",
"PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS", "PublicDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired. Ideally; all pipeline slots would be attributed to the Retiring category. Retiring of 100% would indicate the maximum Pipeline_Width throughput was achieved. Maximizing Retiring typically increases the Instructions-per-cycle (see IPC metric). Note that a high Retiring value does not necessary mean there is no room for more performance. For example; Heavy-operations or Microcode Assists are categorized under Retiring. They often indicate suboptimal performance and can often be optimized or avoided. Sample with: UOPS_RETIRED.RETIRE_SLOTS",
"ScaleUnit": "100%" "ScaleUnit": "100%"
}, },
......
...@@ -52,7 +52,8 @@ _json_event_attributes = [ ...@@ -52,7 +52,8 @@ _json_event_attributes = [
# Attributes that are in pmu_metric rather than pmu_event. # Attributes that are in pmu_metric rather than pmu_event.
_json_metric_attributes = [ _json_metric_attributes = [
'metric_name', 'metric_group', 'metric_expr', 'metric_threshold', 'desc', 'metric_name', 'metric_group', 'metric_expr', 'metric_threshold', 'desc',
'long_desc', 'unit', 'compat', 'aggr_mode', 'event_grouping' 'long_desc', 'unit', 'compat', 'metricgroup_no_group', 'aggr_mode',
'event_grouping'
] ]
# Attributes that are bools or enum int values, encoded as '0', '1',... # Attributes that are bools or enum int values, encoded as '0', '1',...
_json_enum_attributes = ['aggr_mode', 'deprecated', 'event_grouping', 'perpkg'] _json_enum_attributes = ['aggr_mode', 'deprecated', 'event_grouping', 'perpkg']
...@@ -303,6 +304,7 @@ class JsonEvent: ...@@ -303,6 +304,7 @@ class JsonEvent:
self.deprecated = jd.get('Deprecated') self.deprecated = jd.get('Deprecated')
self.metric_name = jd.get('MetricName') self.metric_name = jd.get('MetricName')
self.metric_group = jd.get('MetricGroup') self.metric_group = jd.get('MetricGroup')
self.metricgroup_no_group = jd.get('MetricgroupNoGroup')
self.event_grouping = convert_metric_constraint(jd.get('MetricConstraint')) self.event_grouping = convert_metric_constraint(jd.get('MetricConstraint'))
self.metric_expr = None self.metric_expr = None
if 'MetricExpr' in jd: if 'MetricExpr' in jd:
......
...@@ -59,6 +59,7 @@ struct pmu_metric { ...@@ -59,6 +59,7 @@ struct pmu_metric {
const char *compat; const char *compat;
const char *desc; const char *desc;
const char *long_desc; const char *long_desc;
const char *metricgroup_no_group;
enum aggr_mode_class aggr_mode; enum aggr_mode_class aggr_mode;
enum metric_event_groups event_grouping; enum metric_event_groups event_grouping;
}; };
......
...@@ -152,7 +152,7 @@ def parse_version(version): ...@@ -152,7 +152,7 @@ def parse_version(version):
# - expected values assignments # - expected values assignments
class Test(object): class Test(object):
def __init__(self, path, options): def __init__(self, path, options):
parser = configparser.SafeConfigParser() parser = configparser.ConfigParser()
parser.read(path) parser.read(path)
log.warning("running '%s'" % path) log.warning("running '%s'" % path)
...@@ -247,7 +247,7 @@ class Test(object): ...@@ -247,7 +247,7 @@ class Test(object):
return True return True
def load_events(self, path, events): def load_events(self, path, events):
parser_event = configparser.SafeConfigParser() parser_event = configparser.ConfigParser()
parser_event.read(path) parser_event.read(path)
# The event record section header contains 'event' word, # The event record section header contains 'event' word,
...@@ -261,7 +261,7 @@ class Test(object): ...@@ -261,7 +261,7 @@ class Test(object):
# Read parent event if there's any # Read parent event if there's any
if (':' in section): if (':' in section):
base = section[section.index(':') + 1:] base = section[section.index(':') + 1:]
parser_base = configparser.SafeConfigParser() parser_base = configparser.ConfigParser()
parser_base.read(self.test_dir + '/' + base) parser_base.read(self.test_dir + '/' + base)
base_items = parser_base.items('event') base_items = parser_base.items('event')
......
...@@ -16,7 +16,7 @@ pinned=0 ...@@ -16,7 +16,7 @@ pinned=0
exclusive=0 exclusive=0
exclude_user=0 exclude_user=0
exclude_kernel=0|1 exclude_kernel=0|1
exclude_hv=0 exclude_hv=0|1
exclude_idle=0 exclude_idle=0
mmap=0 mmap=0
comm=0 comm=0
......
...@@ -40,7 +40,6 @@ fd=6 ...@@ -40,7 +40,6 @@ fd=6
type=0 type=0
config=7 config=7
optional=1 optional=1
# PERF_TYPE_HARDWARE / PERF_COUNT_HW_STALLED_CYCLES_BACKEND # PERF_TYPE_HARDWARE / PERF_COUNT_HW_STALLED_CYCLES_BACKEND
[event7:base-stat] [event7:base-stat]
fd=7 fd=7
...@@ -89,79 +88,98 @@ enable_on_exec=0 ...@@ -89,79 +88,98 @@ enable_on_exec=0
read_format=15 read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-bad-spec (0x8100) # PERF_TYPE_RAW / topdown-fe-bound (0x8200)
[event13:base-stat] [event13:base-stat]
fd=13 fd=13
group_fd=11 group_fd=11
type=4 type=4
config=33024 config=33280
disabled=0 disabled=0
enable_on_exec=0 enable_on_exec=0
read_format=15 read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-fe-bound (0x8200) # PERF_TYPE_RAW / topdown-be-bound (0x8300)
[event14:base-stat] [event14:base-stat]
fd=14 fd=14
group_fd=11 group_fd=11
type=4 type=4
config=33280 config=33536
disabled=0 disabled=0
enable_on_exec=0 enable_on_exec=0
read_format=15 read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-be-bound (0x8300) # PERF_TYPE_RAW / topdown-bad-spec (0x8100)
[event15:base-stat] [event15:base-stat]
fd=15 fd=15
group_fd=11 group_fd=11
type=4 type=4
config=33536 config=33024
disabled=0 disabled=0
enable_on_exec=0 enable_on_exec=0
read_format=15 read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-heavy-ops (0x8400) # PERF_TYPE_RAW / INT_MISC.UOP_DROPPING
[event16:base-stat] [event16:base-stat]
fd=16 fd=16
group_fd=11
type=4 type=4
config=33792 config=4109
disabled=0
enable_on_exec=0
read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-br-mispredict (0x8500) # PERF_TYPE_RAW / cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/
[event17:base-stat] [event17:base-stat]
fd=17 fd=17
group_fd=11
type=4 type=4
config=34048 config=17039629
disabled=0
enable_on_exec=0
read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-fetch-lat (0x8600) # PERF_TYPE_RAW / CPU_CLK_UNHALTED.THREAD
[event18:base-stat] [event18:base-stat]
fd=18 fd=18
group_fd=11
type=4 type=4
config=34304 config=60
disabled=0
enable_on_exec=0
read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-mem-bound (0x8700) # PERF_TYPE_RAW / INT_MISC.RECOVERY_CYCLES_ANY
[event19:base-stat] [event19:base-stat]
fd=19 fd=19
group_fd=11
type=4 type=4
config=34560 config=2097421
disabled=0 optional=1
enable_on_exec=0
read_format=15 # PERF_TYPE_RAW / CPU_CLK_UNHALTED.REF_XCLK
[event20:base-stat]
fd=20
type=4
config=316
optional=1
# PERF_TYPE_RAW / IDQ_UOPS_NOT_DELIVERED.CORE
[event21:base-stat]
fd=21
type=4
config=412
optional=1
# PERF_TYPE_RAW / CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE
[event22:base-stat]
fd=22
type=4
config=572
optional=1
# PERF_TYPE_RAW / UOPS_RETIRED.RETIRE_SLOTS
[event23:base-stat]
fd=23
type=4
config=706
optional=1
# PERF_TYPE_RAW / UOPS_ISSUED.ANY
[event24:base-stat]
fd=24
type=4
config=270
optional=1 optional=1
...@@ -90,89 +90,108 @@ enable_on_exec=0 ...@@ -90,89 +90,108 @@ enable_on_exec=0
read_format=15 read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-bad-spec (0x8100) # PERF_TYPE_RAW / topdown-fe-bound (0x8200)
[event13:base-stat] [event13:base-stat]
fd=13 fd=13
group_fd=11 group_fd=11
type=4 type=4
config=33024 config=33280
disabled=0 disabled=0
enable_on_exec=0 enable_on_exec=0
read_format=15 read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-fe-bound (0x8200) # PERF_TYPE_RAW / topdown-be-bound (0x8300)
[event14:base-stat] [event14:base-stat]
fd=14 fd=14
group_fd=11 group_fd=11
type=4 type=4
config=33280 config=33536
disabled=0 disabled=0
enable_on_exec=0 enable_on_exec=0
read_format=15 read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-be-bound (0x8300) # PERF_TYPE_RAW / topdown-bad-spec (0x8100)
[event15:base-stat] [event15:base-stat]
fd=15 fd=15
group_fd=11 group_fd=11
type=4 type=4
config=33536 config=33024
disabled=0 disabled=0
enable_on_exec=0 enable_on_exec=0
read_format=15 read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-heavy-ops (0x8400) # PERF_TYPE_RAW / INT_MISC.UOP_DROPPING
[event16:base-stat] [event16:base-stat]
fd=16 fd=16
group_fd=11
type=4 type=4
config=33792 config=4109
disabled=0
enable_on_exec=0
read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-br-mispredict (0x8500) # PERF_TYPE_RAW / cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/
[event17:base-stat] [event17:base-stat]
fd=17 fd=17
group_fd=11
type=4 type=4
config=34048 config=17039629
disabled=0
enable_on_exec=0
read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-fetch-lat (0x8600) # PERF_TYPE_RAW / CPU_CLK_UNHALTED.THREAD
[event18:base-stat] [event18:base-stat]
fd=18 fd=18
group_fd=11
type=4 type=4
config=34304 config=60
disabled=0
enable_on_exec=0
read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-mem-bound (0x8700) # PERF_TYPE_RAW / INT_MISC.RECOVERY_CYCLES_ANY
[event19:base-stat] [event19:base-stat]
fd=19 fd=19
group_fd=11
type=4 type=4
config=34560 config=2097421
disabled=0 optional=1
enable_on_exec=0
read_format=15 # PERF_TYPE_RAW / CPU_CLK_UNHALTED.REF_XCLK
[event20:base-stat]
fd=20
type=4
config=316
optional=1
# PERF_TYPE_RAW / IDQ_UOPS_NOT_DELIVERED.CORE
[event21:base-stat]
fd=21
type=4
config=412
optional=1
# PERF_TYPE_RAW / CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE
[event22:base-stat]
fd=22
type=4
config=572
optional=1
# PERF_TYPE_RAW / UOPS_RETIRED.RETIRE_SLOTS
[event23:base-stat]
fd=23
type=4
config=706
optional=1
# PERF_TYPE_RAW / UOPS_ISSUED.ANY
[event24:base-stat]
fd=24
type=4
config=270
optional=1 optional=1
# PERF_TYPE_HW_CACHE / # PERF_TYPE_HW_CACHE /
# PERF_COUNT_HW_CACHE_L1D << 0 | # PERF_COUNT_HW_CACHE_L1D << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
[event20:base-stat] [event25:base-stat]
fd=20 fd=25
type=3 type=3
config=0 config=0
optional=1 optional=1
...@@ -181,8 +200,8 @@ optional=1 ...@@ -181,8 +200,8 @@ optional=1
# PERF_COUNT_HW_CACHE_L1D << 0 | # PERF_COUNT_HW_CACHE_L1D << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
[event21:base-stat] [event26:base-stat]
fd=21 fd=26
type=3 type=3
config=65536 config=65536
optional=1 optional=1
...@@ -191,8 +210,8 @@ optional=1 ...@@ -191,8 +210,8 @@ optional=1
# PERF_COUNT_HW_CACHE_LL << 0 | # PERF_COUNT_HW_CACHE_LL << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
[event22:base-stat] [event27:base-stat]
fd=22 fd=27
type=3 type=3
config=2 config=2
optional=1 optional=1
...@@ -201,8 +220,8 @@ optional=1 ...@@ -201,8 +220,8 @@ optional=1
# PERF_COUNT_HW_CACHE_LL << 0 | # PERF_COUNT_HW_CACHE_LL << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
[event23:base-stat] [event28:base-stat]
fd=23 fd=28
type=3 type=3
config=65538 config=65538
optional=1 optional=1
...@@ -90,89 +90,108 @@ enable_on_exec=0 ...@@ -90,89 +90,108 @@ enable_on_exec=0
read_format=15 read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-bad-spec (0x8100) # PERF_TYPE_RAW / topdown-fe-bound (0x8200)
[event13:base-stat] [event13:base-stat]
fd=13 fd=13
group_fd=11 group_fd=11
type=4 type=4
config=33024 config=33280
disabled=0 disabled=0
enable_on_exec=0 enable_on_exec=0
read_format=15 read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-fe-bound (0x8200) # PERF_TYPE_RAW / topdown-be-bound (0x8300)
[event14:base-stat] [event14:base-stat]
fd=14 fd=14
group_fd=11 group_fd=11
type=4 type=4
config=33280 config=33536
disabled=0 disabled=0
enable_on_exec=0 enable_on_exec=0
read_format=15 read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-be-bound (0x8300) # PERF_TYPE_RAW / topdown-bad-spec (0x8100)
[event15:base-stat] [event15:base-stat]
fd=15 fd=15
group_fd=11 group_fd=11
type=4 type=4
config=33536 config=33024
disabled=0 disabled=0
enable_on_exec=0 enable_on_exec=0
read_format=15 read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-heavy-ops (0x8400) # PERF_TYPE_RAW / INT_MISC.UOP_DROPPING
[event16:base-stat] [event16:base-stat]
fd=16 fd=16
group_fd=11
type=4 type=4
config=33792 config=4109
disabled=0
enable_on_exec=0
read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-br-mispredict (0x8500) # PERF_TYPE_RAW / cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/
[event17:base-stat] [event17:base-stat]
fd=17 fd=17
group_fd=11
type=4 type=4
config=34048 config=17039629
disabled=0
enable_on_exec=0
read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-fetch-lat (0x8600) # PERF_TYPE_RAW / CPU_CLK_UNHALTED.THREAD
[event18:base-stat] [event18:base-stat]
fd=18 fd=18
group_fd=11
type=4 type=4
config=34304 config=60
disabled=0
enable_on_exec=0
read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-mem-bound (0x8700) # PERF_TYPE_RAW / INT_MISC.RECOVERY_CYCLES_ANY
[event19:base-stat] [event19:base-stat]
fd=19 fd=19
group_fd=11
type=4 type=4
config=34560 config=2097421
disabled=0 optional=1
enable_on_exec=0
read_format=15 # PERF_TYPE_RAW / CPU_CLK_UNHALTED.REF_XCLK
[event20:base-stat]
fd=20
type=4
config=316
optional=1
# PERF_TYPE_RAW / IDQ_UOPS_NOT_DELIVERED.CORE
[event21:base-stat]
fd=21
type=4
config=412
optional=1
# PERF_TYPE_RAW / CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE
[event22:base-stat]
fd=22
type=4
config=572
optional=1
# PERF_TYPE_RAW / UOPS_RETIRED.RETIRE_SLOTS
[event23:base-stat]
fd=23
type=4
config=706
optional=1
# PERF_TYPE_RAW / UOPS_ISSUED.ANY
[event24:base-stat]
fd=24
type=4
config=270
optional=1 optional=1
# PERF_TYPE_HW_CACHE / # PERF_TYPE_HW_CACHE /
# PERF_COUNT_HW_CACHE_L1D << 0 | # PERF_COUNT_HW_CACHE_L1D << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
[event20:base-stat] [event25:base-stat]
fd=20 fd=25
type=3 type=3
config=0 config=0
optional=1 optional=1
...@@ -181,8 +200,8 @@ optional=1 ...@@ -181,8 +200,8 @@ optional=1
# PERF_COUNT_HW_CACHE_L1D << 0 | # PERF_COUNT_HW_CACHE_L1D << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
[event21:base-stat] [event26:base-stat]
fd=21 fd=26
type=3 type=3
config=65536 config=65536
optional=1 optional=1
...@@ -191,8 +210,8 @@ optional=1 ...@@ -191,8 +210,8 @@ optional=1
# PERF_COUNT_HW_CACHE_LL << 0 | # PERF_COUNT_HW_CACHE_LL << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
[event22:base-stat] [event27:base-stat]
fd=22 fd=27
type=3 type=3
config=2 config=2
optional=1 optional=1
...@@ -201,8 +220,8 @@ optional=1 ...@@ -201,8 +220,8 @@ optional=1
# PERF_COUNT_HW_CACHE_LL << 0 | # PERF_COUNT_HW_CACHE_LL << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
[event23:base-stat] [event28:base-stat]
fd=23 fd=28
type=3 type=3
config=65538 config=65538
optional=1 optional=1
...@@ -211,8 +230,8 @@ optional=1 ...@@ -211,8 +230,8 @@ optional=1
# PERF_COUNT_HW_CACHE_L1I << 0 | # PERF_COUNT_HW_CACHE_L1I << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
[event24:base-stat] [event29:base-stat]
fd=24 fd=29
type=3 type=3
config=1 config=1
optional=1 optional=1
...@@ -221,8 +240,8 @@ optional=1 ...@@ -221,8 +240,8 @@ optional=1
# PERF_COUNT_HW_CACHE_L1I << 0 | # PERF_COUNT_HW_CACHE_L1I << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
[event25:base-stat] [event30:base-stat]
fd=25 fd=30
type=3 type=3
config=65537 config=65537
optional=1 optional=1
...@@ -231,8 +250,8 @@ optional=1 ...@@ -231,8 +250,8 @@ optional=1
# PERF_COUNT_HW_CACHE_DTLB << 0 | # PERF_COUNT_HW_CACHE_DTLB << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
[event26:base-stat] [event31:base-stat]
fd=26 fd=31
type=3 type=3
config=3 config=3
optional=1 optional=1
...@@ -241,8 +260,8 @@ optional=1 ...@@ -241,8 +260,8 @@ optional=1
# PERF_COUNT_HW_CACHE_DTLB << 0 | # PERF_COUNT_HW_CACHE_DTLB << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
[event27:base-stat] [event32:base-stat]
fd=27 fd=32
type=3 type=3
config=65539 config=65539
optional=1 optional=1
...@@ -251,8 +270,8 @@ optional=1 ...@@ -251,8 +270,8 @@ optional=1
# PERF_COUNT_HW_CACHE_ITLB << 0 | # PERF_COUNT_HW_CACHE_ITLB << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
[event28:base-stat] [event33:base-stat]
fd=28 fd=33
type=3 type=3
config=4 config=4
optional=1 optional=1
...@@ -261,8 +280,8 @@ optional=1 ...@@ -261,8 +280,8 @@ optional=1
# PERF_COUNT_HW_CACHE_ITLB << 0 | # PERF_COUNT_HW_CACHE_ITLB << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
[event29:base-stat] [event34:base-stat]
fd=29 fd=34
type=3 type=3
config=65540 config=65540
optional=1 optional=1
...@@ -90,89 +90,108 @@ enable_on_exec=0 ...@@ -90,89 +90,108 @@ enable_on_exec=0
read_format=15 read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-bad-spec (0x8100) # PERF_TYPE_RAW / topdown-fe-bound (0x8200)
[event13:base-stat] [event13:base-stat]
fd=13 fd=13
group_fd=11 group_fd=11
type=4 type=4
config=33024 config=33280
disabled=0 disabled=0
enable_on_exec=0 enable_on_exec=0
read_format=15 read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-fe-bound (0x8200) # PERF_TYPE_RAW / topdown-be-bound (0x8300)
[event14:base-stat] [event14:base-stat]
fd=14 fd=14
group_fd=11 group_fd=11
type=4 type=4
config=33280 config=33536
disabled=0 disabled=0
enable_on_exec=0 enable_on_exec=0
read_format=15 read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-be-bound (0x8300) # PERF_TYPE_RAW / topdown-bad-spec (0x8100)
[event15:base-stat] [event15:base-stat]
fd=15 fd=15
group_fd=11 group_fd=11
type=4 type=4
config=33536 config=33024
disabled=0 disabled=0
enable_on_exec=0 enable_on_exec=0
read_format=15 read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-heavy-ops (0x8400) # PERF_TYPE_RAW / INT_MISC.UOP_DROPPING
[event16:base-stat] [event16:base-stat]
fd=16 fd=16
group_fd=11
type=4 type=4
config=33792 config=4109
disabled=0
enable_on_exec=0
read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-br-mispredict (0x8500) # PERF_TYPE_RAW / cpu/INT_MISC.RECOVERY_CYCLES,cmask=1,edge/
[event17:base-stat] [event17:base-stat]
fd=17 fd=17
group_fd=11
type=4 type=4
config=34048 config=17039629
disabled=0
enable_on_exec=0
read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-fetch-lat (0x8600) # PERF_TYPE_RAW / CPU_CLK_UNHALTED.THREAD
[event18:base-stat] [event18:base-stat]
fd=18 fd=18
group_fd=11
type=4 type=4
config=34304 config=60
disabled=0
enable_on_exec=0
read_format=15
optional=1 optional=1
# PERF_TYPE_RAW / topdown-mem-bound (0x8700) # PERF_TYPE_RAW / INT_MISC.RECOVERY_CYCLES_ANY
[event19:base-stat] [event19:base-stat]
fd=19 fd=19
group_fd=11
type=4 type=4
config=34560 config=2097421
disabled=0 optional=1
enable_on_exec=0
read_format=15 # PERF_TYPE_RAW / CPU_CLK_UNHALTED.REF_XCLK
[event20:base-stat]
fd=20
type=4
config=316
optional=1
# PERF_TYPE_RAW / IDQ_UOPS_NOT_DELIVERED.CORE
[event21:base-stat]
fd=21
type=4
config=412
optional=1
# PERF_TYPE_RAW / CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE
[event22:base-stat]
fd=22
type=4
config=572
optional=1
# PERF_TYPE_RAW / UOPS_RETIRED.RETIRE_SLOTS
[event23:base-stat]
fd=23
type=4
config=706
optional=1
# PERF_TYPE_RAW / UOPS_ISSUED.ANY
[event24:base-stat]
fd=24
type=4
config=270
optional=1 optional=1
# PERF_TYPE_HW_CACHE / # PERF_TYPE_HW_CACHE /
# PERF_COUNT_HW_CACHE_L1D << 0 | # PERF_COUNT_HW_CACHE_L1D << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
[event20:base-stat] [event25:base-stat]
fd=20 fd=25
type=3 type=3
config=0 config=0
optional=1 optional=1
...@@ -181,8 +200,8 @@ optional=1 ...@@ -181,8 +200,8 @@ optional=1
# PERF_COUNT_HW_CACHE_L1D << 0 | # PERF_COUNT_HW_CACHE_L1D << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
[event21:base-stat] [event26:base-stat]
fd=21 fd=26
type=3 type=3
config=65536 config=65536
optional=1 optional=1
...@@ -191,8 +210,8 @@ optional=1 ...@@ -191,8 +210,8 @@ optional=1
# PERF_COUNT_HW_CACHE_LL << 0 | # PERF_COUNT_HW_CACHE_LL << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
[event22:base-stat] [event27:base-stat]
fd=22 fd=27
type=3 type=3
config=2 config=2
optional=1 optional=1
...@@ -201,8 +220,8 @@ optional=1 ...@@ -201,8 +220,8 @@ optional=1
# PERF_COUNT_HW_CACHE_LL << 0 | # PERF_COUNT_HW_CACHE_LL << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
[event23:base-stat] [event28:base-stat]
fd=23 fd=28
type=3 type=3
config=65538 config=65538
optional=1 optional=1
...@@ -211,8 +230,8 @@ optional=1 ...@@ -211,8 +230,8 @@ optional=1
# PERF_COUNT_HW_CACHE_L1I << 0 | # PERF_COUNT_HW_CACHE_L1I << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
[event24:base-stat] [event29:base-stat]
fd=24 fd=29
type=3 type=3
config=1 config=1
optional=1 optional=1
...@@ -221,8 +240,8 @@ optional=1 ...@@ -221,8 +240,8 @@ optional=1
# PERF_COUNT_HW_CACHE_L1I << 0 | # PERF_COUNT_HW_CACHE_L1I << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
[event25:base-stat] [event30:base-stat]
fd=25 fd=30
type=3 type=3
config=65537 config=65537
optional=1 optional=1
...@@ -231,8 +250,8 @@ optional=1 ...@@ -231,8 +250,8 @@ optional=1
# PERF_COUNT_HW_CACHE_DTLB << 0 | # PERF_COUNT_HW_CACHE_DTLB << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
[event26:base-stat] [event31:base-stat]
fd=26 fd=31
type=3 type=3
config=3 config=3
optional=1 optional=1
...@@ -241,8 +260,8 @@ optional=1 ...@@ -241,8 +260,8 @@ optional=1
# PERF_COUNT_HW_CACHE_DTLB << 0 | # PERF_COUNT_HW_CACHE_DTLB << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
[event27:base-stat] [event32:base-stat]
fd=27 fd=32
type=3 type=3
config=65539 config=65539
optional=1 optional=1
...@@ -251,8 +270,8 @@ optional=1 ...@@ -251,8 +270,8 @@ optional=1
# PERF_COUNT_HW_CACHE_ITLB << 0 | # PERF_COUNT_HW_CACHE_ITLB << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
[event28:base-stat] [event33:base-stat]
fd=28 fd=33
type=3 type=3
config=4 config=4
optional=1 optional=1
...@@ -261,8 +280,8 @@ optional=1 ...@@ -261,8 +280,8 @@ optional=1
# PERF_COUNT_HW_CACHE_ITLB << 0 | # PERF_COUNT_HW_CACHE_ITLB << 0 |
# (PERF_COUNT_HW_CACHE_OP_READ << 8) | # (PERF_COUNT_HW_CACHE_OP_READ << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
[event29:base-stat] [event34:base-stat]
fd=29 fd=34
type=3 type=3
config=65540 config=65540
optional=1 optional=1
...@@ -271,8 +290,8 @@ optional=1 ...@@ -271,8 +290,8 @@ optional=1
# PERF_COUNT_HW_CACHE_L1D << 0 | # PERF_COUNT_HW_CACHE_L1D << 0 |
# (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | # (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) # (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16)
[event30:base-stat] [event35:base-stat]
fd=30 fd=35
type=3 type=3
config=512 config=512
optional=1 optional=1
...@@ -281,8 +300,8 @@ optional=1 ...@@ -281,8 +300,8 @@ optional=1
# PERF_COUNT_HW_CACHE_L1D << 0 | # PERF_COUNT_HW_CACHE_L1D << 0 |
# (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) | # (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
# (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) # (PERF_COUNT_HW_CACHE_RESULT_MISS << 16)
[event31:base-stat] [event36:base-stat]
fd=31 fd=36
type=3 type=3
config=66048 config=66048
optional=1 optional=1
...@@ -120,7 +120,8 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u ...@@ -120,7 +120,8 @@ static int test__expr(struct test_suite *t __maybe_unused, int subtest __maybe_u
p = "FOO/0"; p = "FOO/0";
ret = expr__parse(&val, ctx, p); ret = expr__parse(&val, ctx, p);
TEST_ASSERT_VAL("division by zero", ret == -1); TEST_ASSERT_VAL("division by zero", ret == 0);
TEST_ASSERT_VAL("division by zero", isnan(val));
p = "BAR/"; p = "BAR/";
ret = expr__parse(&val, ctx, p); ret = expr__parse(&val, ctx, p);
......
...@@ -38,6 +38,7 @@ static void load_runtime_stat(struct evlist *evlist, struct value *vals) ...@@ -38,6 +38,7 @@ static void load_runtime_stat(struct evlist *evlist, struct value *vals)
evlist__alloc_aggr_stats(evlist, 1); evlist__alloc_aggr_stats(evlist, 1);
evlist__for_each_entry(evlist, evsel) { evlist__for_each_entry(evlist, evsel) {
count = find_value(evsel->name, vals); count = find_value(evsel->name, vals);
evsel->supported = true;
evsel->stats->aggr->counts.val = count; evsel->stats->aggr->counts.val = count;
if (evsel__name_is(evsel, "duration_time")) if (evsel__name_is(evsel, "duration_time"))
update_stats(&walltime_nsecs_stats, count); update_stats(&walltime_nsecs_stats, count);
......
...@@ -28,6 +28,18 @@ test_stat_record_report() { ...@@ -28,6 +28,18 @@ test_stat_record_report() {
echo "stat record and report test [Success]" echo "stat record and report test [Success]"
} }
test_stat_record_script() {
echo "stat record and script test"
if ! perf stat record -o - true | perf script -i - 2>&1 | \
grep -E -q "CPU[[:space:]]+THREAD[[:space:]]+VAL[[:space:]]+ENA[[:space:]]+RUN[[:space:]]+TIME[[:space:]]+EVENT"
then
echo "stat record and script test [Failed]"
err=1
return
fi
echo "stat record and script test [Success]"
}
test_stat_repeat_weak_groups() { test_stat_repeat_weak_groups() {
echo "stat repeat weak groups test" echo "stat repeat weak groups test"
if ! perf stat -e '{cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles}' \ if ! perf stat -e '{cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles,cycles}' \
...@@ -93,6 +105,7 @@ test_topdown_weak_groups() { ...@@ -93,6 +105,7 @@ test_topdown_weak_groups() {
test_default_stat test_default_stat
test_stat_record_report test_stat_record_report
test_stat_record_script
test_stat_repeat_weak_groups test_stat_repeat_weak_groups
test_topdown_groups test_topdown_groups
test_topdown_weak_groups test_topdown_weak_groups
......
...@@ -506,6 +506,13 @@ test_sample() ...@@ -506,6 +506,13 @@ test_sample()
echo "perf record failed with --aux-sample" echo "perf record failed with --aux-sample"
return 1 return 1
fi fi
# Check with event with PMU name
if perf_record_no_decode -o "${perfdatafile}" -e br_misp_retired.all_branches:u uname ; then
if ! perf_record_no_decode -o "${perfdatafile}" -e '{intel_pt//,br_misp_retired.all_branches/aux-sample-size=8192/}:u' uname ; then
echo "perf record failed with --aux-sample-size"
return 1
fi
fi
echo OK echo OK
return 0 return 0
} }
......
...@@ -56,7 +56,7 @@ if [ $? -ne 0 ]; then ...@@ -56,7 +56,7 @@ if [ $? -ne 0 ]; then
exit 1 exit 1
fi fi
if ! perf inject -i $PERF_DATA -o $PERF_INJ_DATA -j; then if ! DEBUGINFOD_URLS='' perf inject -i $PERF_DATA -o $PERF_INJ_DATA -j; then
echo "Fail to inject samples" echo "Fail to inject samples"
exit 1 exit 1
fi fi
......
...@@ -12,10 +12,12 @@ ...@@ -12,10 +12,12 @@
static DEFINE_STRARRAY_OFFSET(x86_arch_prctl_codes_1, "ARCH_", x86_arch_prctl_codes_1_offset); static DEFINE_STRARRAY_OFFSET(x86_arch_prctl_codes_1, "ARCH_", x86_arch_prctl_codes_1_offset);
static DEFINE_STRARRAY_OFFSET(x86_arch_prctl_codes_2, "ARCH_", x86_arch_prctl_codes_2_offset); static DEFINE_STRARRAY_OFFSET(x86_arch_prctl_codes_2, "ARCH_", x86_arch_prctl_codes_2_offset);
static DEFINE_STRARRAY_OFFSET(x86_arch_prctl_codes_3, "ARCH_", x86_arch_prctl_codes_3_offset);
static struct strarray *x86_arch_prctl_codes[] = { static struct strarray *x86_arch_prctl_codes[] = {
&strarray__x86_arch_prctl_codes_1, &strarray__x86_arch_prctl_codes_1,
&strarray__x86_arch_prctl_codes_2, &strarray__x86_arch_prctl_codes_2,
&strarray__x86_arch_prctl_codes_3,
}; };
static DEFINE_STRARRAYS(x86_arch_prctl_codes); static DEFINE_STRARRAYS(x86_arch_prctl_codes);
......
...@@ -24,3 +24,4 @@ print_range () { ...@@ -24,3 +24,4 @@ print_range () {
print_range 1 0x1 0x1001 print_range 1 0x1 0x1001
print_range 2 0x2 0x2001 print_range 2 0x2 0x2001
print_range 3 0x4 0x4001
...@@ -416,6 +416,8 @@ int contention_end(u64 *ctx) ...@@ -416,6 +416,8 @@ int contention_end(u64 *ctx)
return 0; return 0;
} }
struct rq {};
extern struct rq runqueues __ksym; extern struct rq runqueues __ksym;
struct rq___old { struct rq___old {
......
#ifndef __VMLINUX_H #ifndef __VMLINUX_H
#define __VMLINUX_H #define __VMLINUX_H
#include <linux/stddef.h> // for define __always_inline
#include <linux/bpf.h> #include <linux/bpf.h>
#include <linux/types.h> #include <linux/types.h>
#include <linux/perf_event.h> #include <linux/perf_event.h>
......
...@@ -290,6 +290,7 @@ void evsel__init(struct evsel *evsel, ...@@ -290,6 +290,7 @@ void evsel__init(struct evsel *evsel,
evsel->per_pkg_mask = NULL; evsel->per_pkg_mask = NULL;
evsel->collect_stat = false; evsel->collect_stat = false;
evsel->pmu_name = NULL; evsel->pmu_name = NULL;
evsel->skippable = false;
} }
struct evsel *evsel__new_idx(struct perf_event_attr *attr, int idx) struct evsel *evsel__new_idx(struct perf_event_attr *attr, int idx)
...@@ -828,26 +829,26 @@ bool evsel__name_is(struct evsel *evsel, const char *name) ...@@ -828,26 +829,26 @@ bool evsel__name_is(struct evsel *evsel, const char *name)
const char *evsel__group_pmu_name(const struct evsel *evsel) const char *evsel__group_pmu_name(const struct evsel *evsel)
{ {
const struct evsel *leader; struct evsel *leader = evsel__leader(evsel);
struct evsel *pos;
/* If the pmu_name is set use it. pmu_name isn't set for CPU and software events. */
if (evsel->pmu_name)
return evsel->pmu_name;
/* /*
* Software events may be in a group with other uncore PMU events. Use * Software events may be in a group with other uncore PMU events. Use
* the pmu_name of the group leader to avoid breaking the software event * the pmu_name of the first non-software event to avoid breaking the
* out of the group. * software event out of the group.
* *
* Aux event leaders, like intel_pt, expect a group with events from * Aux event leaders, like intel_pt, expect a group with events from
* other PMUs, so substitute the AUX event's PMU in this case. * other PMUs, so substitute the AUX event's PMU in this case.
*/ */
leader = evsel__leader(evsel); if (evsel->core.attr.type == PERF_TYPE_SOFTWARE || evsel__is_aux_event(leader)) {
if ((evsel->core.attr.type == PERF_TYPE_SOFTWARE || evsel__is_aux_event(leader)) && /* Starting with the leader, find the first event with a named PMU. */
leader->pmu_name) { for_each_group_evsel(pos, leader) {
return leader->pmu_name; if (pos->pmu_name)
return pos->pmu_name;
}
} }
return "cpu"; return evsel->pmu_name ?: "cpu";
} }
const char *evsel__metric_id(const struct evsel *evsel) const char *evsel__metric_id(const struct evsel *evsel)
...@@ -1725,9 +1726,13 @@ static int get_group_fd(struct evsel *evsel, int cpu_map_idx, int thread) ...@@ -1725,9 +1726,13 @@ static int get_group_fd(struct evsel *evsel, int cpu_map_idx, int thread)
return -1; return -1;
fd = FD(leader, cpu_map_idx, thread); fd = FD(leader, cpu_map_idx, thread);
BUG_ON(fd == -1); BUG_ON(fd == -1 && !leader->skippable);
return fd; /*
* When the leader has been skipped, return -2 to distinguish from no
* group leader case.
*/
return fd == -1 ? -2 : fd;
} }
static void evsel__remove_fd(struct evsel *pos, int nr_cpus, int nr_threads, int thread_idx) static void evsel__remove_fd(struct evsel *pos, int nr_cpus, int nr_threads, int thread_idx)
...@@ -2109,6 +2114,12 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, ...@@ -2109,6 +2114,12 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus,
group_fd = get_group_fd(evsel, idx, thread); group_fd = get_group_fd(evsel, idx, thread);
if (group_fd == -2) {
pr_debug("broken group leader for %s\n", evsel->name);
err = -EINVAL;
goto out_close;
}
test_attr__ready(); test_attr__ready();
/* Debug message used by test scripts */ /* Debug message used by test scripts */
......
...@@ -95,6 +95,7 @@ struct evsel { ...@@ -95,6 +95,7 @@ struct evsel {
bool weak_group; bool weak_group;
bool bpf_counter; bool bpf_counter;
bool use_config_name; bool use_config_name;
bool skippable;
int bpf_fd; int bpf_fd;
struct bpf_object *bpf_obj; struct bpf_object *bpf_obj;
struct list_head config_terms; struct list_head config_terms;
......
...@@ -225,7 +225,11 @@ expr: NUMBER ...@@ -225,7 +225,11 @@ expr: NUMBER
{ {
if (fpclassify($3.val) == FP_ZERO) { if (fpclassify($3.val) == FP_ZERO) {
pr_debug("division by zero\n"); pr_debug("division by zero\n");
YYABORT; assert($3.ids == NULL);
if (compute_ids)
ids__free($1.ids);
$$.val = NAN;
$$.ids = NULL;
} else if (!compute_ids || (is_const($1.val) && is_const($3.val))) { } else if (!compute_ids || (is_const($1.val) && is_const($3.val))) {
assert($1.ids == NULL); assert($1.ids == NULL);
assert($3.ids == NULL); assert($3.ids == NULL);
......
...@@ -1144,12 +1144,12 @@ static int metricgroup__add_metric_callback(const struct pmu_metric *pm, ...@@ -1144,12 +1144,12 @@ static int metricgroup__add_metric_callback(const struct pmu_metric *pm,
struct metricgroup__add_metric_data *data = vdata; struct metricgroup__add_metric_data *data = vdata;
int ret = 0; int ret = 0;
if (pm->metric_expr && if (pm->metric_expr && match_pm_metric(pm, data->metric_name)) {
(match_metric(pm->metric_group, data->metric_name) || bool metric_no_group = data->metric_no_group ||
match_metric(pm->metric_name, data->metric_name))) { match_metric(data->metric_name, pm->metricgroup_no_group);
data->has_match = true; data->has_match = true;
ret = add_metric(data->list, pm, data->modifier, data->metric_no_group, ret = add_metric(data->list, pm, data->modifier, metric_no_group,
data->metric_no_threshold, data->user_requested_cpu_list, data->metric_no_threshold, data->user_requested_cpu_list,
data->system_wide, /*root_metric=*/NULL, data->system_wide, /*root_metric=*/NULL,
/*visited_metrics=*/NULL, table); /*visited_metrics=*/NULL, table);
...@@ -1672,7 +1672,7 @@ static int metricgroup__topdown_max_level_callback(const struct pmu_metric *pm, ...@@ -1672,7 +1672,7 @@ static int metricgroup__topdown_max_level_callback(const struct pmu_metric *pm,
{ {
unsigned int *max_level = data; unsigned int *max_level = data;
unsigned int level; unsigned int level;
const char *p = strstr(pm->metric_group, "TopdownL"); const char *p = strstr(pm->metric_group ?: "", "TopdownL");
if (!p || p[8] == '\0') if (!p || p[8] == '\0')
return 0; return 0;
......
...@@ -2140,25 +2140,32 @@ static int evlist__cmp(void *state, const struct list_head *l, const struct list ...@@ -2140,25 +2140,32 @@ static int evlist__cmp(void *state, const struct list_head *l, const struct list
int *leader_idx = state; int *leader_idx = state;
int lhs_leader_idx = *leader_idx, rhs_leader_idx = *leader_idx, ret; int lhs_leader_idx = *leader_idx, rhs_leader_idx = *leader_idx, ret;
const char *lhs_pmu_name, *rhs_pmu_name; const char *lhs_pmu_name, *rhs_pmu_name;
bool lhs_has_group = false, rhs_has_group = false;
/* /*
* First sort by grouping/leader. Read the leader idx only if the evsel * First sort by grouping/leader. Read the leader idx only if the evsel
* is part of a group, as -1 indicates no group. * is part of a group, as -1 indicates no group.
*/ */
if (lhs_core->leader != lhs_core || lhs_core->nr_members > 1) if (lhs_core->leader != lhs_core || lhs_core->nr_members > 1) {
lhs_has_group = true;
lhs_leader_idx = lhs_core->leader->idx; lhs_leader_idx = lhs_core->leader->idx;
if (rhs_core->leader != rhs_core || rhs_core->nr_members > 1) }
if (rhs_core->leader != rhs_core || rhs_core->nr_members > 1) {
rhs_has_group = true;
rhs_leader_idx = rhs_core->leader->idx; rhs_leader_idx = rhs_core->leader->idx;
}
if (lhs_leader_idx != rhs_leader_idx) if (lhs_leader_idx != rhs_leader_idx)
return lhs_leader_idx - rhs_leader_idx; return lhs_leader_idx - rhs_leader_idx;
/* Group by PMU. Groups can't span PMUs. */ /* Group by PMU if there is a group. Groups can't span PMUs. */
lhs_pmu_name = evsel__group_pmu_name(lhs); if (lhs_has_group && rhs_has_group) {
rhs_pmu_name = evsel__group_pmu_name(rhs); lhs_pmu_name = evsel__group_pmu_name(lhs);
ret = strcmp(lhs_pmu_name, rhs_pmu_name); rhs_pmu_name = evsel__group_pmu_name(rhs);
if (ret) ret = strcmp(lhs_pmu_name, rhs_pmu_name);
return ret; if (ret)
return ret;
}
/* Architecture specific sorting. */ /* Architecture specific sorting. */
return arch_evlist__cmp(lhs, rhs); return arch_evlist__cmp(lhs, rhs);
......
...@@ -431,7 +431,7 @@ static void print_metric_json(struct perf_stat_config *config __maybe_unused, ...@@ -431,7 +431,7 @@ static void print_metric_json(struct perf_stat_config *config __maybe_unused,
struct outstate *os = ctx; struct outstate *os = ctx;
FILE *out = os->fh; FILE *out = os->fh;
fprintf(out, "\"metric-value\" : %f, ", val); fprintf(out, "\"metric-value\" : \"%f\", ", val);
fprintf(out, "\"metric-unit\" : \"%s\"", unit); fprintf(out, "\"metric-unit\" : \"%s\"", unit);
if (!config->metric_only) if (!config->metric_only)
fprintf(out, "}"); fprintf(out, "}");
......
...@@ -403,12 +403,25 @@ static int prepare_metric(struct evsel **metric_events, ...@@ -403,12 +403,25 @@ static int prepare_metric(struct evsel **metric_events,
if (!aggr) if (!aggr)
break; break;
/* if (!metric_events[i]->supported) {
* If an event was scaled during stat gathering, reverse /*
* the scale before computing the metric. * Not supported events will have a count of 0,
*/ * which can be confusing in a
val = aggr->counts.val * (1.0 / metric_events[i]->scale); * metric. Explicitly set the value to NAN. Not
source_count = evsel__source_count(metric_events[i]); * counted events (enable time of 0) are read as
* 0.
*/
val = NAN;
source_count = 0;
} else {
/*
* If an event was scaled during stat gathering,
* reverse the scale before computing the
* metric.
*/
val = aggr->counts.val * (1.0 / metric_events[i]->scale);
source_count = evsel__source_count(metric_events[i]);
}
} }
n = strdup(evsel__metric_id(metric_events[i])); n = strdup(evsel__metric_id(metric_events[i]));
if (!n) if (!n)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment