Commit 19f81df2 authored by Robert Bragg, committed by Ben Widawsky

drm/i915/perf: Add OA unit support for Gen 8+

Enables access to OA unit metrics for BDW, CHV, SKL and BXT, which all
share (more or less) the same OA unit design.

Of particular note in comparison to Haswell: some OA unit HW config
state has become per-context state, and as a consequence it is somewhat
more complicated to manage synchronous state changes from the cpu when
there's no guarantee of what context (if any) is currently actively
running on the gpu.

The periodic sampling frequency, which can be particularly useful for
system-wide analysis (as opposed to command-stream-synchronised
MI_REPORT_PERF_COUNT commands), is perhaps the most surprising state to
have become per-context saved and restored (while the OABUFFER
destination is still a shared, system-wide resource).

This support for gen8+ takes care to consider a number of timing
challenges involved in synchronously updating per-context state,
primarily by programming all config state from the cpu and updating all
current and saved contexts synchronously while the OA unit is still
disabled.

The driver intentionally avoids depending on command streamer
programming to update OA state, given the lack of synchronization
between the automatic loading of OACTXCONTROL state (which includes the
periodic sampling state and enable state) on context restore and the
parsing of any general purpose BB the driver can control. I.e. this
implementation is careful to avoid the possibility of a context restore
temporarily enabling any out-of-date periodic sampling state. In
addition to the risk of transiently out-of-date state being loaded
automatically, there are also internal HW latencies involved in the
loading of MUX configurations which would be difficult to account for
from the command streamer (and we only want to enable the unit once the
MUX configuration is complete).
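
As a rough sketch of that ordering (every helper below is a hypothetical
stub used only to illustrate the sequence, not the driver's actual
functions):

/* Hedged sketch of the ordering described above; all helpers are stubs. */
static void oa_unit_disable(void) { }
static void oa_unit_enable(void) { }
static void write_mux_and_noa_config(void) { }
static void update_all_context_images(void) { }

static void reconfigure_oa_while_disabled(void)
{
        oa_unit_disable();              /* no reports can be generated now */
        write_mux_and_noa_config();     /* global, non-per-context state */
        update_all_context_images();    /* saved images plus the current one */
        oa_unit_enable();               /* only once the MUX config has settled */
}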

Since the Gen8+ OA unit design no longer supports clock gating the unit
off for a single given context (which effectively stopped any progress
of counters while any other context was running), and instead supports
tagging OA reports with a context ID for filtering on the CPU, we can
no longer hide the system-wide progress of counters from a
non-privileged application only interested in metrics for its own
context. Although we could theoretically try to subtract the progress
of other contexts before forwarding reports via read(), we aren't in a
position to filter reports captured via MI_REPORT_PERF_COUNT commands.
As a result, for Gen8+, we always require dev.i915.perf_stream_paranoid
to be unset for any non-root access to OA metrics.
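
A minimal sketch of the resulting policy (simplified and hypothetical;
the privileged flag stands in for a root/CAP_SYS_ADMIN check and
paranoid for the dev.i915.perf_stream_paranoid sysctl value):

#include <errno.h>
#include <stdbool.h>

/* Hedged sketch only: not the actual i915_perf.c open-path code. */
static int gen8_oa_access_allowed(bool privileged, int paranoid)
{
        /*
         * Gen8+ reports can only be filtered by ctx ID on the CPU, and
         * MI_REPORT_PERF_COUNT reports can't be filtered at all, so
         * unprivileged access exposes system-wide counter progress.
         */
        if (paranoid && !privileged)
                return -EACCES;

        return 0;
}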

v5: Drain submitted requests when enabling metric set to ensure no
    lite-restore erases the context image we just updated (Lionel)

v6: In addition to drain, switch to kernel context & update all
    contexts in place (Chris)

v7: Add missing mutex_unlock() if switching to kernel context fails
    (Matthew)

v8: Simplify OA period/flex-eu-counters programming by using the
    batchbuffer instead of modifying ctx-image (Lionel)

v9: Back to updating the context image (the v8 batchbuffer approach was
    based on erroneous testing; programming the OA unit through the
    batchbuffer doesn't actually work) (Lionel)
    Pin context before updating context image (Chris)
    Drop MMIO programming now that we switch to a kernel context with
    right values in initial context image (Chris)

v10: Just pin_map the contexts we want to modify or let the
     configuration happen on first use (Chris)

v11: Update kernel context OA config through the batchbuffer rather
     than on the fly ctx-image update (Lionel)

v12: Rework OA context registers update again by switching away from
     user contexts and reconfiguring the kernel context through the
     batchbuffer and updating all the other contexts' context image.
     Also take care to lock slice/subslice configuration when OA is
     on. (Lionel)

v13: Request rpcs updates on all engines when updating the OA config
     (Lionel)

v14: Drop any kind of rpcs management now that we monitor sseu
     configuration changes in a later patch (Lionel)
     Remove usleep after programming the NOA configs on Gen8+; it
     doesn't seem to be needed (Lionel)

v15: Respect coding style for block comments (Chris)

v16: Add missing i915_add_request() in case we fail to emit OA
     configuration (Matthew)

Signed-off-by: Robert Bragg <robert@sixbynine.org>
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Matthew Auld <matthew.auld@intel.com> \o/
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
parent 5182f646
@@ -2018,9 +2018,17 @@ struct i915_oa_ops {
 	void (*init_oa_buffer)(struct drm_i915_private *dev_priv);

 	/**
-	 * @enable_metric_set: Applies any MUX configuration to set up the
-	 * Boolean and Custom (B/C) counters that are part of the counter
-	 * reports being sampled. May apply system constraints such as
+	 * @select_metric_set: The auto generated code that checks whether a
+	 * requested OA config is applicable to the system and if so sets up
+	 * the mux, oa and flex eu register config pointers according to the
+	 * current dev_priv->perf.oa.metrics_set.
+	 */
+	int (*select_metric_set)(struct drm_i915_private *dev_priv);
+
+	/**
+	 * @enable_metric_set: Selects and applies any MUX configuration to set
+	 * up the Boolean and Custom (B/C) counters that are part of the
+	 * counter reports being sampled. May apply system constraints such as
 	 * disabling EU clock gating as required.
 	 */
 	int (*enable_metric_set)(struct drm_i915_private *dev_priv);
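
A hedged sketch of how the new two-stage split might be driven
(illustrative only; it assumes the struct definitions from this header
and is not the driver's actual call sequence):

/* Hedged sketch: validate/latch a config, then program it. */
static int apply_metric_set(struct drm_i915_private *dev_priv,
			    const struct i915_oa_ops *ops)
{
	int ret;

	ret = ops->select_metric_set(dev_priv);  /* check + latch register tables */
	if (ret)
		return ret;

	return ops->enable_metric_set(dev_priv); /* program MUX/B/C counter state */
}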
@@ -2051,20 +2059,13 @@ struct i915_oa_ops {
 		    size_t *offset);

 	/**
-	 * @oa_buffer_check: Check for OA buffer data + update tail
-	 *
-	 * This is either called via fops or the poll check hrtimer (atomic
-	 * ctx) without any locks taken.
+	 * @oa_hw_tail_read: read the OA tail pointer register
 	 *
-	 * It's safe to read OA config state here unlocked, assuming that this
-	 * is only called while the stream is enabled, while the global OA
-	 * configuration can't be modified.
-	 *
-	 * Efficiency is more important than avoiding some false positives
-	 * here, which will be handled gracefully - likely resulting in an
-	 * %EAGAIN error for userspace.
+	 * In particular this enables us to share all the fiddly code for
+	 * handling the OA unit tail pointer race that affects multiple
+	 * generations.
 	 */
-	bool (*oa_buffer_check)(struct drm_i915_private *dev_priv);
+	u32 (*oa_hw_tail_read)(struct drm_i915_private *dev_priv);
 };

 struct intel_cdclk_state {
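
For Gen8 the new hook can plausibly be as small as the following sketch,
assuming the GEN8_OATAILPTR/GEN8_OATAILPTR_MASK definitions added later
in this patch (not presented as the actual function body):

/* Hedged sketch: read the HW tail pointer and mask off the low status bits. */
static u32 gen8_oa_hw_tail_read(struct drm_i915_private *dev_priv)
{
	return I915_READ(GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK;
}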
@@ -2429,6 +2430,7 @@ struct drm_i915_private {
 		struct {
 			struct i915_vma *vma;
 			u8 *vaddr;
+			u32 last_ctx_id;
 			int format;
 			int format_size;
@@ -2498,6 +2500,15 @@ struct drm_i915_private {
 		} oa_buffer;

 		u32 gen7_latched_oastatus1;
+		u32 ctx_oactxctrl_offset;
+		u32 ctx_flexeu0_offset;
+
+		/**
+		 * The RPT_ID/reason field for Gen8+ includes a bit
+		 * to determine if the CTX ID in the report is valid
+		 * but the specific bit differs between Gen 8 and 9
+		 */
+		u32 gen8_valid_ctx_bit;

 		struct i915_oa_ops ops;
 		const struct i915_oa_format *oa_formats;
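
A hedged sketch of how the new gen8_valid_ctx_bit might be consulted
when parsing a report (the field path under dev_priv->perf.oa and the
report layout are assumptions for illustration):

/* Hedged sketch: report[0] is assumed to be the RPT_ID/reason dword. */
static bool oa_report_ctx_id_valid(struct drm_i915_private *dev_priv,
				   const u32 *report)
{
	return report[0] & dev_priv->perf.oa.gen8_valid_ctx_bit;
}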
@@ -2810,6 +2821,8 @@ intel_info(const struct drm_i915_private *dev_priv)
 #define IS_KBL_ULX(dev_priv)	(INTEL_DEVID(dev_priv) == 0x590E || \
				 INTEL_DEVID(dev_priv) == 0x5915 || \
				 INTEL_DEVID(dev_priv) == 0x591E)
+#define IS_SKL_GT2(dev_priv)	(IS_SKYLAKE(dev_priv) && \
+				 (INTEL_DEVID(dev_priv) & 0x00F0) == 0x0010)
 #define IS_SKL_GT3(dev_priv)	(IS_SKYLAKE(dev_priv) && \
				 (INTEL_DEVID(dev_priv) & 0x00F0) == 0x0020)
 #define IS_SKL_GT4(dev_priv)	(IS_SKYLAKE(dev_priv) && \
@@ -3554,6 +3567,9 @@ i915_gem_context_lookup_timeline(struct i915_gem_context *ctx,
 int i915_perf_open_ioctl(struct drm_device *dev, void *data,
			 struct drm_file *file);
+void i915_oa_init_reg_state(struct intel_engine_cs *engine,
+			    struct i915_gem_context *ctx,
+			    uint32_t *reg_state);

 /* i915_gem_evict.c */
 int __must_check i915_gem_evict_something(struct i915_address_space *vm,
This diff is collapsed.
@@ -656,6 +656,12 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define GEN8_OACTXID _MMIO(0x2364)

+#define GEN8_OA_DEBUG _MMIO(0x2B04)
+#define  GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS    (1<<5)
+#define  GEN9_OA_DEBUG_INCLUDE_CLK_RATIO            (1<<6)
+#define  GEN9_OA_DEBUG_DISABLE_GO_1_0_REPORTS       (1<<2)
+#define  GEN9_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS   (1<<1)
+
 #define GEN8_OACONTROL _MMIO(0x2B00)
 #define  GEN8_OA_REPORT_FORMAT_A12          (0<<2)
 #define  GEN8_OA_REPORT_FORMAT_A12_B8_C8    (2<<2)
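
As an illustration of how such debug bits are typically programmed
(hedged sketch, using the masked-write style common to i915 registers;
not presented as the patch's exact enable path):

/* Hedged sketch: enable a couple of the Gen9 OA debug controls. */
static void gen9_set_oa_debug(struct drm_i915_private *dev_priv)
{
	I915_WRITE(GEN8_OA_DEBUG,
		   _MASKED_BIT_ENABLE(GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
				      GEN9_OA_DEBUG_INCLUDE_CLK_RATIO));
}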
@@ -677,6 +683,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define  GEN7_OABUFFER_STOP_RESUME_ENABLE   (1<<1)
 #define  GEN7_OABUFFER_RESUME               (1<<0)

+#define GEN8_OABUFFER_UDW _MMIO(0x23b4)
 #define GEN8_OABUFFER _MMIO(0x2b14)

 #define GEN7_OASTATUS1 _MMIO(0x2364)
@@ -695,7 +702,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define  GEN8_OASTATUS_REPORT_LOST          (1<<0)

 #define GEN8_OAHEADPTR _MMIO(0x2B0C)
+#define GEN8_OAHEADPTR_MASK    0xffffffc0
 #define GEN8_OATAILPTR _MMIO(0x2B10)
+#define GEN8_OATAILPTR_MASK    0xffffffc0

 #define OABUFFER_SIZE_128K  (0<<3)
 #define OABUFFER_SIZE_256K  (1<<3)
@@ -708,7 +717,17 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
 #define OA_MEM_SELECT_GGTT  (1<<0)

+/*
+ * Flexible, Aggregate EU Counter Registers.
+ * Note: these aren't contiguous
+ */
 #define EU_PERF_CNTL0       _MMIO(0xe458)
+#define EU_PERF_CNTL1       _MMIO(0xe558)
+#define EU_PERF_CNTL2       _MMIO(0xe658)
+#define EU_PERF_CNTL3       _MMIO(0xe758)
+#define EU_PERF_CNTL4       _MMIO(0xe45c)
+#define EU_PERF_CNTL5       _MMIO(0xe55c)
+#define EU_PERF_CNTL6       _MMIO(0xe65c)

 #define GDT_CHICKEN_BITS    _MMIO(0x9840)
 #define   GT_NOA_ENABLE     0x00000080
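
Because the flex EU counter registers are not at contiguous offsets,
they lend themselves to being walked via a small table rather than by
offset arithmetic; a hedged sketch of such a table:

/* Hedged sketch: iterate the flex EU counter registers via a table. */
static const i915_reg_t flex_eu_regs[] = {
	EU_PERF_CNTL0,
	EU_PERF_CNTL1,
	EU_PERF_CNTL2,
	EU_PERF_CNTL3,
	EU_PERF_CNTL4,
	EU_PERF_CNTL5,
	EU_PERF_CNTL6,
};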
@@ -2494,6 +2513,9 @@ enum skl_disp_power_wells {
 #define  GEN8_RC_SEMA_IDLE_MSG_DISABLE  (1 << 12)
 #define  GEN8_FF_DOP_CLOCK_GATE_DISABLE (1<<10)

+#define GEN6_RCS_PWR_FSM _MMIO(0x22ac)
+#define GEN9_RCS_FE_FSM2 _MMIO(0x22a4)
+
 /* Fuse readout registers for GT */
 #define CHV_FUSE_GT                     _MMIO(VLV_DISPLAY_BASE + 0x2168)
 #define   CHV_FGT_DISABLE_SS0           (1 << 10)
@@ -1962,6 +1962,8 @@ static void execlists_init_reg_state(u32 *regs,
 		regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
 		CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
 			make_rpcs(dev_priv));
+
+		i915_oa_init_reg_state(engine, ctx, regs);
 	}
 }
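
A hedged sketch of the kind of per-context OA state the new hook is
expected to seed into a freshly created context image (the offset/value
layout and parameter names here are illustrative, not the actual
implementation):

/* Hedged sketch: context images hold LRI-style (offset, value) pairs, so
 * the OA hook writes a register's mmio offset and its desired value at a
 * position recorded in ctx_oactxctrl_offset (illustrative only).
 */
static void oa_init_ctx_image_sketch(u32 *reg_state, u32 ctx_oactxctrl_offset,
				     u32 oactxcontrol_mmio_offset,
				     u32 oactxcontrol_value)
{
	reg_state[ctx_oactxctrl_offset]     = oactxcontrol_mmio_offset;
	reg_state[ctx_oactxctrl_offset + 1] = oactxcontrol_value;
}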
@@ -1316,13 +1316,18 @@ struct drm_i915_gem_context_param {
 };

 enum drm_i915_oa_format {
-	I915_OA_FORMAT_A13 = 1,
-	I915_OA_FORMAT_A29,
-	I915_OA_FORMAT_A13_B8_C8,
-	I915_OA_FORMAT_B4_C8,
-	I915_OA_FORMAT_A45_B8_C8,
-	I915_OA_FORMAT_B4_C8_A16,
-	I915_OA_FORMAT_C4_B8,
+	I915_OA_FORMAT_A13 = 1,	    /* HSW only */
+	I915_OA_FORMAT_A29,	    /* HSW only */
+	I915_OA_FORMAT_A13_B8_C8,   /* HSW only */
+	I915_OA_FORMAT_B4_C8,	    /* HSW only */
+	I915_OA_FORMAT_A45_B8_C8,   /* HSW only */
+	I915_OA_FORMAT_B4_C8_A16,   /* HSW only */
+	I915_OA_FORMAT_C4_B8,	    /* HSW+ */
+
+	/* Gen8+ */
+	I915_OA_FORMAT_A12,
+	I915_OA_FORMAT_A12_B8_C8,
+	I915_OA_FORMAT_A32u40_A4u32_B8_C8,

 	I915_OA_FORMAT_MAX	    /* non-ABI */
 };
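
For completeness, a hedged userspace sketch of requesting one of the new
Gen8+ formats through the existing perf-open uAPI; the metrics-set ID
and OA exponent values are placeholders (real metrics-set IDs come from
sysfs), and the header include path may vary by system:

/* Hedged userspace sketch: open an i915 perf stream using the new
 * Gen8+ A32u40_A4u32_B8_C8 report format. Returns a stream fd on success.
 */
#include <stdint.h>
#include <sys/ioctl.h>
#include <drm/i915_drm.h>

static int open_oa_stream(int drm_fd)
{
	uint64_t properties[] = {
		DRM_I915_PERF_PROP_SAMPLE_OA, 1,
		DRM_I915_PERF_PROP_OA_METRICS_SET, 1,	/* placeholder ID */
		DRM_I915_PERF_PROP_OA_FORMAT,
			I915_OA_FORMAT_A32u40_A4u32_B8_C8,
		DRM_I915_PERF_PROP_OA_EXPONENT, 16,	/* placeholder period */
	};
	struct drm_i915_perf_open_param param = {
		.flags = I915_PERF_FLAG_FD_CLOEXEC,
		.num_properties = sizeof(properties) / (2 * sizeof(uint64_t)),
		.properties_ptr = (uintptr_t)properties,
	};

	return ioctl(drm_fd, DRM_IOCTL_I915_PERF_OPEN, &param);
}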