drm/i915/perf: Add OA unit support for Gen 8+
Enables access to OA unit metrics for BDW, CHV, SKL and BXT, which all share (more-or-less) the same OA unit design. Of particular note in comparison to Haswell: some OA unit HW config state has become per-context state and as a consequence it is somewhat more complicated to manage synchronous state changes from the CPU while there's no guarantee of what context (if any) is currently actively running on the GPU. The periodic sampling frequency, which can be particularly useful for system-wide analysis (as opposed to command stream synchronised MI_REPORT_PERF_COUNT commands), is perhaps the most surprising state to have become per-context saved and restored (while the OABUFFER destination is still a shared, system-wide resource). This support for gen8+ takes care to consider a number of timing challenges involved in synchronously updating per-context state, primarily by programming all config state from the CPU and updating all current and saved contexts synchronously while the OA unit is still disabled. The driver intentionally avoids depending on command streamer programming to update OA state considering the lack of synchronization between the automatic loading of OACTXCONTROL state (that includes the periodic sampling state and enable state) on context restore and the parsing of any general purpose BB the driver can control. I.e. this implementation is careful to avoid the possibility of a context restore temporarily enabling any out-of-date periodic sampling state. In addition to the risk of transiently-out-of-date state being loaded automatically, there are also internal HW latencies involved in the loading of MUX configurations which would be difficult to account for from the command streamer (and we only want to enable the unit once the MUX configuration is complete). 
Since the Gen8+ OA unit design no longer supports clock gating the unit off for a single given context (which effectively stopped any progress of counters while any other context was running) and instead supports tagging OA reports with a context ID for filtering on the CPU, it means we can no longer hide the system-wide progress of counters from a non-privileged application only interested in metrics for its own context. Although we could theoretically try and subtract the progress of other contexts before forwarding reports via read() we aren't in a position to filter reports captured via MI_REPORT_PERF_COUNT commands. As a result, for Gen8+, we always require the dev.i915.perf_stream_paranoid to be unset for any access to OA metrics if not root. v5: Drain submitted requests when enabling metric set to ensure no lite-restore erases the context image we just updated (Lionel) v6: In addition to drain, switch to kernel context & update all contexts in place (Chris) v7: Add missing mutex_unlock() if switching to kernel context fails (Matthew) v8: Simplify OA period/flex-eu-counters programming by using the batchbuffer instead of modifying ctx-image (Lionel) v9: Back to updating the context image (due to erroneous testing, batchbuffer programming the OA unit doesn't actually work) (Lionel) Pin context before updating context image (Chris) Drop MMIO programming now that we switch to a kernel context with right values in initial context image (Chris) v10: Just pin_map the contexts we want to modify or let the configuration happen on first use (Chris) v11: Update kernel context OA config through the batchbuffer rather than on the fly ctx-image update (Lionel) v12: Rework OA context registers update again by switching away from user contexts and reconfiguring the kernel context through the batchbuffer and updating all the other contexts' context image. Also take care to lock slice/subslice configuration when OA is on. 
(Lionel) v13: Request rpcs updates on all engines when updating the OA config (Lionel) v14: Drop any kind of rpcs management now that we monitor sseu configuration changes in a later patch (Lionel) Remove usleep after programming the NOA configs on Gen8+, this doesn't seem to be needed (Lionel) v15: Respect coding style for block comments (Chris) v16: Add missing i915_add_request() in case we fail to emit OA configuration (Matthew) Signed-off-by: Robert Bragg <robert@sixbynine.org> Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Matthew Auld <matthew.auld@intel.com> \o/ Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
This commit is contained in:
parent
5182f646c7
commit
19f81df285
|
@ -2018,9 +2018,17 @@ struct i915_oa_ops {
|
|||
void (*init_oa_buffer)(struct drm_i915_private *dev_priv);
|
||||
|
||||
/**
|
||||
* @enable_metric_set: Applies any MUX configuration to set up the
|
||||
* Boolean and Custom (B/C) counters that are part of the counter
|
||||
* reports being sampled. May apply system constraints such as
|
||||
* @select_metric_set: The auto generated code that checks whether a
|
||||
* requested OA config is applicable to the system and if so sets up
|
||||
* the mux, oa and flex eu register config pointers according to the
|
||||
* current dev_priv->perf.oa.metrics_set.
|
||||
*/
|
||||
int (*select_metric_set)(struct drm_i915_private *dev_priv);
|
||||
|
||||
/**
|
||||
* @enable_metric_set: Selects and applies any MUX configuration to set
|
||||
* up the Boolean and Custom (B/C) counters that are part of the
|
||||
* counter reports being sampled. May apply system constraints such as
|
||||
* disabling EU clock gating as required.
|
||||
*/
|
||||
int (*enable_metric_set)(struct drm_i915_private *dev_priv);
|
||||
|
@ -2051,20 +2059,13 @@ struct i915_oa_ops {
|
|||
size_t *offset);
|
||||
|
||||
/**
|
||||
* @oa_buffer_check: Check for OA buffer data + update tail
|
||||
* @oa_hw_tail_read: read the OA tail pointer register
|
||||
*
|
||||
* This is either called via fops or the poll check hrtimer (atomic
|
||||
* ctx) without any locks taken.
|
||||
*
|
||||
* It's safe to read OA config state here unlocked, assuming that this
|
||||
* is only called while the stream is enabled, while the global OA
|
||||
* configuration can't be modified.
|
||||
*
|
||||
* Efficiency is more important than avoiding some false positives
|
||||
* here, which will be handled gracefully - likely resulting in an
|
||||
* %EAGAIN error for userspace.
|
||||
* In particular this enables us to share all the fiddly code for
|
||||
* handling the OA unit tail pointer race that affects multiple
|
||||
* generations.
|
||||
*/
|
||||
bool (*oa_buffer_check)(struct drm_i915_private *dev_priv);
|
||||
u32 (*oa_hw_tail_read)(struct drm_i915_private *dev_priv);
|
||||
};
|
||||
|
||||
struct intel_cdclk_state {
|
||||
|
@ -2429,6 +2430,7 @@ struct drm_i915_private {
|
|||
struct {
|
||||
struct i915_vma *vma;
|
||||
u8 *vaddr;
|
||||
u32 last_ctx_id;
|
||||
int format;
|
||||
int format_size;
|
||||
|
||||
|
@ -2498,6 +2500,15 @@ struct drm_i915_private {
|
|||
} oa_buffer;
|
||||
|
||||
u32 gen7_latched_oastatus1;
|
||||
u32 ctx_oactxctrl_offset;
|
||||
u32 ctx_flexeu0_offset;
|
||||
|
||||
/**
|
||||
* The RPT_ID/reason field for Gen8+ includes a bit
|
||||
* to determine if the CTX ID in the report is valid
|
||||
* but the specific bit differs between Gen 8 and 9
|
||||
*/
|
||||
u32 gen8_valid_ctx_bit;
|
||||
|
||||
struct i915_oa_ops ops;
|
||||
const struct i915_oa_format *oa_formats;
|
||||
|
@ -2810,6 +2821,8 @@ intel_info(const struct drm_i915_private *dev_priv)
|
|||
#define IS_KBL_ULX(dev_priv) (INTEL_DEVID(dev_priv) == 0x590E || \
|
||||
INTEL_DEVID(dev_priv) == 0x5915 || \
|
||||
INTEL_DEVID(dev_priv) == 0x591E)
|
||||
#define IS_SKL_GT2(dev_priv) (IS_SKYLAKE(dev_priv) && \
|
||||
(INTEL_DEVID(dev_priv) & 0x00F0) == 0x0010)
|
||||
#define IS_SKL_GT3(dev_priv) (IS_SKYLAKE(dev_priv) && \
|
||||
(INTEL_DEVID(dev_priv) & 0x00F0) == 0x0020)
|
||||
#define IS_SKL_GT4(dev_priv) (IS_SKYLAKE(dev_priv) && \
|
||||
|
@ -3554,6 +3567,9 @@ i915_gem_context_lookup_timeline(struct i915_gem_context *ctx,
|
|||
|
||||
int i915_perf_open_ioctl(struct drm_device *dev, void *data,
|
||||
struct drm_file *file);
|
||||
void i915_oa_init_reg_state(struct intel_engine_cs *engine,
|
||||
struct i915_gem_context *ctx,
|
||||
uint32_t *reg_state);
|
||||
|
||||
/* i915_gem_evict.c */
|
||||
int __must_check i915_gem_evict_something(struct i915_address_space *vm,
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -656,6 +656,12 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
|
|||
|
||||
#define GEN8_OACTXID _MMIO(0x2364)
|
||||
|
||||
#define GEN8_OA_DEBUG _MMIO(0x2B04)
|
||||
#define GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS (1<<5)
|
||||
#define GEN9_OA_DEBUG_INCLUDE_CLK_RATIO (1<<6)
|
||||
#define GEN9_OA_DEBUG_DISABLE_GO_1_0_REPORTS (1<<2)
|
||||
#define GEN9_OA_DEBUG_DISABLE_CTX_SWITCH_REPORTS (1<<1)
|
||||
|
||||
#define GEN8_OACONTROL _MMIO(0x2B00)
|
||||
#define GEN8_OA_REPORT_FORMAT_A12 (0<<2)
|
||||
#define GEN8_OA_REPORT_FORMAT_A12_B8_C8 (2<<2)
|
||||
|
@ -677,6 +683,7 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
|
|||
#define GEN7_OABUFFER_STOP_RESUME_ENABLE (1<<1)
|
||||
#define GEN7_OABUFFER_RESUME (1<<0)
|
||||
|
||||
#define GEN8_OABUFFER_UDW _MMIO(0x23b4)
|
||||
#define GEN8_OABUFFER _MMIO(0x2b14)
|
||||
|
||||
#define GEN7_OASTATUS1 _MMIO(0x2364)
|
||||
|
@ -695,7 +702,9 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
|
|||
#define GEN8_OASTATUS_REPORT_LOST (1<<0)
|
||||
|
||||
#define GEN8_OAHEADPTR _MMIO(0x2B0C)
|
||||
#define GEN8_OAHEADPTR_MASK 0xffffffc0
|
||||
#define GEN8_OATAILPTR _MMIO(0x2B10)
|
||||
#define GEN8_OATAILPTR_MASK 0xffffffc0
|
||||
|
||||
#define OABUFFER_SIZE_128K (0<<3)
|
||||
#define OABUFFER_SIZE_256K (1<<3)
|
||||
|
@ -708,7 +717,17 @@ static inline bool i915_mmio_reg_valid(i915_reg_t reg)
|
|||
|
||||
#define OA_MEM_SELECT_GGTT (1<<0)
|
||||
|
||||
/*
|
||||
* Flexible, Aggregate EU Counter Registers.
|
||||
* Note: these aren't contiguous
|
||||
*/
|
||||
#define EU_PERF_CNTL0 _MMIO(0xe458)
|
||||
#define EU_PERF_CNTL1 _MMIO(0xe558)
|
||||
#define EU_PERF_CNTL2 _MMIO(0xe658)
|
||||
#define EU_PERF_CNTL3 _MMIO(0xe758)
|
||||
#define EU_PERF_CNTL4 _MMIO(0xe45c)
|
||||
#define EU_PERF_CNTL5 _MMIO(0xe55c)
|
||||
#define EU_PERF_CNTL6 _MMIO(0xe65c)
|
||||
|
||||
#define GDT_CHICKEN_BITS _MMIO(0x9840)
|
||||
#define GT_NOA_ENABLE 0x00000080
|
||||
|
@ -2494,6 +2513,9 @@ enum skl_disp_power_wells {
|
|||
#define GEN8_RC_SEMA_IDLE_MSG_DISABLE (1 << 12)
|
||||
#define GEN8_FF_DOP_CLOCK_GATE_DISABLE (1<<10)
|
||||
|
||||
#define GEN6_RCS_PWR_FSM _MMIO(0x22ac)
|
||||
#define GEN9_RCS_FE_FSM2 _MMIO(0x22a4)
|
||||
|
||||
/* Fuse readout registers for GT */
|
||||
#define CHV_FUSE_GT _MMIO(VLV_DISPLAY_BASE + 0x2168)
|
||||
#define CHV_FGT_DISABLE_SS0 (1 << 10)
|
||||
|
|
|
@ -1962,6 +1962,8 @@ static void execlists_init_reg_state(u32 *regs,
|
|||
regs[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
|
||||
CTX_REG(regs, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
|
||||
make_rpcs(dev_priv));
|
||||
|
||||
i915_oa_init_reg_state(engine, ctx, regs);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -1316,13 +1316,18 @@ struct drm_i915_gem_context_param {
|
|||
};
|
||||
|
||||
enum drm_i915_oa_format {
|
||||
I915_OA_FORMAT_A13 = 1,
|
||||
I915_OA_FORMAT_A29,
|
||||
I915_OA_FORMAT_A13_B8_C8,
|
||||
I915_OA_FORMAT_B4_C8,
|
||||
I915_OA_FORMAT_A45_B8_C8,
|
||||
I915_OA_FORMAT_B4_C8_A16,
|
||||
I915_OA_FORMAT_C4_B8,
|
||||
I915_OA_FORMAT_A13 = 1, /* HSW only */
|
||||
I915_OA_FORMAT_A29, /* HSW only */
|
||||
I915_OA_FORMAT_A13_B8_C8, /* HSW only */
|
||||
I915_OA_FORMAT_B4_C8, /* HSW only */
|
||||
I915_OA_FORMAT_A45_B8_C8, /* HSW only */
|
||||
I915_OA_FORMAT_B4_C8_A16, /* HSW only */
|
||||
I915_OA_FORMAT_C4_B8, /* HSW+ */
|
||||
|
||||
/* Gen8+ */
|
||||
I915_OA_FORMAT_A12,
|
||||
I915_OA_FORMAT_A12_B8_C8,
|
||||
I915_OA_FORMAT_A32u40_A4u32_B8_C8,
|
||||
|
||||
I915_OA_FORMAT_MAX /* non-ABI */
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue