From cf66b8a0ba142fbd1bf10ac8f3ae92d1b0cb7b8f Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Thu, 6 Dec 2018 08:44:31 +0000
Subject: [PATCH 1/3] drm/i915/execlists: Apply a full mb before execution for Braswell

Braswell is really picky about having our writes posted to memory
before we execute, or else the GPU may see stale values. A wmb() is
insufficient: it only ensures the writes are visible to other cores;
we need a full mb() to ensure the writes are in memory and visible
to the GPU.

The most frequent failure mode of insufficient flushing before
execution is that we see stale PTE values and execute the wrong pages.

References: 987abd5c62f9 ("drm/i915/execlists: Force write serialisation into context image vs execution")
Signed-off-by: Chris Wilson
Cc: Mika Kuoppala
Cc: Tvrtko Ursulin
Cc: Joonas Lahtinen
Cc: stable@vger.kernel.org
Reviewed-by: Tvrtko Ursulin
Link: https://patchwork.freedesktop.org/patch/msgid/20181206084431.9805-3-chris@chris-wilson.co.uk
(cherry picked from commit 490b8c65b9db45896769e1095e78725775f47b3e)
Signed-off-by: Joonas Lahtinen
---
 drivers/gpu/drm/i915/intel_lrc.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 4acb24c90c68..e2cf4f750c66 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -442,8 +442,13 @@ static u64 execlists_update_context(struct i915_request *rq)
 	 * may not be visible to the HW prior to the completion of the UC
 	 * register write and that we may begin execution from the context
 	 * before its image is complete leading to invalid PD chasing.
+	 *
+	 * Furthermore, Braswell, at least, wants a full mb to be sure that
+	 * the writes are coherent in memory (visible to the GPU) prior to
+	 * execution, and not just visible to other CPUs (as is the result of
+	 * wmb).
 	 */
-	wmb();
+	mb();

 	return ce->lrc_desc;
 }

From fe78742d7f84e3803b760cdbb1ddbdedfca9640b Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Tue, 4 Dec 2018 14:15:16 +0000
Subject: [PATCH 2/3] drm/i915: Allocate a common scratch page

Currently we allocate a scratch page for each engine, but since we only
ever write into it for post-sync operations, it is not exposed to
userspace nor do we care about coherency. As we then do not care about
its contents, we can use one page for all, reducing our allocations and
avoiding complications by not assuming per-engine isolation.

Looking ahead, it also simplifies engine initialisation (by removing
the allocation that required struct_mutex!) and means that we can
always rely on there being a scratch page.
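
To illustrate the resulting API (a sketch only; the helper and its
callers are exactly as in the diff below), every emitter that
previously dereferenced its own engine->scratch now resolves the
single device-wide page:

	/*
	 * Illustrative sketch, mirroring the helper added to i915_drv.h in
	 * this patch; all engines now share the one i915->gt.scratch VMA:
	 */
	static inline u32 i915_scratch_offset(const struct drm_i915_private *i915)
	{
		/* GGTT offset of the scratch page shared by every engine */
		return i915_ggtt_offset(i915->gt.scratch);
	}

	/* e.g. a render-flush emitter derives its post-sync write address as: */
	u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES;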
v2: Check that we allocated a large enough scratch for I830 w/a Fixes: 06e562e7f515 ("drm/i915/ringbuffer: Delay after EMIT_INVALIDATE for gen4/gen5") # v4.18.20 Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=108850 Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: Mika Kuoppala Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20181204141522.13640-1-chris@chris-wilson.co.uk Cc: Joonas Lahtinen Cc: # v4.18.20+ (cherry picked from commit 5179749925933575a67f9d8f16d0cc204f98a29f) [Joonas: Use new function in gen9_init_indirectctx_bb too] Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/i915_drv.h | 7 ++++ drivers/gpu/drm/i915/i915_gem.c | 50 ++++++++++++++++++++++++- drivers/gpu/drm/i915/i915_gpu_error.c | 2 +- drivers/gpu/drm/i915/intel_engine_cs.c | 42 --------------------- drivers/gpu/drm/i915/intel_lrc.c | 19 +++------- drivers/gpu/drm/i915/intel_ringbuffer.c | 37 ++++++------------ drivers/gpu/drm/i915/intel_ringbuffer.h | 5 --- 7 files changed, 75 insertions(+), 87 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 1331cde79c2e..872a2e159a5f 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -2150,6 +2150,8 @@ struct drm_i915_private { struct delayed_work idle_work; ktime_t last_init_time; + + struct i915_vma *scratch; } gt; /* perform PHY state sanity checks? */ @@ -3872,4 +3874,9 @@ static inline int intel_hws_csb_write_index(struct drm_i915_private *i915) return I915_HWS_CSB_WRITE_INDEX; } +static inline u32 i915_scratch_offset(const struct drm_i915_private *i915) +{ + return i915_ggtt_offset(i915->gt.scratch); +} + #endif diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index ba371712c560..6ae9a6080cc8 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -5500,6 +5500,44 @@ err_active: goto out_ctx; } +static int +i915_gem_init_scratch(struct drm_i915_private *i915, unsigned int size) +{ + struct drm_i915_gem_object *obj; + struct i915_vma *vma; + int ret; + + obj = i915_gem_object_create_stolen(i915, size); + if (!obj) + obj = i915_gem_object_create_internal(i915, size); + if (IS_ERR(obj)) { + DRM_ERROR("Failed to allocate scratch page\n"); + return PTR_ERR(obj); + } + + vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto err_unref; + } + + ret = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH); + if (ret) + goto err_unref; + + i915->gt.scratch = vma; + return 0; + +err_unref: + i915_gem_object_put(obj); + return ret; +} + +static void i915_gem_fini_scratch(struct drm_i915_private *i915) +{ + i915_vma_unpin_and_release(&i915->gt.scratch, 0); +} + int i915_gem_init(struct drm_i915_private *dev_priv) { int ret; @@ -5546,12 +5584,19 @@ int i915_gem_init(struct drm_i915_private *dev_priv) goto err_unlock; } - ret = i915_gem_contexts_init(dev_priv); + ret = i915_gem_init_scratch(dev_priv, + IS_GEN2(dev_priv) ? 
SZ_256K : PAGE_SIZE); if (ret) { GEM_BUG_ON(ret == -EIO); goto err_ggtt; } + ret = i915_gem_contexts_init(dev_priv); + if (ret) { + GEM_BUG_ON(ret == -EIO); + goto err_scratch; + } + ret = intel_engines_init(dev_priv); if (ret) { GEM_BUG_ON(ret == -EIO); @@ -5624,6 +5669,8 @@ err_pm: err_context: if (ret != -EIO) i915_gem_contexts_fini(dev_priv); +err_scratch: + i915_gem_fini_scratch(dev_priv); err_ggtt: err_unlock: intel_uncore_forcewake_put(dev_priv, FORCEWAKE_ALL); @@ -5675,6 +5722,7 @@ void i915_gem_fini(struct drm_i915_private *dev_priv) intel_uc_fini(dev_priv); i915_gem_cleanup_engines(dev_priv); i915_gem_contexts_fini(dev_priv); + i915_gem_fini_scratch(dev_priv); mutex_unlock(&dev_priv->drm.struct_mutex); intel_wa_list_free(&dev_priv->gt_wa_list); diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index 3eb33e000d6f..db4128d6c09b 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -1495,7 +1495,7 @@ static void gem_record_rings(struct i915_gpu_state *error) if (HAS_BROKEN_CS_TLB(i915)) ee->wa_batchbuffer = i915_error_object_create(i915, - engine->scratch); + i915->gt.scratch); request_record_user_bo(request, ee); ee->ctx = diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index cdfa6b21cbff..76b5f94ea6cb 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -490,46 +490,6 @@ void intel_engine_setup_common(struct intel_engine_cs *engine) intel_engine_init_cmd_parser(engine); } -int intel_engine_create_scratch(struct intel_engine_cs *engine, - unsigned int size) -{ - struct drm_i915_gem_object *obj; - struct i915_vma *vma; - int ret; - - WARN_ON(engine->scratch); - - obj = i915_gem_object_create_stolen(engine->i915, size); - if (!obj) - obj = i915_gem_object_create_internal(engine->i915, size); - if (IS_ERR(obj)) { - DRM_ERROR("Failed to allocate scratch page\n"); - return PTR_ERR(obj); - } - - vma = i915_vma_instance(obj, &engine->i915->ggtt.vm, NULL); - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); - goto err_unref; - } - - ret = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH); - if (ret) - goto err_unref; - - engine->scratch = vma; - return 0; - -err_unref: - i915_gem_object_put(obj); - return ret; -} - -void intel_engine_cleanup_scratch(struct intel_engine_cs *engine) -{ - i915_vma_unpin_and_release(&engine->scratch, 0); -} - static void cleanup_status_page(struct intel_engine_cs *engine) { if (HWS_NEEDS_PHYSICAL(engine->i915)) { @@ -704,8 +664,6 @@ void intel_engine_cleanup_common(struct intel_engine_cs *engine) { struct drm_i915_private *i915 = engine->i915; - intel_engine_cleanup_scratch(engine); - cleanup_status_page(engine); intel_engine_fini_breadcrumbs(engine); diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c index e2cf4f750c66..58d1d3d47dd3 100644 --- a/drivers/gpu/drm/i915/intel_lrc.c +++ b/drivers/gpu/drm/i915/intel_lrc.c @@ -1448,9 +1448,10 @@ static int execlists_request_alloc(struct i915_request *request) static u32 * gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) { + /* NB no one else is allowed to scribble over scratch + 256! 
*/ *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); - *batch++ = i915_ggtt_offset(engine->scratch) + 256; + *batch++ = i915_scratch_offset(engine->i915) + 256; *batch++ = 0; *batch++ = MI_LOAD_REGISTER_IMM(1); @@ -1464,7 +1465,7 @@ gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch) *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT; *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4); - *batch++ = i915_ggtt_offset(engine->scratch) + 256; + *batch++ = i915_scratch_offset(engine->i915) + 256; *batch++ = 0; return batch; @@ -1501,7 +1502,7 @@ static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_CS_STALL | PIPE_CONTROL_QW_WRITE, - i915_ggtt_offset(engine->scratch) + + i915_scratch_offset(engine->i915) + 2 * CACHELINE_BYTES); *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; @@ -1578,7 +1579,7 @@ static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch) PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_CS_STALL | PIPE_CONTROL_QW_WRITE, - i915_ggtt_offset(engine->scratch) + i915_scratch_offset(engine->i915) + 2 * CACHELINE_BYTES); } @@ -2146,7 +2147,7 @@ static int gen8_emit_flush_render(struct i915_request *request, { struct intel_engine_cs *engine = request->engine; u32 scratch_addr = - i915_ggtt_offset(engine->scratch) + 2 * CACHELINE_BYTES; + i915_scratch_offset(engine->i915) + 2 * CACHELINE_BYTES; bool vf_flush_wa = false, dc_flush_wa = false; u32 *cs, flags = 0; int len; @@ -2483,10 +2484,6 @@ int logical_render_ring_init(struct intel_engine_cs *engine) if (ret) return ret; - ret = intel_engine_create_scratch(engine, PAGE_SIZE); - if (ret) - goto err_cleanup_common; - ret = intel_init_workaround_bb(engine); if (ret) { /* @@ -2501,10 +2498,6 @@ int logical_render_ring_init(struct intel_engine_cs *engine) intel_engine_init_workarounds(engine); return 0; - -err_cleanup_common: - intel_engine_cleanup_common(engine); - return ret; } int logical_xcs_ring_init(struct intel_engine_cs *engine) diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 187bb0ceb4ac..2046f108d1cc 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -150,8 +150,7 @@ gen4_render_ring_flush(struct i915_request *rq, u32 mode) */ if (mode & EMIT_INVALIDATE) { *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE; - *cs++ = i915_ggtt_offset(rq->engine->scratch) | - PIPE_CONTROL_GLOBAL_GTT; + *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT; *cs++ = 0; *cs++ = 0; @@ -159,8 +158,7 @@ gen4_render_ring_flush(struct i915_request *rq, u32 mode) *cs++ = MI_FLUSH; *cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE; - *cs++ = i915_ggtt_offset(rq->engine->scratch) | - PIPE_CONTROL_GLOBAL_GTT; + *cs++ = i915_scratch_offset(rq->i915) | PIPE_CONTROL_GLOBAL_GTT; *cs++ = 0; *cs++ = 0; } @@ -212,8 +210,7 @@ gen4_render_ring_flush(struct i915_request *rq, u32 mode) static int intel_emit_post_sync_nonzero_flush(struct i915_request *rq) { - u32 scratch_addr = - i915_ggtt_offset(rq->engine->scratch) + 2 * CACHELINE_BYTES; + u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES; u32 *cs; cs = intel_ring_begin(rq, 6); @@ -246,8 +243,7 @@ intel_emit_post_sync_nonzero_flush(struct i915_request *rq) static int gen6_render_ring_flush(struct i915_request *rq, u32 mode) { - u32 scratch_addr = - i915_ggtt_offset(rq->engine->scratch) + 2 * CACHELINE_BYTES; + 
u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES; u32 *cs, flags = 0; int ret; @@ -316,8 +312,7 @@ gen7_render_ring_cs_stall_wa(struct i915_request *rq) static int gen7_render_ring_flush(struct i915_request *rq, u32 mode) { - u32 scratch_addr = - i915_ggtt_offset(rq->engine->scratch) + 2 * CACHELINE_BYTES; + u32 scratch_addr = i915_scratch_offset(rq->i915) + 2 * CACHELINE_BYTES; u32 *cs, flags = 0; /* @@ -971,7 +966,7 @@ i965_emit_bb_start(struct i915_request *rq, } /* Just userspace ABI convention to limit the wa batch bo to a resonable size */ -#define I830_BATCH_LIMIT (256*1024) +#define I830_BATCH_LIMIT SZ_256K #define I830_TLB_ENTRIES (2) #define I830_WA_SIZE max(I830_TLB_ENTRIES*4096, I830_BATCH_LIMIT) static int @@ -979,7 +974,9 @@ i830_emit_bb_start(struct i915_request *rq, u64 offset, u32 len, unsigned int dispatch_flags) { - u32 *cs, cs_offset = i915_ggtt_offset(rq->engine->scratch); + u32 *cs, cs_offset = i915_scratch_offset(rq->i915); + + GEM_BUG_ON(rq->i915->gt.scratch->size < I830_WA_SIZE); cs = intel_ring_begin(rq, 6); if (IS_ERR(cs)) @@ -1437,7 +1434,6 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine) { struct i915_timeline *timeline; struct intel_ring *ring; - unsigned int size; int err; intel_engine_setup_common(engine); @@ -1462,21 +1458,12 @@ static int intel_init_ring_buffer(struct intel_engine_cs *engine) GEM_BUG_ON(engine->buffer); engine->buffer = ring; - size = PAGE_SIZE; - if (HAS_BROKEN_CS_TLB(engine->i915)) - size = I830_WA_SIZE; - err = intel_engine_create_scratch(engine, size); + err = intel_engine_init_common(engine); if (err) goto err_unpin; - err = intel_engine_init_common(engine); - if (err) - goto err_scratch; - return 0; -err_scratch: - intel_engine_cleanup_scratch(engine); err_unpin: intel_ring_unpin(ring); err_ring: @@ -1550,7 +1537,7 @@ static int flush_pd_dir(struct i915_request *rq) /* Stall until the page table load is complete */ *cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT; *cs++ = i915_mmio_reg_offset(RING_PP_DIR_BASE(engine)); - *cs++ = i915_ggtt_offset(engine->scratch); + *cs++ = i915_scratch_offset(rq->i915); *cs++ = MI_NOOP; intel_ring_advance(rq, cs); @@ -1659,7 +1646,7 @@ static inline int mi_set_context(struct i915_request *rq, u32 flags) /* Insert a delay before the next switch! 
*/
 		*cs++ = MI_STORE_REGISTER_MEM | MI_SRM_LRM_GLOBAL_GTT;
 		*cs++ = i915_mmio_reg_offset(last_reg);
-		*cs++ = i915_ggtt_offset(engine->scratch);
+		*cs++ = i915_scratch_offset(rq->i915);
 		*cs++ = MI_NOOP;
 	}
 	*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;

diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 04c61307f4d1..767a7192c969 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -442,7 +442,6 @@ struct intel_engine_cs {
 	struct intel_hw_status_page status_page;
 	struct i915_ctx_workarounds wa_ctx;
 	struct i915_wa_list wa_list;
-	struct i915_vma *scratch;

 	u32 irq_keep_mask; /* always keep these interrupts */
 	u32 irq_enable_mask; /* bitmask to enable ring interrupt */
@@ -900,10 +899,6 @@ void intel_engine_setup_common(struct intel_engine_cs *engine);
 int intel_engine_init_common(struct intel_engine_cs *engine);
 void intel_engine_cleanup_common(struct intel_engine_cs *engine);

-int intel_engine_create_scratch(struct intel_engine_cs *engine,
-				unsigned int size);
-void intel_engine_cleanup_scratch(struct intel_engine_cs *engine);
-
 int intel_init_render_ring_buffer(struct intel_engine_cs *engine);
 int intel_init_bsd_ring_buffer(struct intel_engine_cs *engine);
 int intel_init_blt_ring_buffer(struct intel_engine_cs *engine);

From 5b2e31201c268c2331a209af799d667619216d40 Mon Sep 17 00:00:00 2001
From: Chris Wilson
Date: Fri, 7 Dec 2018 13:40:37 +0000
Subject: [PATCH 3/3] drm/i915: Flush GPU relocs harder for gen3

Adding an extra MI_STORE_DWORD_IMM to the gpu relocation path for gen3
was good, but still not good enough. To survive 24+ hours under test,
we needed to perform not one, not two, but three extra store-dw. Doing
so for each GPU relocation was a little unsightly, and since we need to
worry about userspace hitting the same issues, we should move the dummy
store-dw into the EMIT_FLUSH instead.
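
For gen2/gen3, the flush emitted on EMIT_FLUSH then takes the following
shape (an annotated sketch of the sequence; the gen2_render_ring_flush()
hunk below is the authoritative version):

	*cs++ = cmd;			/* MI_FLUSH (| MI_READ_FLUSH on invalidate) */
	while (num_store_dw--) {	/* num_store_dw == 4 when EMIT_FLUSH is set */
		*cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*cs++ = i915_scratch_offset(rq->i915);	/* dummy write target */
		*cs++ = 0;
	}
	*cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;	/* trailing flush, sans write flush */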
Fixes: 7dd4f6729f92 ("drm/i915: Async GPU relocation processing") References: 7fa28e146994 ("drm/i915: Write GPU relocs harder with gen3") Testcase: igt/gem_tiled_fence_blits # blb/pnv Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Reviewed-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20181207134037.11848-1-chris@chris-wilson.co.uk (cherry picked from commit a889580c087a9cf91fddb3832ece284174214183) Signed-off-by: Joonas Lahtinen --- drivers/gpu/drm/i915/i915_gem_execbuffer.c | 7 +------ drivers/gpu/drm/i915/intel_ringbuffer.c | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index d4fac09095f8..1aaccbe7e1de 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -1268,7 +1268,7 @@ relocate_entry(struct i915_vma *vma, else if (gen >= 4) len = 4; else - len = 6; + len = 3; batch = reloc_gpu(eb, vma, len); if (IS_ERR(batch)) @@ -1309,11 +1309,6 @@ relocate_entry(struct i915_vma *vma, *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; *batch++ = addr; *batch++ = target_offset; - - /* And again for good measure (blb/pnv) */ - *batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; - *batch++ = addr; - *batch++ = target_offset; } goto out; diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.c b/drivers/gpu/drm/i915/intel_ringbuffer.c index 2046f108d1cc..1f8d2a66c791 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.c +++ b/drivers/gpu/drm/i915/intel_ringbuffer.c @@ -69,19 +69,28 @@ unsigned int intel_ring_update_space(struct intel_ring *ring) static int gen2_render_ring_flush(struct i915_request *rq, u32 mode) { + unsigned int num_store_dw; u32 cmd, *cs; cmd = MI_FLUSH; - + num_store_dw = 0; if (mode & EMIT_INVALIDATE) cmd |= MI_READ_FLUSH; + if (mode & EMIT_FLUSH) + num_store_dw = 4; - cs = intel_ring_begin(rq, 2); + cs = intel_ring_begin(rq, 2 + 3 * num_store_dw); if (IS_ERR(cs)) return PTR_ERR(cs); *cs++ = cmd; - *cs++ = MI_NOOP; + while (num_store_dw--) { + *cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL; + *cs++ = i915_scratch_offset(rq->i915); + *cs++ = 0; + } + *cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH; + intel_ring_advance(rq, cs); return 0;