drm/i915/execlists: Enable coarse preemption boundaries for gen8

When we introduced preemption, we chose to keep it disabled for gen8 as
supporting preemption inside GPGPU user batches required various w/a in
userspace. Since then, the desire to preempt long queues of requests
between batches (e.g. within busywaiting semaphores) has grown. So allow
arbitration within the busywaits and between requests, but disable
arbitration within user batches so that we can preempt between requests
and not risk breaking GPGPU.

However, since this preemption is much coarser and doesn't interfere
with userspace, we decline to include it amongst the scheduler
capabilities. (This is also required for us to skip over the preemption
selftests that expect to be able to preempt user batches.)

Michal suggested that we could perhaps allow preemption inside gen8
userspace batches if we can satisfy ourselves that the default
preemption settings are viable with existing userspace (principally
OpenCL which already should carry any known workaround). We could then
merge the two code paths back into one, even dropping the artifical
has-preemption device feature flag.

Testcase: igt/gem_exec_scheduler/semaphore-user
References: beecec9017 ("drm/i915/execlists: Preemption!")
Fixes: e886196469 ("drm/i915: Use HW semaphores for inter-engine synchronisation on gen8+")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Michal Winiarski <michal.winiarski@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Michal Winiarski <michal.winiarski@intel.com> #irc
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190329134024.5254-1-chris@chris-wilson.co.uk
This commit is contained in:
Chris Wilson 2019-03-29 13:40:24 +00:00
parent b66ea2c2cf
commit bac24f59f4
3 changed files with 217 additions and 15 deletions

View File

@ -562,7 +562,7 @@ static void init_contexts(struct drm_i915_private *i915)
static bool needs_preempt_context(struct drm_i915_private *i915) static bool needs_preempt_context(struct drm_i915_private *i915)
{ {
return HAS_LOGICAL_RING_PREEMPTION(i915); return HAS_EXECLISTS(i915);
} }
int i915_gem_contexts_init(struct drm_i915_private *dev_priv) int i915_gem_contexts_init(struct drm_i915_private *dev_priv)

View File

@ -233,7 +233,7 @@ static inline bool need_preempt(const struct intel_engine_cs *engine,
{ {
int last_prio; int last_prio;
if (!intel_engine_has_preemption(engine)) if (!engine->preempt_context)
return false; return false;
if (i915_request_completed(rq)) if (i915_request_completed(rq))
@ -2035,7 +2035,7 @@ static int gen8_emit_bb_start(struct i915_request *rq,
{ {
u32 *cs; u32 *cs;
cs = intel_ring_begin(rq, 6); cs = intel_ring_begin(rq, 4);
if (IS_ERR(cs)) if (IS_ERR(cs))
return PTR_ERR(cs); return PTR_ERR(cs);
@ -2046,19 +2046,37 @@ static int gen8_emit_bb_start(struct i915_request *rq,
* particular all the gen that do not need the w/a at all!), if we * particular all the gen that do not need the w/a at all!), if we
* took care to make sure that on every switch into this context * took care to make sure that on every switch into this context
* (both ordinary and for preemption) that arbitrartion was enabled * (both ordinary and for preemption) that arbitrartion was enabled
* we would be fine. However, there doesn't seem to be a downside to * we would be fine. However, for gen8 there is another w/a that
* being paranoid and making sure it is set before each batch and * requires us to not preempt inside GPGPU execution, so we keep
* every context-switch. * arbitration disabled for gen8 batches. Arbitration will be
* * re-enabled before we close the request
* Note that if we fail to enable arbitration before the request * (engine->emit_fini_breadcrumb).
* is complete, then we do not see the context-switch interrupt and
* the engine hangs (with RING_HEAD == RING_TAIL).
*
* That satisfies both the GPGPU w/a and our heavy-handed paranoia.
*/ */
*cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
/* FIXME(BDW+): Address space and security selectors. */
*cs++ = MI_BATCH_BUFFER_START_GEN8 |
(flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
*cs++ = lower_32_bits(offset);
*cs++ = upper_32_bits(offset);
intel_ring_advance(rq, cs);
return 0;
}
static int gen9_emit_bb_start(struct i915_request *rq,
u64 offset, u32 len,
const unsigned int flags)
{
u32 *cs;
cs = intel_ring_begin(rq, 6);
if (IS_ERR(cs))
return PTR_ERR(cs);
*cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE; *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
/* FIXME(BDW): Address space and security selectors. */
*cs++ = MI_BATCH_BUFFER_START_GEN8 | *cs++ = MI_BATCH_BUFFER_START_GEN8 |
(flags & I915_DISPATCH_SECURE ? 0 : BIT(8)); (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
*cs++ = lower_32_bits(offset); *cs++ = lower_32_bits(offset);
@ -2316,7 +2334,8 @@ void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
engine->flags |= I915_ENGINE_SUPPORTS_STATS; engine->flags |= I915_ENGINE_SUPPORTS_STATS;
if (!intel_vgpu_active(engine->i915)) if (!intel_vgpu_active(engine->i915))
engine->flags |= I915_ENGINE_HAS_SEMAPHORES; engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
if (engine->preempt_context) if (engine->preempt_context &&
HAS_LOGICAL_RING_PREEMPTION(engine->i915))
engine->flags |= I915_ENGINE_HAS_PREEMPTION; engine->flags |= I915_ENGINE_HAS_PREEMPTION;
} }
@ -2350,7 +2369,10 @@ logical_ring_default_vfuncs(struct intel_engine_cs *engine)
* until a more refined solution exists. * until a more refined solution exists.
*/ */
} }
engine->emit_bb_start = gen8_emit_bb_start; if (IS_GEN(engine->i915, 8))
engine->emit_bb_start = gen8_emit_bb_start;
else
engine->emit_bb_start = gen9_emit_bb_start;
} }
static inline void static inline void

View File

@ -76,6 +76,185 @@ err_unlock:
return err; return err;
} }
static int live_busywait_preempt(void *arg)
{
struct drm_i915_private *i915 = arg;
struct i915_gem_context *ctx_hi, *ctx_lo;
struct intel_engine_cs *engine;
struct drm_i915_gem_object *obj;
struct i915_vma *vma;
enum intel_engine_id id;
intel_wakeref_t wakeref;
int err = -ENOMEM;
u32 *map;
/*
* Verify that even without HAS_LOGICAL_RING_PREEMPTION, we can
* preempt the busywaits used to synchronise between rings.
*/
mutex_lock(&i915->drm.struct_mutex);
wakeref = intel_runtime_pm_get(i915);
ctx_hi = kernel_context(i915);
if (!ctx_hi)
goto err_unlock;
ctx_hi->sched.priority = INT_MAX;
ctx_lo = kernel_context(i915);
if (!ctx_lo)
goto err_ctx_hi;
ctx_lo->sched.priority = INT_MIN;
obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
if (IS_ERR(obj)) {
err = PTR_ERR(obj);
goto err_ctx_lo;
}
map = i915_gem_object_pin_map(obj, I915_MAP_WC);
if (IS_ERR(map)) {
err = PTR_ERR(map);
goto err_obj;
}
vma = i915_vma_instance(obj, &i915->ggtt.vm, 0);
if (IS_ERR(vma)) {
err = PTR_ERR(vma);
goto err_map;
}
err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL);
if (err)
goto err_map;
for_each_engine(engine, i915, id) {
struct i915_request *lo, *hi;
struct igt_live_test t;
u32 *cs;
if (!intel_engine_can_store_dword(engine))
continue;
if (igt_live_test_begin(&t, i915, __func__, engine->name)) {
err = -EIO;
goto err_vma;
}
/*
* We create two requests. The low priority request
* busywaits on a semaphore (inside the ringbuffer where
* is should be preemptible) and the high priority requests
* uses a MI_STORE_DWORD_IMM to update the semaphore value
* allowing the first request to complete. If preemption
* fails, we hang instead.
*/
lo = i915_request_alloc(engine, ctx_lo);
if (IS_ERR(lo)) {
err = PTR_ERR(lo);
goto err_vma;
}
cs = intel_ring_begin(lo, 8);
if (IS_ERR(cs)) {
err = PTR_ERR(cs);
i915_request_add(lo);
goto err_vma;
}
*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
*cs++ = i915_ggtt_offset(vma);
*cs++ = 0;
*cs++ = 1;
/* XXX Do we need a flush + invalidate here? */
*cs++ = MI_SEMAPHORE_WAIT |
MI_SEMAPHORE_GLOBAL_GTT |
MI_SEMAPHORE_POLL |
MI_SEMAPHORE_SAD_EQ_SDD;
*cs++ = 0;
*cs++ = i915_ggtt_offset(vma);
*cs++ = 0;
intel_ring_advance(lo, cs);
i915_request_add(lo);
if (wait_for(READ_ONCE(*map), 10)) {
err = -ETIMEDOUT;
goto err_vma;
}
/* Low priority request should be busywaiting now */
if (i915_request_wait(lo, I915_WAIT_LOCKED, 1) != -ETIME) {
pr_err("%s: Busywaiting request did not!\n",
engine->name);
err = -EIO;
goto err_vma;
}
hi = i915_request_alloc(engine, ctx_hi);
if (IS_ERR(hi)) {
err = PTR_ERR(hi);
goto err_vma;
}
cs = intel_ring_begin(hi, 4);
if (IS_ERR(cs)) {
err = PTR_ERR(cs);
i915_request_add(hi);
goto err_vma;
}
*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
*cs++ = i915_ggtt_offset(vma);
*cs++ = 0;
*cs++ = 0;
intel_ring_advance(hi, cs);
i915_request_add(hi);
if (i915_request_wait(lo, I915_WAIT_LOCKED, HZ / 5) < 0) {
struct drm_printer p = drm_info_printer(i915->drm.dev);
pr_err("%s: Failed to preempt semaphore busywait!\n",
engine->name);
intel_engine_dump(engine, &p, "%s\n", engine->name);
GEM_TRACE_DUMP();
i915_gem_set_wedged(i915);
err = -EIO;
goto err_vma;
}
GEM_BUG_ON(READ_ONCE(*map));
if (igt_live_test_end(&t)) {
err = -EIO;
goto err_vma;
}
}
err = 0;
err_vma:
i915_vma_unpin(vma);
err_map:
i915_gem_object_unpin_map(obj);
err_obj:
i915_gem_object_put(obj);
err_ctx_lo:
kernel_context_close(ctx_lo);
err_ctx_hi:
kernel_context_close(ctx_hi);
err_unlock:
if (igt_flush_test(i915, I915_WAIT_LOCKED))
err = -EIO;
intel_runtime_pm_put(i915, wakeref);
mutex_unlock(&i915->drm.struct_mutex);
return err;
}
static int live_preempt(void *arg) static int live_preempt(void *arg)
{ {
struct drm_i915_private *i915 = arg; struct drm_i915_private *i915 = arg;
@ -1127,6 +1306,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
{ {
static const struct i915_subtest tests[] = { static const struct i915_subtest tests[] = {
SUBTEST(live_sanitycheck), SUBTEST(live_sanitycheck),
SUBTEST(live_busywait_preempt),
SUBTEST(live_preempt), SUBTEST(live_preempt),
SUBTEST(live_late_preempt), SUBTEST(live_late_preempt),
SUBTEST(live_suppress_self_preempt), SUBTEST(live_suppress_self_preempt),