OpenCloudOS-Kernel/drivers/gpu/drm/i915/i915_reset.h

/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2008-2018 Intel Corporation
 */

#ifndef I915_RESET_H
#define I915_RESET_H

#include <linux/compiler.h>
#include <linux/types.h>
#include <linux/srcu.h>

struct drm_i915_private;
struct intel_engine_cs;
struct intel_guc;

__printf(4, 5)
void i915_handle_error(struct drm_i915_private *i915,
		       u32 engine_mask,
		       unsigned long flags,
		       const char *fmt, ...);
#define I915_ERROR_CAPTURE BIT(0)

void i915_clear_error_registers(struct drm_i915_private *i915);

void i915_reset(struct drm_i915_private *i915,
		unsigned int stalled_mask,
		const char *reason);
int i915_reset_engine(struct intel_engine_cs *engine,
		      const char *reason);

void i915_reset_request(struct i915_request *rq, bool guilty);
bool i915_reset_flush(struct drm_i915_private *i915);

int __must_check i915_reset_trylock(struct drm_i915_private *i915);
void i915_reset_unlock(struct drm_i915_private *i915, int tag);

int i915_terminally_wedged(struct drm_i915_private *i915);

bool intel_has_gpu_reset(struct drm_i915_private *i915);
bool intel_has_reset_engine(struct drm_i915_private *i915);

int intel_gpu_reset(struct drm_i915_private *i915, u32 engine_mask);

int intel_reset_guc(struct drm_i915_private *i915);

struct i915_wedge_me {
	struct delayed_work work;
	struct drm_i915_private *i915;
	const char *name;
};

void __i915_init_wedge(struct i915_wedge_me *w,
		       struct drm_i915_private *i915,
		       long timeout,
		       const char *name);
void __i915_fini_wedge(struct i915_wedge_me *w);

#define i915_wedge_on_timeout(W, DEV, TIMEOUT)				\
	for (__i915_init_wedge((W), (DEV), (TIMEOUT), __func__);	\
	     (W)->i915;							\
	     __i915_fini_wedge((W)))

#endif /* I915_RESET_H */
drm/i915: Pull all the reset functionality together into i915_reset.c Currently the code to reset the GPU and our state is spread widely across a few files. Pull the logic together into a common file. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190116153304.787-1-chris@chris-wilson.co.uk 2019-01-16 23:33:04 +08:00			`/*`
			`* SPDX-License-Identifier: MIT`
			`*`
			`* Copyright © 2008-2018 Intel Corporation`
			`*/`

			`#ifndef I915_RESET_H`
			`#define I915_RESET_H`

			`#include <linux/compiler.h>`
			`#include <linux/types.h>`
drm/i915: Revoke mmaps and prevent access to fence registers across reset Previously, we were able to rely on the recursive properties of struct_mutex to allow us to serialise revoking mmaps and reacquiring the FENCE registers with them being clobbered over a global device reset. I then proceeded to throw out the baby with the bath water in order to pursue a struct_mutex-less reset. Perusing LWN for alternative strategies, the dilemma on how to serialise access to a global resource on one side was answered by https://lwn.net/Articles/202847/ -- Sleepable RCU: 1 int readside(void) { 2 int idx; 3 rcu_read_lock(); 4 if (nomoresrcu) { 5 rcu_read_unlock(); 6 return -EINVAL; 7 } 8 idx = srcu_read_lock(&ss); 9 rcu_read_unlock(); 10 /* SRCU read-side critical section. */ 11 srcu_read_unlock(&ss, idx); 12 return 0; 13 } 14 15 void cleanup(void) 16 { 17 nomoresrcu = 1; 18 synchronize_rcu(); 19 synchronize_srcu(&ss); 20 cleanup_srcu_struct(&ss); 21 } No more worrying about stop_machine, just an uber-complex mutex, optimised for reads, with the overhead pushed to the rare reset path. However, we do run the risk of a deadlock as we allocate underneath the SRCU read lock, and the allocation may require a GPU reset, causing a dependency cycle via the in-flight requests. We resolve that by declaring the driver wedged and cancelling all in-flight rendering. v2: Use expedited rcu barriers to match our earlier timing characteristics. v3: Try to annotate locking contexts for sparse v4: Reduce selftest lock duration to avoid a reset deadlock with fences v5: s/srcu/reset_backoff_srcu/ v6: Remove more stale comments Testcase: igt/gem_mmap_gtt/hang Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@intel.com> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190208153708.20023-2-chris@chris-wilson.co.uk 2019-02-08 23:37:03 +08:00			`#include <linux/srcu.h>`
drm/i915: Pull all the reset functionality together into i915_reset.c Currently the code to reset the GPU and our state is spread widely across a few files. Pull the logic together into a common file. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190116153304.787-1-chris@chris-wilson.co.uk 2019-01-16 23:33:04 +08:00
			`struct drm_i915_private;`
			`struct intel_engine_cs;`
			`struct intel_guc;`

			`__printf(4, 5)`
			`void i915_handle_error(struct drm_i915_private *i915,`
			`u32 engine_mask,`
			`unsigned long flags,`
			`const char *fmt, ...);`
			`#define I915_ERROR_CAPTURE BIT(0)`

			`void i915_clear_error_registers(struct drm_i915_private *i915);`

			`void i915_reset(struct drm_i915_private *i915,`
			`unsigned int stalled_mask,`
			`const char *reason);`
			`int i915_reset_engine(struct intel_engine_cs *engine,`
			`const char *reason);`

drm/i915: Remove GPU reset dependence on struct_mutex Now that the submission backends are controlled via their own spinlocks, with a wave of a magic wand we can lift the struct_mutex requirement around GPU reset. That is we allow the submission frontend (userspace) to keep on submitting while we process the GPU reset as we can suspend the backend independently. The major change is around the backoff/handoff strategy for performing the reset. With no mutex deadlock, we no longer have to coordinate with any waiter, and just perform the reset immediately. Testcase: igt/gem_mmap_gtt/hang # regresses Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190125132230.22221-3-chris@chris-wilson.co.uk 2019-01-25 21:22:28 +08:00			`void i915_reset_request(struct i915_request *rq, bool guilty);`
			`bool i915_reset_flush(struct drm_i915_private *i915);`

drm/i915: Revoke mmaps and prevent access to fence registers across reset Previously, we were able to rely on the recursive properties of struct_mutex to allow us to serialise revoking mmaps and reacquiring the FENCE registers with them being clobbered over a global device reset. I then proceeded to throw out the baby with the bath water in order to pursue a struct_mutex-less reset. Perusing LWN for alternative strategies, the dilemma on how to serialise access to a global resource on one side was answered by https://lwn.net/Articles/202847/ -- Sleepable RCU: 1 int readside(void) { 2 int idx; 3 rcu_read_lock(); 4 if (nomoresrcu) { 5 rcu_read_unlock(); 6 return -EINVAL; 7 } 8 idx = srcu_read_lock(&ss); 9 rcu_read_unlock(); 10 /* SRCU read-side critical section. */ 11 srcu_read_unlock(&ss, idx); 12 return 0; 13 } 14 15 void cleanup(void) 16 { 17 nomoresrcu = 1; 18 synchronize_rcu(); 19 synchronize_srcu(&ss); 20 cleanup_srcu_struct(&ss); 21 } No more worrying about stop_machine, just an uber-complex mutex, optimised for reads, with the overhead pushed to the rare reset path. However, we do run the risk of a deadlock as we allocate underneath the SRCU read lock, and the allocation may require a GPU reset, causing a dependency cycle via the in-flight requests. We resolve that by declaring the driver wedged and cancelling all in-flight rendering. v2: Use expedited rcu barriers to match our earlier timing characteristics. v3: Try to annotate locking contexts for sparse v4: Reduce selftest lock duration to avoid a reset deadlock with fences v5: s/srcu/reset_backoff_srcu/ v6: Remove more stale comments Testcase: igt/gem_mmap_gtt/hang Fixes: eb8d0f5af4ec ("drm/i915: Remove GPU reset dependence on struct_mutex") Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@intel.com> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190208153708.20023-2-chris@chris-wilson.co.uk 2019-02-08 23:37:03 +08:00			`int __must_check i915_reset_trylock(struct drm_i915_private *i915);`
			`void i915_reset_unlock(struct drm_i915_private *i915, int tag);`

drm/i915: Beware temporary wedging when determining -EIO At a few points in our uABI, we check to see if the driver is wedged and report -EIO back to the user in that case. However, as we perform the check and reset asynchronously (where once before they were both serialised by the struct_mutex), we may instead see the temporary wedging used to cancel inflight rendering to avoid a deadlock during reset (caused by either us timing out in our reset handler, i915_wedge_on_timeout or with malice aforethought in intel_reset_prepare for a stuck modeset). If we suspect this is the case, that is we see a wedged driver and reset in progress, then wait until the reset is resolved before reporting upon the wedged status. v2: might_sleep() (Mika) Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=109580 Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190220145637.23503-1-chris@chris-wilson.co.uk 2019-02-20 22:56:37 +08:00			`int i915_terminally_wedged(struct drm_i915_private *i915);`

drm/i915: Pull all the reset functionality together into i915_reset.c Currently the code to reset the GPU and our state is spread widely across a few files. Pull the logic together into a common file. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Acked-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190116153304.787-1-chris@chris-wilson.co.uk 2019-01-16 23:33:04 +08:00			`bool intel_has_gpu_reset(struct drm_i915_private *i915);`
			`bool intel_has_reset_engine(struct drm_i915_private *i915);`

			`int intel_gpu_reset(struct drm_i915_private *i915, u32 engine_mask);`

			`int intel_reset_guc(struct drm_i915_private *i915);`

			`struct i915_wedge_me {`
			`struct delayed_work work;`
			`struct drm_i915_private *i915;`
			`const char *name;`
			`};`

			`void __i915_init_wedge(struct i915_wedge_me *w,`
			`struct drm_i915_private *i915,`
			`long timeout,`
			`const char *name);`
			`void __i915_fini_wedge(struct i915_wedge_me *w);`

			`#define i915_wedge_on_timeout(W, DEV, TIMEOUT) \`
			`for (__i915_init_wedge((W), (DEV), (TIMEOUT), __func__); \`
			`(W)->i915; \`
			`__i915_fini_wedge((W)))`

			`#endif /* I915_RESET_H */`