drm/i915: Include GT/seqno activity in engine/hangcheck debugfs

Whilst investigating some mysterious failures with hangcheck not running
during gem_busy/basic-hang-default, the question is why did we decide to
cancel the retire_work (which queues the hangcheck)? That decision is
based around GT activity, so include that information in the debug
report.

v2: Include the GT awake status in the error state

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20170302150356.9713-1-chris@chris-wilson.co.uk
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
2017-03-02 15:03:56 +00:00 · 2017-03-02 15:03:56 +00:00 · f73b567462
parent 25afdf89ad
commit f73b567462
3 changed files with 17 additions and 4 deletions
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@ -1361,14 +1361,17 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 	} else
 		seq_printf(m, "Hangcheck inactive\n");

+	seq_printf(m, "GT active? %s\n", yesno(dev_priv->gt.awake));
+
 	for_each_engine(engine, dev_priv, id) {
 		struct intel_breadcrumbs *b = &engine->breadcrumbs;
 		struct rb_node *rb;

 		seq_printf(m, "%s:\n", engine->name);
-		seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
+		seq_printf(m, "\tseqno = %x [current %x, last %x], inflight %d\n",
 			   engine->hangcheck.seqno, seqno[id],
-			   intel_engine_last_submit(engine));
+			   intel_engine_last_submit(engine),
+			   engine->timeline->inflight_seqnos);
 		seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s\n",
 			   yesno(intel_engine_has_waiter(engine)),
 			   yesno(test_bit(engine->id,
@ -3253,6 +3256,11 @@ static int i915_engine_info(struct seq_file *m, void *unused)

 	intel_runtime_pm_get(dev_priv);

+	seq_printf(m, "GT awake? %s\n",
+		   yesno(dev_priv->gt.awake));
+	seq_printf(m, "Global active requests: %d\n",
+		   dev_priv->gt.active_requests);
+
 	for_each_engine(engine, dev_priv, id) {
 		struct intel_breadcrumbs *b = &engine->breadcrumbs;
 		struct drm_i915_gem_request *rq;
@ -3260,11 +3268,12 @@ static int i915_engine_info(struct seq_file *m, void *unused)
 		u64 addr;

 		seq_printf(m, "%s\n", engine->name);
-		seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms]\n",
+		seq_printf(m, "\tcurrent seqno %x, last %x, hangcheck %x [%d ms], inflight %d\n",
 			   intel_engine_get_seqno(engine),
 			   intel_engine_last_submit(engine),
 			   engine->hangcheck.seqno,
-			   jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp));
+			   jiffies_to_msecs(jiffies - engine->hangcheck.action_timestamp),
+			   engine->timeline->inflight_seqnos);

 		rcu_read_lock();

--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@ -934,6 +934,7 @@ struct i915_gpu_state {

 	char error_msg[128];
 	bool simulated;
+	bool awake;
 	int iommu;
 	u32 reset_count;
 	u32 suspend_count;
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@ -632,6 +632,7 @@ int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
 			   CSR_VERSION_MINOR(csr->version));
 	}

+	err_printf(m, "GT awake: %s\n", yesno(error->awake));
 	err_printf(m, "EIR: 0x%08x\n", error->eir);
 	err_printf(m, "IER: 0x%08x\n", error->ier);
 	for (i = 0; i < error->ngtier; i++)
@ -1615,6 +1616,8 @@ static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
 static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
 				   struct i915_gpu_state *error)
 {
+	error->awake = dev_priv->gt.awake;
+
 	error->iommu = -1;
 #ifdef CONFIG_INTEL_IOMMU
 	error->iommu = intel_iommu_gfx_mapped;