drm/nouveau/fifo/gk104-: trigger mmu fault before attempting engine recovery

Greatly improves the chances of recovering the GPU from a CTXSW_TIMEOUT. Tested with piglit's arb_shader_image_load_store-atomicity, which causes GR to hang in such a way that recovery failed (CTXSW_TIMEOUT continually re-triggers). Signed-off-by: Ben Skeggs <bskeggs@redhat.com>
2017-01-18 15:37:24 +10:00 · 2017-01-18 15:37:24 +10:00 · 3ebef76a1d
parent 03f16f5f27
commit 3ebef76a1d
1 changed files with 41 additions and 0 deletions
--- a/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gk104.c
+++ b/drivers/gpu/drm/nouveau/nvkm/engine/fifo/gk104.c
@ -280,10 +280,13 @@ gk104_fifo_recover_chan(struct nvkm_fifo *base, int chid)
 static void
 gk104_fifo_recover_engn(struct gk104_fifo *fifo, int engn)
 {
+	struct nvkm_engine *engine = fifo->engine[engn].engine;
 	struct nvkm_subdev *subdev = &fifo->base.engine.subdev;
+	struct nvkm_device *device = subdev->device;
 	const u32 runl = fifo->engine[engn].runl;
 	const u32 engm = BIT(engn);
 	struct gk104_fifo_engine_status status;
+	int mmui = -1;

 	assert_spin_locked(&fifo->base.lock);
 	if (fifo->recover.engm & engm)
@ -300,6 +303,44 @@ gk104_fifo_recover_engn(struct gk104_fifo *fifo, int engn)
 		gk104_fifo_recover_chan(&fifo->base, status.chan->id);
 	}

+	/* Determine MMU fault ID for the engine, if we're not being
+	 * called from the fault handler already.
+	 */
+	if (!status.faulted && engine) {
+		mmui = nvkm_top_fault_id(device, engine->subdev.index);
+		if (mmui < 0) {
+			const struct nvkm_enum *en = fifo->func->fault.engine;
+			for (; en && en->name; en++) {
+				if (en->data2 == engine->subdev.index) {
+					mmui = en->value;
+					break;
+				}
+			}
+		}
+		WARN_ON(mmui < 0);
+	}
+
+	/* Trigger a MMU fault for the engine.
+	 *
+	 * No good idea why this is needed, but nvgpu does something similar,
+	 * and it makes recovery from CTXSW_TIMEOUT a lot more reliable.
+	 */
+	if (mmui >= 0) {
+		nvkm_wr32(device, 0x002a30 + (engn * 0x04), 0x00000100 | mmui);
+
+		/* Wait for fault to trigger. */
+		nvkm_msec(device, 2000,
+			gk104_fifo_engine_status(fifo, engn, &status);
+			if (status.faulted)
+				break;
+		);
+
+		/* Release MMU fault trigger, and ACK the fault. */
+		nvkm_wr32(device, 0x002a30 + (engn * 0x04), 0x00000000);
+		nvkm_wr32(device, 0x00259c, BIT(mmui));
+		nvkm_wr32(device, 0x002100, 0x10000000);
+	}
+
 	/* Schedule recovery. */
 	nvkm_warn(subdev, "engine %d: scheduled for recovery\n", engn);
 	schedule_work(&fifo->recover.work);