2013-07-12 21:50:57 +08:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2008 Intel Corporation
|
|
|
|
*
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
* Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
*
|
|
|
|
* Authors:
|
|
|
|
* Eric Anholt <eric@anholt.net>
|
|
|
|
* Keith Packard <keithp@keithp.com>
|
|
|
|
* Mika Kuoppala <mika.kuoppala@intel.com>
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <generated/utsrelease.h>
|
2016-10-12 17:05:19 +08:00
|
|
|
#include <linux/stop_machine.h>
|
2016-10-12 17:05:22 +08:00
|
|
|
#include <linux/zlib.h>
|
2013-07-12 21:50:57 +08:00
|
|
|
#include "i915_drv.h"
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
static const char *engine_str(int engine)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2016-07-27 16:07:28 +08:00
|
|
|
switch (engine) {
|
2013-07-12 21:50:57 +08:00
|
|
|
case RCS: return "render";
|
|
|
|
case VCS: return "bsd";
|
|
|
|
case BCS: return "blt";
|
|
|
|
case VECS: return "vebox";
|
2014-04-17 10:37:37 +08:00
|
|
|
case VCS2: return "bsd2";
|
2013-07-12 21:50:57 +08:00
|
|
|
default: return "";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static const char *tiling_flag(int tiling)
|
|
|
|
{
|
|
|
|
switch (tiling) {
|
|
|
|
default:
|
|
|
|
case I915_TILING_NONE: return "";
|
|
|
|
case I915_TILING_X: return " X";
|
|
|
|
case I915_TILING_Y: return " Y";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Return " dirty" when @dirty is non-zero, otherwise an empty string.
 */
static const char *dirty_flag(int dirty)
{
	if (dirty)
		return " dirty";

	return "";
}
|
|
|
|
|
|
|
|
/*
 * Return " purgeable" when @purgeable is non-zero, otherwise an empty
 * string.
 */
static const char *purgeable_flag(int purgeable)
{
	if (purgeable)
		return " purgeable";

	return "";
}
|
|
|
|
|
|
|
|
static bool __i915_error_ok(struct drm_i915_error_state_buf *e)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (!e->err && WARN(e->bytes > (e->size - 1), "overflow")) {
|
|
|
|
e->err = -ENOSPC;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (e->bytes == e->size - 1 || e->err)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool __i915_error_seek(struct drm_i915_error_state_buf *e,
|
|
|
|
unsigned len)
|
|
|
|
{
|
|
|
|
if (e->pos + len <= e->start) {
|
|
|
|
e->pos += len;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* First vsnprintf needs to fit in its entirety for memmove */
|
|
|
|
if (len >= e->size) {
|
|
|
|
e->err = -EIO;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __i915_error_advance(struct drm_i915_error_state_buf *e,
|
|
|
|
unsigned len)
|
|
|
|
{
|
|
|
|
/* If this is first printf in this window, adjust it so that
|
|
|
|
* start position matches start of the buffer
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (e->pos < e->start) {
|
|
|
|
const size_t off = e->start - e->pos;
|
|
|
|
|
|
|
|
/* Should not happen but be paranoid */
|
|
|
|
if (off > len || e->bytes) {
|
|
|
|
e->err = -EIO;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
memmove(e->buf, e->buf + off, len - off);
|
|
|
|
e->bytes = len - off;
|
|
|
|
e->pos = e->start;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
e->bytes += len;
|
|
|
|
e->pos += len;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
|
|
|
|
const char *f, va_list args)
|
|
|
|
{
|
|
|
|
unsigned len;
|
|
|
|
|
|
|
|
if (!__i915_error_ok(e))
|
|
|
|
return;
|
|
|
|
|
|
|
|
/* Seek the first printf which is hits start position */
|
|
|
|
if (e->pos < e->start) {
|
2013-09-20 17:20:59 +08:00
|
|
|
va_list tmp;
|
|
|
|
|
|
|
|
va_copy(tmp, args);
|
2014-02-07 23:40:50 +08:00
|
|
|
len = vsnprintf(NULL, 0, f, tmp);
|
|
|
|
va_end(tmp);
|
|
|
|
|
|
|
|
if (!__i915_error_seek(e, len))
|
2013-07-12 21:50:57 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
len = vsnprintf(e->buf + e->bytes, e->size - e->bytes, f, args);
|
|
|
|
if (len >= e->size - e->bytes)
|
|
|
|
len = e->size - e->bytes - 1;
|
|
|
|
|
|
|
|
__i915_error_advance(e, len);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void i915_error_puts(struct drm_i915_error_state_buf *e,
|
|
|
|
const char *str)
|
|
|
|
{
|
|
|
|
unsigned len;
|
|
|
|
|
|
|
|
if (!__i915_error_ok(e))
|
|
|
|
return;
|
|
|
|
|
|
|
|
len = strlen(str);
|
|
|
|
|
|
|
|
/* Seek the first printf which is hits start position */
|
|
|
|
if (e->pos < e->start) {
|
|
|
|
if (!__i915_error_seek(e, len))
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (len >= e->size - e->bytes)
|
|
|
|
len = e->size - e->bytes - 1;
|
|
|
|
memcpy(e->buf + e->bytes, str, len);
|
|
|
|
|
|
|
|
__i915_error_advance(e, len);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Shorthands used throughout this file for writing to the error buffer. */
#define err_printf(e, ...) i915_error_printf((e), __VA_ARGS__)
#define err_puts(e, s) i915_error_puts((e), (s))
|
|
|
|
|
2016-10-12 17:05:22 +08:00
|
|
|
#ifdef CONFIG_DRM_I915_COMPRESS_ERROR
|
|
|
|
|
|
|
|
static bool compress_init(struct z_stream_s *zstream)
|
|
|
|
{
|
|
|
|
memset(zstream, 0, sizeof(*zstream));
|
|
|
|
|
|
|
|
zstream->workspace =
|
|
|
|
kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
|
|
|
|
GFP_ATOMIC | __GFP_NOWARN);
|
|
|
|
if (!zstream->workspace)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
|
|
|
|
kfree(zstream->workspace);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int compress_page(struct z_stream_s *zstream,
|
|
|
|
void *src,
|
|
|
|
struct drm_i915_error_object *dst)
|
|
|
|
{
|
|
|
|
zstream->next_in = src;
|
|
|
|
zstream->avail_in = PAGE_SIZE;
|
|
|
|
|
|
|
|
do {
|
|
|
|
if (zstream->avail_out == 0) {
|
|
|
|
unsigned long page;
|
|
|
|
|
|
|
|
page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
|
|
|
|
if (!page)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
dst->pages[dst->page_count++] = (void *)page;
|
|
|
|
|
|
|
|
zstream->next_out = (void *)page;
|
|
|
|
zstream->avail_out = PAGE_SIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (zlib_deflate(zstream, Z_SYNC_FLUSH) != Z_OK)
|
|
|
|
return -EIO;
|
|
|
|
} while (zstream->avail_in);
|
|
|
|
|
|
|
|
/* Fallback to uncompressed if we increase size? */
|
|
|
|
if (0 && zstream->total_out > zstream->total_in)
|
|
|
|
return -E2BIG;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void compress_fini(struct z_stream_s *zstream,
|
|
|
|
struct drm_i915_error_object *dst)
|
|
|
|
{
|
|
|
|
if (dst) {
|
|
|
|
zlib_deflate(zstream, Z_FINISH);
|
|
|
|
dst->unused = zstream->avail_out;
|
|
|
|
}
|
|
|
|
|
|
|
|
zlib_deflateEnd(zstream);
|
|
|
|
kfree(zstream->workspace);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Emit the one-character marker written ahead of object contents when
 * compression support is built in.
 */
static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	static const char marker[] = ":";

	err_puts(m, marker);
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
/*
 * Variant used when compression is not built in: there is nothing to
 * set up, so this always reports success and leaves @zstream untouched.
 */
static bool compress_init(struct z_stream_s *zstream)
{
	const bool ready = true;

	return ready;
}
|
|
|
|
|
|
|
|
static int compress_page(struct z_stream_s *zstream,
|
|
|
|
void *src,
|
|
|
|
struct drm_i915_error_object *dst)
|
|
|
|
{
|
|
|
|
unsigned long page;
|
|
|
|
|
|
|
|
page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
|
|
|
|
if (!page)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
dst->pages[dst->page_count++] =
|
|
|
|
memcpy((void *)page, src, PAGE_SIZE);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Variant used when compression is not built in: nothing was set up by
 * compress_init(), so there is nothing to release here.
 */
static void compress_fini(struct z_stream_s *zstream,
			  struct drm_i915_error_object *dst)
{
	/* Intentionally empty. */
}
|
|
|
|
|
|
|
|
/*
 * Emit the one-character marker written ahead of object contents when
 * compression support is not built in.
 */
static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	static const char marker[] = "~";

	err_puts(m, marker);
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
static void print_error_buffers(struct drm_i915_error_state_buf *m,
|
|
|
|
const char *name,
|
|
|
|
struct drm_i915_error_buffer *err,
|
|
|
|
int count)
|
|
|
|
{
|
2015-04-27 20:41:17 +08:00
|
|
|
int i;
|
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
err_printf(m, "%s [%d]:\n", name, count);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
|
|
|
while (count--) {
|
2015-07-30 00:23:56 +08:00
|
|
|
err_printf(m, " %08x_%08x %8u %02x %02x [ ",
|
|
|
|
upper_32_bits(err->gtt_offset),
|
|
|
|
lower_32_bits(err->gtt_offset),
|
2013-07-12 21:50:57 +08:00
|
|
|
err->size,
|
|
|
|
err->read_domains,
|
2015-04-27 20:41:17 +08:00
|
|
|
err->write_domain);
|
2016-03-16 19:00:39 +08:00
|
|
|
for (i = 0; i < I915_NUM_ENGINES; i++)
|
2015-04-27 20:41:17 +08:00
|
|
|
err_printf(m, "%02x ", err->rseqno[i]);
|
|
|
|
|
|
|
|
err_printf(m, "] %02x", err->wseqno);
|
2013-07-12 21:50:57 +08:00
|
|
|
err_puts(m, tiling_flag(err->tiling));
|
|
|
|
err_puts(m, dirty_flag(err->dirty));
|
|
|
|
err_puts(m, purgeable_flag(err->purgeable));
|
drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl
By exporting the ability to map user address and inserting PTEs
representing their backing pages into the GTT, we can exploit UMA in order
to utilize normal application data as a texture source or even as a
render target (depending upon the capabilities of the chipset). This has
a number of uses, with zero-copy downloads to the GPU and efficient
readback making the intermixed streaming of CPU and GPU operations
fairly efficient. This ability has many widespread implications from
faster rendering of client-side software rasterisers (chromium),
mitigation of stalls due to read back (firefox) and to faster pipelining
of texture data (such as pixel buffer objects in GL or data blobs in CL).
v2: Compile with CONFIG_MMU_NOTIFIER
v3: We can sleep while performing invalidate-range, which we can utilise
to drop our page references prior to the kernel manipulating the vma
(for either discard or cloning) and so protect normal users.
v4: Only run the invalidate notifier if the range intercepts the bo.
v5: Prevent userspace from attempting to GTT mmap non-page aligned buffers
v6: Recheck after reacquire mutex for lost mmu.
v7: Fix implicit padding of ioctl struct by rounding to next 64bit boundary.
v8: Fix rebasing error after forwarding porting the back port.
v9: Limit the userptr to page aligned entries. We now expect userspace
to handle all the offset-in-page adjustments itself.
v10: Prevent vma from being copied across fork to avoid issues with cow.
v11: Drop vma behaviour changes -- locking is nigh on impossible.
Use a worker to load user pages to avoid lock inversions.
v12: Use get_task_mm()/mmput() for correct refcounting of mm.
v13: Use a worker to release the mmu_notifier to avoid lock inversion
v14: Decouple mmu_notifier from struct_mutex using a custom mmu_notifer
with its own locking and tree of objects for each mm/mmu_notifier.
v15: Prevent overlapping userptr objects, and invalidate all objects
within the mmu_notifier range
v16: Fix a typo for iterating over multiple objects in the range and
rearrange error path to destroy the mmu_notifier locklessly.
Also close a race between invalidate_range and the get_pages_worker.
v17: Close a race between get_pages_worker/invalidate_range and fresh
allocations of the same userptr range - and notice that
struct_mutex was presumed to be held when during creation it wasn't.
v18: Sigh. Fix the refactor of st_set_pages() to allocate enough memory
for the struct sg_table and to clear it before reporting an error.
v19: Always error out on read-only userptr requests as we don't have the
hardware infrastructure to support them at the moment.
v20: Refuse to implement read-only support until we have the required
infrastructure - but reserve the bit in flags for future use.
v21: use_mm() is not required for get_user_pages(). It is only meant to
be used to fix up the kernel thread's current->mm for use with
copy_user().
v22: Use sg_alloc_table_from_pages for that chunky feeling
v23: Export a function for sanity checking dma-buf rather than encode
userptr details elsewhere, and clean up comments based on
suggestions by Bradley.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Akash Goel <akash.goel@intel.com>
Cc: "Volkin, Bradley D" <bradley.d.volkin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Reviewed-by: Brad Volkin <bradley.d.volkin@intel.com>
[danvet: Frob ioctl allocation to pick the next one - will cause a bit
of fuss with create2 apparently, but such are the rules.]
[danvet2: oops, forgot to git add after manual patch application]
[danvet3: Appease sparse.]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-05-16 21:22:37 +08:00
|
|
|
err_puts(m, err->userptr ? " userptr" : "");
|
2016-07-27 16:07:28 +08:00
|
|
|
err_puts(m, err->engine != -1 ? " " : "");
|
|
|
|
err_puts(m, engine_str(err->engine));
|
2014-08-22 21:41:39 +08:00
|
|
|
err_puts(m, i915_cache_level_str(m->i915, err->cache_level));
|
2013-07-12 21:50:57 +08:00
|
|
|
|
|
|
|
if (err->name)
|
|
|
|
err_printf(m, " (name: %d)", err->name);
|
|
|
|
if (err->fence_reg != I915_FENCE_REG_NONE)
|
|
|
|
err_printf(m, " (fence: %d)", err->fence_reg);
|
|
|
|
|
|
|
|
err_puts(m, "\n");
|
|
|
|
err++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-08-03 05:50:21 +08:00
|
|
|
static const char *hangcheck_action_to_str(enum intel_engine_hangcheck_action a)
|
2013-09-06 21:03:28 +08:00
|
|
|
{
|
|
|
|
switch (a) {
|
|
|
|
case HANGCHECK_IDLE:
|
|
|
|
return "idle";
|
|
|
|
case HANGCHECK_WAIT:
|
|
|
|
return "wait";
|
|
|
|
case HANGCHECK_ACTIVE:
|
|
|
|
return "active";
|
|
|
|
case HANGCHECK_KICK:
|
|
|
|
return "kick";
|
|
|
|
case HANGCHECK_HUNG:
|
|
|
|
return "hung";
|
|
|
|
}
|
|
|
|
|
|
|
|
return "unknown";
|
|
|
|
}
|
|
|
|
|
2016-09-20 21:54:32 +08:00
|
|
|
static void error_print_instdone(struct drm_i915_error_state_buf *m,
|
|
|
|
struct drm_i915_error_engine *ee)
|
|
|
|
{
|
2016-09-20 21:54:33 +08:00
|
|
|
int slice;
|
|
|
|
int subslice;
|
|
|
|
|
2016-09-20 21:54:32 +08:00
|
|
|
err_printf(m, " INSTDONE: 0x%08x\n",
|
|
|
|
ee->instdone.instdone);
|
|
|
|
|
|
|
|
if (ee->engine_id != RCS || INTEL_GEN(m->i915) <= 3)
|
|
|
|
return;
|
|
|
|
|
|
|
|
err_printf(m, " SC_INSTDONE: 0x%08x\n",
|
|
|
|
ee->instdone.slice_common);
|
|
|
|
|
|
|
|
if (INTEL_GEN(m->i915) <= 6)
|
|
|
|
return;
|
|
|
|
|
2016-09-20 21:54:33 +08:00
|
|
|
for_each_instdone_slice_subslice(m->i915, slice, subslice)
|
|
|
|
err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
|
|
|
|
slice, subslice,
|
|
|
|
ee->instdone.sampler[slice][subslice]);
|
|
|
|
|
|
|
|
for_each_instdone_slice_subslice(m->i915, slice, subslice)
|
|
|
|
err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n",
|
|
|
|
slice, subslice,
|
|
|
|
ee->instdone.row[slice][subslice]);
|
2016-09-20 21:54:32 +08:00
|
|
|
}
|
|
|
|
|
2016-10-13 18:18:14 +08:00
|
|
|
static void error_print_request(struct drm_i915_error_state_buf *m,
|
|
|
|
const char *prefix,
|
|
|
|
struct drm_i915_error_request *erq)
|
|
|
|
{
|
|
|
|
if (!erq->seqno)
|
|
|
|
return;
|
|
|
|
|
|
|
|
err_printf(m, "%s pid %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n",
|
|
|
|
prefix, erq->pid,
|
|
|
|
erq->context, erq->seqno,
|
|
|
|
jiffies_to_msecs(jiffies - erq->jiffies),
|
|
|
|
erq->head, erq->tail);
|
|
|
|
}
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
static void error_print_engine(struct drm_i915_error_state_buf *m,
|
|
|
|
struct drm_i915_error_engine *ee)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2016-07-27 16:07:28 +08:00
|
|
|
err_printf(m, "%s command stream:\n", engine_str(ee->engine_id));
|
|
|
|
err_printf(m, " START: 0x%08x\n", ee->start);
|
2016-10-05 04:11:30 +08:00
|
|
|
err_printf(m, " HEAD: 0x%08x\n [0x%08x]", ee->head, ee->rq_head);
|
|
|
|
err_printf(m, " TAIL: 0x%08x [0x%08x, 0x%08x]\n",
|
|
|
|
ee->tail, ee->rq_post, ee->rq_tail);
|
2016-07-27 16:07:28 +08:00
|
|
|
err_printf(m, " CTL: 0x%08x\n", ee->ctl);
|
2016-08-15 17:49:11 +08:00
|
|
|
err_printf(m, " MODE: 0x%08x\n", ee->mode);
|
2016-07-27 16:07:28 +08:00
|
|
|
err_printf(m, " HWS: 0x%08x\n", ee->hws);
|
|
|
|
err_printf(m, " ACTHD: 0x%08x %08x\n",
|
|
|
|
(u32)(ee->acthd>>32), (u32)ee->acthd);
|
|
|
|
err_printf(m, " IPEIR: 0x%08x\n", ee->ipeir);
|
|
|
|
err_printf(m, " IPEHR: 0x%08x\n", ee->ipehr);
|
2016-09-20 21:54:32 +08:00
|
|
|
|
|
|
|
error_print_instdone(m, ee);
|
|
|
|
|
2016-08-15 17:49:09 +08:00
|
|
|
if (ee->batchbuffer) {
|
|
|
|
u64 start = ee->batchbuffer->gtt_offset;
|
|
|
|
u64 end = start + ee->batchbuffer->gtt_size;
|
|
|
|
|
|
|
|
err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n",
|
|
|
|
upper_32_bits(start), lower_32_bits(start),
|
|
|
|
upper_32_bits(end), lower_32_bits(end));
|
|
|
|
}
|
2016-07-27 16:07:28 +08:00
|
|
|
if (INTEL_GEN(m->i915) >= 4) {
|
2016-08-15 17:49:09 +08:00
|
|
|
err_printf(m, " BBADDR: 0x%08x_%08x\n",
|
2016-07-27 16:07:28 +08:00
|
|
|
(u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
|
|
|
|
err_printf(m, " BB_STATE: 0x%08x\n", ee->bbstate);
|
|
|
|
err_printf(m, " INSTPS: 0x%08x\n", ee->instps);
|
2013-12-11 03:44:43 +08:00
|
|
|
}
|
2016-07-27 16:07:28 +08:00
|
|
|
err_printf(m, " INSTPM: 0x%08x\n", ee->instpm);
|
|
|
|
err_printf(m, " FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
|
|
|
|
lower_32_bits(ee->faddr));
|
|
|
|
if (INTEL_GEN(m->i915) >= 6) {
|
|
|
|
err_printf(m, " RC PSMI: 0x%08x\n", ee->rc_psmi);
|
|
|
|
err_printf(m, " FAULT_REG: 0x%08x\n", ee->fault_reg);
|
2013-07-12 21:50:57 +08:00
|
|
|
err_printf(m, " SYNC_0: 0x%08x [last synced 0x%08x]\n",
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->semaphore_mboxes[0],
|
|
|
|
ee->semaphore_seqno[0]);
|
2013-07-12 21:50:57 +08:00
|
|
|
err_printf(m, " SYNC_1: 0x%08x [last synced 0x%08x]\n",
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->semaphore_mboxes[1],
|
|
|
|
ee->semaphore_seqno[1]);
|
|
|
|
if (HAS_VEBOX(m->i915)) {
|
2013-08-13 07:53:04 +08:00
|
|
|
err_printf(m, " SYNC_2: 0x%08x [last synced 0x%08x]\n",
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->semaphore_mboxes[2],
|
|
|
|
ee->semaphore_seqno[2]);
|
2013-08-13 07:53:04 +08:00
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
2016-07-27 16:07:28 +08:00
|
|
|
if (USES_PPGTT(m->i915)) {
|
|
|
|
err_printf(m, " GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);
|
2014-01-30 16:19:40 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
if (INTEL_GEN(m->i915) >= 8) {
|
2014-01-30 16:19:40 +08:00
|
|
|
int i;
|
|
|
|
for (i = 0; i < 4; i++)
|
|
|
|
err_printf(m, " PDP%d: 0x%016llx\n",
|
2016-07-27 16:07:28 +08:00
|
|
|
i, ee->vm_info.pdp[i]);
|
2014-01-30 16:19:40 +08:00
|
|
|
} else {
|
|
|
|
err_printf(m, " PP_DIR_BASE: 0x%08x\n",
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->vm_info.pp_dir_base);
|
2014-01-30 16:19:40 +08:00
|
|
|
}
|
|
|
|
}
|
2016-07-27 16:07:28 +08:00
|
|
|
err_printf(m, " seqno: 0x%08x\n", ee->seqno);
|
|
|
|
err_printf(m, " last_seqno: 0x%08x\n", ee->last_seqno);
|
|
|
|
err_printf(m, " waiting: %s\n", yesno(ee->waiting));
|
|
|
|
err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head);
|
|
|
|
err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail);
|
2013-09-06 21:03:28 +08:00
|
|
|
err_printf(m, " hangcheck: %s [%d]\n",
|
2016-07-27 16:07:28 +08:00
|
|
|
hangcheck_action_to_str(ee->hangcheck_action),
|
|
|
|
ee->hangcheck_score);
|
2016-10-13 18:18:14 +08:00
|
|
|
error_print_request(m, " ELSP[0]: ", &ee->execlist[0]);
|
|
|
|
error_print_request(m, " ELSP[1]: ", &ee->execlist[1]);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * printf-style front end for the error buffer: collects the variadic
 * arguments and hands them to i915_error_vprintf().
 */
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
	va_list ap;

	va_start(ap, f);
	i915_error_vprintf(e, f, ap);
	va_end(ap);
}
|
|
|
|
|
2016-10-12 17:05:22 +08:00
|
|
|
/*
 * Number of 4-byte groups needed to cover @len bytes, rounding up.
 */
static int
ascii85_encode_len(int len)
{
	const int groups = DIV_ROUND_UP(len, 4);

	return groups;
}
|
|
|
|
|
|
|
|
static bool
|
|
|
|
ascii85_encode(u32 in, char *out)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (in == 0)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
out[5] = '\0';
|
|
|
|
for (i = 5; i--; ) {
|
|
|
|
out[i] = '!' + in % 85;
|
|
|
|
in /= 85;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2014-02-25 23:11:24 +08:00
|
|
|
static void print_error_obj(struct drm_i915_error_state_buf *m,
|
2016-10-12 17:05:21 +08:00
|
|
|
struct intel_engine_cs *engine,
|
|
|
|
const char *name,
|
2014-02-25 23:11:24 +08:00
|
|
|
struct drm_i915_error_object *obj)
|
|
|
|
{
|
2016-10-12 17:05:22 +08:00
|
|
|
char out[6];
|
|
|
|
int page;
|
2014-02-25 23:11:24 +08:00
|
|
|
|
2016-10-12 17:05:21 +08:00
|
|
|
if (!obj)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (name) {
|
|
|
|
err_printf(m, "%s --- %s = 0x%08x %08x\n",
|
|
|
|
engine ? engine->name : "global", name,
|
|
|
|
upper_32_bits(obj->gtt_offset),
|
|
|
|
lower_32_bits(obj->gtt_offset));
|
|
|
|
}
|
|
|
|
|
2016-10-12 17:05:22 +08:00
|
|
|
err_compression_marker(m);
|
|
|
|
for (page = 0; page < obj->page_count; page++) {
|
|
|
|
int i, len;
|
|
|
|
|
|
|
|
len = PAGE_SIZE;
|
|
|
|
if (page == obj->page_count - 1)
|
|
|
|
len -= obj->unused;
|
|
|
|
len = ascii85_encode_len(len);
|
|
|
|
|
|
|
|
for (i = 0; i < len; i++) {
|
|
|
|
if (ascii85_encode(obj->pages[page][i], out))
|
|
|
|
err_puts(m, out);
|
|
|
|
else
|
|
|
|
err_puts(m, "z");
|
2014-02-25 23:11:24 +08:00
|
|
|
}
|
|
|
|
}
|
2016-10-12 17:05:22 +08:00
|
|
|
err_puts(m, "\n");
|
2014-02-25 23:11:24 +08:00
|
|
|
}
|
|
|
|
|
2016-08-15 17:48:45 +08:00
|
|
|
static void err_print_capabilities(struct drm_i915_error_state_buf *m,
|
|
|
|
const struct intel_device_info *info)
|
|
|
|
{
|
|
|
|
#define PRINT_FLAG(x) err_printf(m, #x ": %s\n", yesno(info->x))
|
2016-10-05 18:50:16 +08:00
|
|
|
DEV_INFO_FOR_EACH_FLAG(PRINT_FLAG);
|
2016-08-15 17:48:45 +08:00
|
|
|
#undef PRINT_FLAG
|
|
|
|
}
|
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
|
|
|
|
const struct i915_error_state_file_priv *error_priv)
|
|
|
|
{
|
|
|
|
struct drm_device *dev = error_priv->dev;
|
2016-07-04 18:34:36 +08:00
|
|
|
struct drm_i915_private *dev_priv = to_i915(dev);
|
2016-08-22 18:32:44 +08:00
|
|
|
struct pci_dev *pdev = dev_priv->drm.pdev;
|
2013-07-12 21:50:57 +08:00
|
|
|
struct drm_i915_error_state *error = error_priv->error;
|
2014-07-01 00:53:41 +08:00
|
|
|
struct drm_i915_error_object *obj;
|
2014-02-25 23:11:24 +08:00
|
|
|
int max_hangcheck_score;
|
2016-10-12 17:05:21 +08:00
|
|
|
int i, j;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
|
|
|
if (!error) {
|
|
|
|
err_printf(m, "no error state collected\n");
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2014-02-25 23:11:25 +08:00
|
|
|
err_printf(m, "%s\n", error->error_msg);
|
2013-07-12 21:50:57 +08:00
|
|
|
err_printf(m, "Time: %ld s %ld us\n", error->time.tv_sec,
|
|
|
|
error->time.tv_usec);
|
|
|
|
err_printf(m, "Kernel: " UTS_RELEASE "\n");
|
2016-08-15 17:48:45 +08:00
|
|
|
err_print_capabilities(m, &error->device_info);
|
2014-02-25 23:11:24 +08:00
|
|
|
max_hangcheck_score = 0;
|
2016-07-27 16:07:28 +08:00
|
|
|
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
|
|
|
|
if (error->engine[i].hangcheck_score > max_hangcheck_score)
|
|
|
|
max_hangcheck_score = error->engine[i].hangcheck_score;
|
2014-02-25 23:11:24 +08:00
|
|
|
}
|
2016-07-27 16:07:28 +08:00
|
|
|
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
|
|
|
|
if (error->engine[i].hangcheck_score == max_hangcheck_score &&
|
|
|
|
error->engine[i].pid != -1) {
|
2014-02-25 23:11:24 +08:00
|
|
|
err_printf(m, "Active process (on ring %s): %s [%d]\n",
|
2016-07-27 16:07:28 +08:00
|
|
|
engine_str(i),
|
|
|
|
error->engine[i].comm,
|
|
|
|
error->engine[i].pid);
|
2014-02-25 23:11:24 +08:00
|
|
|
}
|
|
|
|
}
|
2014-02-25 23:11:27 +08:00
|
|
|
err_printf(m, "Reset count: %u\n", error->reset_count);
|
2014-02-25 23:11:28 +08:00
|
|
|
err_printf(m, "Suspend count: %u\n", error->suspend_count);
|
2016-08-22 18:32:44 +08:00
|
|
|
err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
|
|
|
|
err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
|
2016-01-29 01:18:41 +08:00
|
|
|
err_printf(m, "PCI Subsystem: %04x:%04x\n",
|
2016-08-22 18:32:44 +08:00
|
|
|
pdev->subsystem_vendor,
|
|
|
|
pdev->subsystem_device);
|
2015-08-08 03:24:15 +08:00
|
|
|
err_printf(m, "IOMMU enabled?: %d\n", error->iommu);
|
2015-10-29 21:21:19 +08:00
|
|
|
|
|
|
|
if (HAS_CSR(dev)) {
|
|
|
|
struct intel_csr *csr = &dev_priv->csr;
|
|
|
|
|
|
|
|
err_printf(m, "DMC loaded: %s\n",
|
|
|
|
yesno(csr->dmc_payload != NULL));
|
|
|
|
err_printf(m, "DMC fw version: %d.%d\n",
|
|
|
|
CSR_VERSION_MAJOR(csr->version),
|
|
|
|
CSR_VERSION_MINOR(csr->version));
|
|
|
|
}
|
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
err_printf(m, "EIR: 0x%08x\n", error->eir);
|
|
|
|
err_printf(m, "IER: 0x%08x\n", error->ier);
|
2014-08-06 01:07:13 +08:00
|
|
|
if (INTEL_INFO(dev)->gen >= 8) {
|
|
|
|
for (i = 0; i < 4; i++)
|
|
|
|
err_printf(m, "GTIER gt %d: 0x%08x\n", i,
|
|
|
|
error->gtier[i]);
|
|
|
|
} else if (HAS_PCH_SPLIT(dev) || IS_VALLEYVIEW(dev))
|
|
|
|
err_printf(m, "GTIER: 0x%08x\n", error->gtier[0]);
|
2013-07-12 21:50:57 +08:00
|
|
|
err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
|
|
|
|
err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
|
|
|
|
err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
|
|
|
|
err_printf(m, "CCID: 0x%08x\n", error->ccid);
|
2013-09-26 00:34:55 +08:00
|
|
|
err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
|
|
|
for (i = 0; i < dev_priv->num_fence_regs; i++)
|
|
|
|
err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]);
|
|
|
|
|
|
|
|
if (INTEL_INFO(dev)->gen >= 6) {
|
|
|
|
err_printf(m, "ERROR: 0x%08x\n", error->error);
|
2015-03-24 20:54:19 +08:00
|
|
|
|
|
|
|
if (INTEL_INFO(dev)->gen >= 8)
|
|
|
|
err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
|
|
|
|
error->fault_data1, error->fault_data0);
|
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
|
|
|
|
}
|
|
|
|
|
2016-05-10 17:57:06 +08:00
|
|
|
if (IS_GEN7(dev))
|
2013-07-12 21:50:57 +08:00
|
|
|
err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
|
|
|
|
if (error->engine[i].engine_id != -1)
|
|
|
|
error_print_engine(m, &error->engine[i]);
|
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
|
|
|
|
char buf[128];
|
|
|
|
int len, first = 1;
|
2014-08-13 03:05:47 +08:00
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
if (!error->active_vm[i])
|
|
|
|
break;
|
|
|
|
|
|
|
|
len = scnprintf(buf, sizeof(buf), "Active (");
|
|
|
|
for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
|
|
|
|
if (error->engine[j].vm != error->active_vm[i])
|
|
|
|
continue;
|
|
|
|
|
|
|
|
len += scnprintf(buf + len, sizeof(buf), "%s%s",
|
|
|
|
first ? "" : ", ",
|
|
|
|
dev_priv->engine[j].name);
|
|
|
|
first = 0;
|
|
|
|
}
|
|
|
|
scnprintf(buf + len, sizeof(buf), ")");
|
|
|
|
print_error_buffers(m, buf,
|
2014-08-13 03:05:47 +08:00
|
|
|
error->active_bo[i],
|
|
|
|
error->active_bo_count[i]);
|
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
print_error_buffers(m, "Pinned (global)",
|
|
|
|
error->pinned_bo,
|
|
|
|
error->pinned_bo_count);
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
|
|
|
|
struct drm_i915_error_engine *ee = &error->engine[i];
|
|
|
|
|
|
|
|
obj = ee->batchbuffer;
|
2014-02-25 23:11:24 +08:00
|
|
|
if (obj) {
|
2016-03-16 19:00:38 +08:00
|
|
|
err_puts(m, dev_priv->engine[i].name);
|
2016-07-27 16:07:28 +08:00
|
|
|
if (ee->pid != -1)
|
2014-02-25 23:11:24 +08:00
|
|
|
err_printf(m, " (submitted by %s [%d])",
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->comm,
|
|
|
|
ee->pid);
|
2015-07-30 00:23:56 +08:00
|
|
|
err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
|
|
|
|
upper_32_bits(obj->gtt_offset),
|
|
|
|
lower_32_bits(obj->gtt_offset));
|
2016-10-12 17:05:21 +08:00
|
|
|
print_error_obj(m, &dev_priv->engine[i], NULL, obj);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
if (ee->num_requests) {
|
2013-07-12 21:50:57 +08:00
|
|
|
err_printf(m, "%s --- %d requests\n",
|
2016-03-16 19:00:38 +08:00
|
|
|
dev_priv->engine[i].name,
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->num_requests);
|
2016-10-13 18:18:14 +08:00
|
|
|
for (j = 0; j < ee->num_requests; j++)
|
|
|
|
error_print_request(m, " ", &ee->requests[j]);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2016-09-06 15:38:44 +08:00
|
|
|
if (IS_ERR(ee->waiters)) {
|
|
|
|
err_printf(m, "%s --- ? waiters [unable to acquire spinlock]\n",
|
|
|
|
dev_priv->engine[i].name);
|
|
|
|
} else if (ee->num_waiters) {
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
err_printf(m, "%s --- %d waiters\n",
|
|
|
|
dev_priv->engine[i].name,
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->num_waiters);
|
|
|
|
for (j = 0; j < ee->num_waiters; j++) {
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
err_printf(m, " seqno 0x%08x for %s [%d]\n",
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->waiters[j].seqno,
|
|
|
|
ee->waiters[j].comm,
|
|
|
|
ee->waiters[j].pid);
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-10-12 17:05:21 +08:00
|
|
|
print_error_obj(m, &dev_priv->engine[i],
|
|
|
|
"ringbuffer", ee->ringbuffer);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-10-12 17:05:21 +08:00
|
|
|
print_error_obj(m, &dev_priv->engine[i],
|
|
|
|
"HW Status", ee->hws_page);
|
2015-09-16 01:03:01 +08:00
|
|
|
|
2016-10-12 17:05:21 +08:00
|
|
|
print_error_obj(m, &dev_priv->engine[i],
|
|
|
|
"HW context", ee->ctx);
|
2014-01-24 06:40:36 +08:00
|
|
|
|
2016-10-12 17:05:21 +08:00
|
|
|
print_error_obj(m, &dev_priv->engine[i],
|
|
|
|
"WA context", ee->wa_ctx);
|
2016-03-01 19:24:36 +08:00
|
|
|
|
2016-10-12 17:05:21 +08:00
|
|
|
print_error_obj(m, &dev_priv->engine[i],
|
|
|
|
"WA batchbuffer", ee->wa_batchbuffer);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2016-10-12 17:05:21 +08:00
|
|
|
print_error_obj(m, NULL, "Semaphores", error->semaphore);
|
2014-07-01 00:53:41 +08:00
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
if (error->overlay)
|
|
|
|
intel_overlay_print_error_state(m, error->overlay);
|
|
|
|
|
|
|
|
if (error->display)
|
|
|
|
intel_display_print_error_state(m, dev, error->display);
|
|
|
|
|
|
|
|
out:
|
|
|
|
if (m->bytes == 0 && m->err)
|
|
|
|
return m->err;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int i915_error_state_buf_init(struct drm_i915_error_state_buf *ebuf,
|
2014-08-22 21:41:39 +08:00
|
|
|
struct drm_i915_private *i915,
|
2013-07-12 21:50:57 +08:00
|
|
|
size_t count, loff_t pos)
|
|
|
|
{
|
|
|
|
memset(ebuf, 0, sizeof(*ebuf));
|
2014-08-22 21:41:39 +08:00
|
|
|
ebuf->i915 = i915;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
|
|
|
/* We need to have enough room to store any i915_error_state printf
|
|
|
|
* so that we can move it to start position.
|
|
|
|
*/
|
|
|
|
ebuf->size = count + 1 > PAGE_SIZE ? count + 1 : PAGE_SIZE;
|
|
|
|
ebuf->buf = kmalloc(ebuf->size,
|
|
|
|
GFP_TEMPORARY | __GFP_NORETRY | __GFP_NOWARN);
|
|
|
|
|
|
|
|
if (ebuf->buf == NULL) {
|
|
|
|
ebuf->size = PAGE_SIZE;
|
|
|
|
ebuf->buf = kmalloc(ebuf->size, GFP_TEMPORARY);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ebuf->buf == NULL) {
|
|
|
|
ebuf->size = 128;
|
|
|
|
ebuf->buf = kmalloc(ebuf->size, GFP_TEMPORARY);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ebuf->buf == NULL)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ebuf->start = pos;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void i915_error_object_free(struct drm_i915_error_object *obj)
|
|
|
|
{
|
|
|
|
int page;
|
|
|
|
|
|
|
|
if (obj == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (page = 0; page < obj->page_count; page++)
|
2016-10-12 17:05:20 +08:00
|
|
|
free_page((unsigned long)obj->pages[page]);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
|
|
|
kfree(obj);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void i915_error_state_free(struct kref *error_ref)
|
|
|
|
{
|
|
|
|
struct drm_i915_error_state *error = container_of(error_ref,
|
|
|
|
typeof(*error), ref);
|
|
|
|
int i;
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
|
|
|
|
struct drm_i915_error_engine *ee = &error->engine[i];
|
|
|
|
|
|
|
|
i915_error_object_free(ee->batchbuffer);
|
|
|
|
i915_error_object_free(ee->wa_batchbuffer);
|
|
|
|
i915_error_object_free(ee->ringbuffer);
|
|
|
|
i915_error_object_free(ee->hws_page);
|
|
|
|
i915_error_object_free(ee->ctx);
|
|
|
|
i915_error_object_free(ee->wa_ctx);
|
|
|
|
|
|
|
|
kfree(ee->requests);
|
2016-09-06 15:38:44 +08:00
|
|
|
if (!IS_ERR_OR_NULL(ee->waiters))
|
|
|
|
kfree(ee->waiters);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2016-08-15 17:49:02 +08:00
|
|
|
i915_error_object_free(error->semaphore);
|
2015-03-20 17:41:03 +08:00
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
|
2015-03-20 17:41:03 +08:00
|
|
|
kfree(error->active_bo[i]);
|
|
|
|
kfree(error->pinned_bo);
|
2016-08-15 17:48:41 +08:00
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
kfree(error->overlay);
|
|
|
|
kfree(error->display);
|
|
|
|
kfree(error);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct drm_i915_error_object *
|
2016-10-12 17:05:20 +08:00
|
|
|
i915_error_object_create(struct drm_i915_private *i915,
|
2016-08-15 17:49:06 +08:00
|
|
|
struct i915_vma *vma)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2016-10-12 17:05:20 +08:00
|
|
|
struct i915_ggtt *ggtt = &i915->ggtt;
|
|
|
|
const u64 slot = ggtt->error_capture.start;
|
2013-07-12 21:50:57 +08:00
|
|
|
struct drm_i915_error_object *dst;
|
2016-10-12 17:05:22 +08:00
|
|
|
struct z_stream_s zstream;
|
2016-10-12 17:05:20 +08:00
|
|
|
unsigned long num_pages;
|
|
|
|
struct sgt_iter iter;
|
|
|
|
dma_addr_t dma;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-08-15 17:49:06 +08:00
|
|
|
if (!vma)
|
|
|
|
return NULL;
|
|
|
|
|
2016-10-12 17:05:20 +08:00
|
|
|
num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
|
2016-10-12 17:05:22 +08:00
|
|
|
num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
|
2016-10-12 17:05:20 +08:00
|
|
|
dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
|
|
|
|
GFP_ATOMIC | __GFP_NOWARN);
|
2016-08-15 17:49:06 +08:00
|
|
|
if (!dst)
|
2013-07-12 21:50:57 +08:00
|
|
|
return NULL;
|
|
|
|
|
2016-08-15 17:49:09 +08:00
|
|
|
dst->gtt_offset = vma->node.start;
|
|
|
|
dst->gtt_size = vma->node.size;
|
2016-10-12 17:05:20 +08:00
|
|
|
dst->page_count = 0;
|
2016-10-12 17:05:22 +08:00
|
|
|
dst->unused = 0;
|
|
|
|
|
|
|
|
if (!compress_init(&zstream)) {
|
|
|
|
kfree(dst);
|
|
|
|
return NULL;
|
|
|
|
}
|
2016-08-15 17:49:09 +08:00
|
|
|
|
2016-10-12 17:05:20 +08:00
|
|
|
for_each_sgt_dma(dma, iter, vma->pages) {
|
|
|
|
void __iomem *s;
|
|
|
|
int ret;
|
2014-08-13 03:05:48 +08:00
|
|
|
|
2016-10-12 17:05:20 +08:00
|
|
|
ggtt->base.insert_page(&ggtt->base, dma, slot,
|
|
|
|
I915_CACHE_NONE, 0);
|
2014-08-13 03:05:48 +08:00
|
|
|
|
2016-10-12 17:05:20 +08:00
|
|
|
s = io_mapping_map_atomic_wc(&ggtt->mappable, slot);
|
2016-10-12 17:05:22 +08:00
|
|
|
ret = compress_page(&zstream, (void __force *)s, dst);
|
2016-10-12 17:05:20 +08:00
|
|
|
io_mapping_unmap_atomic(s);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-10-12 17:05:20 +08:00
|
|
|
if (ret)
|
2013-07-12 21:50:57 +08:00
|
|
|
goto unwind;
|
|
|
|
}
|
2016-10-12 17:05:20 +08:00
|
|
|
goto out;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
|
|
|
unwind:
|
2016-10-12 17:05:20 +08:00
|
|
|
while (dst->page_count--)
|
|
|
|
free_page((unsigned long)dst->pages[dst->page_count]);
|
2013-07-12 21:50:57 +08:00
|
|
|
kfree(dst);
|
2016-10-12 17:05:20 +08:00
|
|
|
dst = NULL;
|
|
|
|
|
|
|
|
out:
|
2016-10-12 17:05:22 +08:00
|
|
|
compress_fini(&zstream, dst);
|
2016-10-12 17:05:20 +08:00
|
|
|
ggtt->base.clear_range(&ggtt->base, slot, PAGE_SIZE, true);
|
|
|
|
return dst;
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2016-08-04 14:52:31 +08:00
|
|
|
/* The error capture is special as tries to run underneath the normal
|
|
|
|
* locking rules - so we use the raw version of the i915_gem_active lookup.
|
|
|
|
*/
|
|
|
|
static inline uint32_t
|
|
|
|
__active_get_seqno(struct i915_gem_active *active)
|
|
|
|
{
|
|
|
|
return i915_gem_request_get_seqno(__i915_gem_active_peek(active));
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int
|
|
|
|
__active_get_engine_id(struct i915_gem_active *active)
|
|
|
|
{
|
|
|
|
struct intel_engine_cs *engine;
|
|
|
|
|
|
|
|
engine = i915_gem_request_get_engine(__i915_gem_active_peek(active));
|
|
|
|
return engine ? engine->id : -1;
|
|
|
|
}
|
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
static void capture_bo(struct drm_i915_error_buffer *err,
|
2014-08-13 03:05:47 +08:00
|
|
|
struct i915_vma *vma)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2014-08-13 03:05:47 +08:00
|
|
|
struct drm_i915_gem_object *obj = vma->obj;
|
2015-04-27 20:41:17 +08:00
|
|
|
int i;
|
2014-08-13 03:05:47 +08:00
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
err->size = obj->base.size;
|
|
|
|
err->name = obj->base.name;
|
2016-08-04 14:52:31 +08:00
|
|
|
|
2016-03-16 19:00:39 +08:00
|
|
|
for (i = 0; i < I915_NUM_ENGINES; i++)
|
2016-08-04 14:52:31 +08:00
|
|
|
err->rseqno[i] = __active_get_seqno(&obj->last_read[i]);
|
|
|
|
err->wseqno = __active_get_seqno(&obj->last_write);
|
|
|
|
err->engine = __active_get_engine_id(&obj->last_write);
|
|
|
|
|
2014-08-13 03:05:47 +08:00
|
|
|
err->gtt_offset = vma->node.start;
|
2013-07-12 21:50:57 +08:00
|
|
|
err->read_domains = obj->base.read_domains;
|
|
|
|
err->write_domain = obj->base.write_domain;
|
2016-08-19 00:17:00 +08:00
|
|
|
err->fence_reg = vma->fence ? vma->fence->id : -1;
|
2016-08-05 17:14:23 +08:00
|
|
|
err->tiling = i915_gem_object_get_tiling(obj);
|
2013-07-12 21:50:57 +08:00
|
|
|
err->dirty = obj->dirty;
|
|
|
|
err->purgeable = obj->madv != I915_MADV_WILLNEED;
|
drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl
By exporting the ability to map user address and inserting PTEs
representing their backing pages into the GTT, we can exploit UMA in order
to utilize normal application data as a texture source or even as a
render target (depending upon the capabilities of the chipset). This has
a number of uses, with zero-copy downloads to the GPU and efficient
readback making the intermixed streaming of CPU and GPU operations
fairly efficient. This ability has many widespread implications from
faster rendering of client-side software rasterisers (chromium),
mitigation of stalls due to read back (firefox) and to faster pipelining
of texture data (such as pixel buffer objects in GL or data blobs in CL).
v2: Compile with CONFIG_MMU_NOTIFIER
v3: We can sleep while performing invalidate-range, which we can utilise
to drop our page references prior to the kernel manipulating the vma
(for either discard or cloning) and so protect normal users.
v4: Only run the invalidate notifier if the range intercepts the bo.
v5: Prevent userspace from attempting to GTT mmap non-page aligned buffers
v6: Recheck after reacquire mutex for lost mmu.
v7: Fix implicit padding of ioctl struct by rounding to next 64bit boundary.
v8: Fix rebasing error after forwarding porting the back port.
v9: Limit the userptr to page aligned entries. We now expect userspace
to handle all the offset-in-page adjustments itself.
v10: Prevent vma from being copied across fork to avoid issues with cow.
v11: Drop vma behaviour changes -- locking is nigh on impossible.
Use a worker to load user pages to avoid lock inversions.
v12: Use get_task_mm()/mmput() for correct refcounting of mm.
v13: Use a worker to release the mmu_notifier to avoid lock inversion
v14: Decouple mmu_notifier from struct_mutex using a custom mmu_notifer
with its own locking and tree of objects for each mm/mmu_notifier.
v15: Prevent overlapping userptr objects, and invalidate all objects
within the mmu_notifier range
v16: Fix a typo for iterating over multiple objects in the range and
rearrange error path to destroy the mmu_notifier locklessly.
Also close a race between invalidate_range and the get_pages_worker.
v17: Close a race between get_pages_worker/invalidate_range and fresh
allocations of the same userptr range - and notice that
struct_mutex was presumed to be held when during creation it wasn't.
v18: Sigh. Fix the refactor of st_set_pages() to allocate enough memory
for the struct sg_table and to clear it before reporting an error.
v19: Always error out on read-only userptr requests as we don't have the
hardware infrastructure to support them at the moment.
v20: Refuse to implement read-only support until we have the required
infrastructure - but reserve the bit in flags for future use.
v21: use_mm() is not required for get_user_pages(). It is only meant to
be used to fix up the kernel thread's current->mm for use with
copy_user().
v22: Use sg_alloc_table_from_pages for that chunky feeling
v23: Export a function for sanity checking dma-buf rather than encode
userptr details elsewhere, and clean up comments based on
suggestions by Bradley.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Akash Goel <akash.goel@intel.com>
Cc: "Volkin, Bradley D" <bradley.d.volkin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Reviewed-by: Brad Volkin <bradley.d.volkin@intel.com>
[danvet: Frob ioctl allocation to pick the next one - will cause a bit
of fuss with create2 apparently, but such are the rules.]
[danvet2: oops, forgot to git add after manual patch application]
[danvet3: Appease sparse.]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-05-16 21:22:37 +08:00
|
|
|
err->userptr = obj->userptr.mm != NULL;
|
2013-07-12 21:50:57 +08:00
|
|
|
err->cache_level = obj->cache_level;
|
|
|
|
}
|
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
static u32 capture_error_bo(struct drm_i915_error_buffer *err,
|
|
|
|
int count, struct list_head *head,
|
|
|
|
bool pinned_only)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2013-08-01 08:00:14 +08:00
|
|
|
struct i915_vma *vma;
|
2013-07-12 21:50:57 +08:00
|
|
|
int i = 0;
|
|
|
|
|
2016-02-26 19:03:19 +08:00
|
|
|
list_for_each_entry(vma, head, vm_link) {
|
2016-08-15 17:48:41 +08:00
|
|
|
if (pinned_only && !i915_vma_is_pinned(vma))
|
|
|
|
continue;
|
|
|
|
|
2014-08-13 03:05:47 +08:00
|
|
|
capture_bo(err++, vma);
|
2013-07-12 21:50:57 +08:00
|
|
|
if (++i == count)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return i;
|
|
|
|
}
|
|
|
|
|
2014-02-04 20:18:55 +08:00
|
|
|
/* Generate a semi-unique error code. The code is not meant to have meaning, The
|
|
|
|
* code's only purpose is to try to prevent false duplicated bug reports by
|
|
|
|
* grossly estimating a GPU error state.
|
|
|
|
*
|
|
|
|
* TODO Ideally, hashing the batchbuffer would be a very nice way to determine
|
|
|
|
* the hang if we could strip the GTT offset information from it.
|
|
|
|
*
|
|
|
|
* It's only a small step better than a random number in its current form.
|
|
|
|
*/
|
|
|
|
static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
|
2014-02-25 23:11:25 +08:00
|
|
|
struct drm_i915_error_state *error,
|
2016-07-27 16:07:28 +08:00
|
|
|
int *engine_id)
|
2014-02-04 20:18:55 +08:00
|
|
|
{
|
|
|
|
uint32_t error_code = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
/* IPEHR would be an ideal way to detect errors, as it's the gross
|
|
|
|
* measure of "the command that hung." However, has some very common
|
|
|
|
* synchronization commands which almost always appear in the case
|
|
|
|
* strictly a client bug. Use instdone to differentiate those some.
|
|
|
|
*/
|
2016-03-16 19:00:39 +08:00
|
|
|
for (i = 0; i < I915_NUM_ENGINES; i++) {
|
2016-07-27 16:07:28 +08:00
|
|
|
if (error->engine[i].hangcheck_action == HANGCHECK_HUNG) {
|
|
|
|
if (engine_id)
|
|
|
|
*engine_id = i;
|
2014-02-25 23:11:25 +08:00
|
|
|
|
2016-09-20 21:54:32 +08:00
|
|
|
return error->engine[i].ipehr ^
|
|
|
|
error->engine[i].instdone.instdone;
|
2014-02-25 23:11:25 +08:00
|
|
|
}
|
|
|
|
}
|
2014-02-04 20:18:55 +08:00
|
|
|
|
|
|
|
return error_code;
|
|
|
|
}
|
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
static void i915_gem_record_fences(struct drm_i915_private *dev_priv,
|
2013-07-12 21:50:57 +08:00
|
|
|
struct drm_i915_error_state *error)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
if (IS_GEN3(dev_priv) || IS_GEN2(dev_priv)) {
|
2014-12-04 22:48:10 +08:00
|
|
|
for (i = 0; i < dev_priv->num_fence_regs; i++)
|
2015-09-21 23:05:14 +08:00
|
|
|
error->fence[i] = I915_READ(FENCE_REG(i));
|
2016-05-06 22:40:21 +08:00
|
|
|
} else if (IS_GEN5(dev_priv) || IS_GEN4(dev_priv)) {
|
2015-09-21 23:05:14 +08:00
|
|
|
for (i = 0; i < dev_priv->num_fence_regs; i++)
|
|
|
|
error->fence[i] = I915_READ64(FENCE_REG_965_LO(i));
|
2016-05-06 22:40:21 +08:00
|
|
|
} else if (INTEL_GEN(dev_priv) >= 6) {
|
2015-09-21 23:05:14 +08:00
|
|
|
for (i = 0; i < dev_priv->num_fence_regs; i++)
|
|
|
|
error->fence[i] = I915_READ64(FENCE_REG_GEN6_LO(i));
|
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2014-07-01 00:53:40 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
static void gen8_record_semaphore_state(struct drm_i915_error_state *error,
|
2016-03-16 19:00:37 +08:00
|
|
|
struct intel_engine_cs *engine,
|
2016-07-27 16:07:28 +08:00
|
|
|
struct drm_i915_error_engine *ee)
|
2014-07-01 00:53:41 +08:00
|
|
|
{
|
2016-07-27 16:07:28 +08:00
|
|
|
struct drm_i915_private *dev_priv = engine->i915;
|
drm/i915: Fix possible overflow when recording semaphore states.
semaphore _sync_seqno, _seqno and _mbox are smaller than number of rings.
This optimization is to remove the ring itself from the list and the logic to do that
is at intel_ring_sync_index as below:
/*
* rcs -> 0 = vcs, 1 = bcs, 2 = vecs, 3 = vcs2;
* vcs -> 0 = bcs, 1 = vecs, 2 = vcs2, 3 = rcs;
* bcs -> 0 = vecs, 1 = vcs2. 2 = rcs, 3 = vcs;
* vecs -> 0 = vcs2, 1 = rcs, 2 = vcs, 3 = bcs;
* vcs2 -> 0 = rcs, 1 = vcs, 2 = bcs, 3 = vecs;
*/
v2: Skip when from == to (Damien).
v3: avoid computing idx when from == to (Damien).
use ring == to instead of ring->id == to->id (Damien).
use continue instead of return (Rodrigo).
v4: avoid all unecessary computation (Damien).
reduce idx to loop scope (Damien).
Cc: Damien Lespiau <damien.lespiau@intel.com>
Cc: Ben Widawsky <benjamin.widawsky@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Damien Lespiau <damien.lespiau@intel.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-07-18 17:19:40 +08:00
|
|
|
struct intel_engine_cs *to;
|
2016-03-24 02:19:53 +08:00
|
|
|
enum intel_engine_id id;
|
2014-07-01 00:53:41 +08:00
|
|
|
|
2016-08-15 17:49:02 +08:00
|
|
|
if (!error->semaphore)
|
2016-07-27 16:07:28 +08:00
|
|
|
return;
|
2014-07-01 00:53:41 +08:00
|
|
|
|
2016-03-24 02:19:53 +08:00
|
|
|
for_each_engine_id(to, dev_priv, id) {
|
drm/i915: Fix possible overflow when recording semaphore states.
semaphore _sync_seqno, _seqno and _mbox are smaller than number of rings.
This optimization is to remove the ring itself from the list and the logic to do that
is at intel_ring_sync_index as below:
/*
* rcs -> 0 = vcs, 1 = bcs, 2 = vecs, 3 = vcs2;
* vcs -> 0 = bcs, 1 = vecs, 2 = vcs2, 3 = rcs;
* bcs -> 0 = vecs, 1 = vcs2. 2 = rcs, 3 = vcs;
* vecs -> 0 = vcs2, 1 = rcs, 2 = vcs, 3 = bcs;
* vcs2 -> 0 = rcs, 1 = vcs, 2 = bcs, 3 = vecs;
*/
v2: Skip when from == to (Damien).
v3: avoid computing idx when from == to (Damien).
use ring == to instead of ring->id == to->id (Damien).
use continue instead of return (Rodrigo).
v4: avoid all unecessary computation (Damien).
reduce idx to loop scope (Damien).
Cc: Damien Lespiau <damien.lespiau@intel.com>
Cc: Ben Widawsky <benjamin.widawsky@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Damien Lespiau <damien.lespiau@intel.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-07-18 17:19:40 +08:00
|
|
|
int idx;
|
|
|
|
u16 signal_offset;
|
|
|
|
u32 *tmp;
|
2014-07-01 00:53:41 +08:00
|
|
|
|
2016-03-16 19:00:37 +08:00
|
|
|
if (engine == to)
|
drm/i915: Fix possible overflow when recording semaphore states.
semaphore _sync_seqno, _seqno and _mbox are smaller than number of rings.
This optimization is to remove the ring itself from the list and the logic to do that
is at intel_ring_sync_index as below:
/*
* rcs -> 0 = vcs, 1 = bcs, 2 = vecs, 3 = vcs2;
* vcs -> 0 = bcs, 1 = vecs, 2 = vcs2, 3 = rcs;
* bcs -> 0 = vecs, 1 = vcs2. 2 = rcs, 3 = vcs;
* vecs -> 0 = vcs2, 1 = rcs, 2 = vcs, 3 = bcs;
* vcs2 -> 0 = rcs, 1 = vcs, 2 = bcs, 3 = vecs;
*/
v2: Skip when from == to (Damien).
v3: avoid computing idx when from == to (Damien).
use ring == to instead of ring->id == to->id (Damien).
use continue instead of return (Rodrigo).
v4: avoid all unecessary computation (Damien).
reduce idx to loop scope (Damien).
Cc: Damien Lespiau <damien.lespiau@intel.com>
Cc: Ben Widawsky <benjamin.widawsky@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Damien Lespiau <damien.lespiau@intel.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-07-18 17:19:40 +08:00
|
|
|
continue;
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
signal_offset =
|
|
|
|
(GEN8_SIGNAL_OFFSET(engine, id) & (PAGE_SIZE - 1)) / 4;
|
2016-08-15 17:49:02 +08:00
|
|
|
tmp = error->semaphore->pages[0];
|
2016-08-03 05:50:21 +08:00
|
|
|
idx = intel_engine_sync_index(engine, to);
|
drm/i915: Fix possible overflow when recording semaphore states.
semaphore _sync_seqno, _seqno and _mbox are smaller than number of rings.
This optimization is to remove the ring itself from the list and the logic to do that
is at intel_ring_sync_index as below:
/*
* rcs -> 0 = vcs, 1 = bcs, 2 = vecs, 3 = vcs2;
* vcs -> 0 = bcs, 1 = vecs, 2 = vcs2, 3 = rcs;
* bcs -> 0 = vecs, 1 = vcs2. 2 = rcs, 3 = vcs;
* vecs -> 0 = vcs2, 1 = rcs, 2 = vcs, 3 = bcs;
* vcs2 -> 0 = rcs, 1 = vcs, 2 = bcs, 3 = vecs;
*/
v2: Skip when from == to (Damien).
v3: avoid computing idx when from == to (Damien).
use ring == to instead of ring->id == to->id (Damien).
use continue instead of return (Rodrigo).
v4: avoid all unecessary computation (Damien).
reduce idx to loop scope (Damien).
Cc: Damien Lespiau <damien.lespiau@intel.com>
Cc: Ben Widawsky <benjamin.widawsky@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Damien Lespiau <damien.lespiau@intel.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-07-18 17:19:40 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->semaphore_mboxes[idx] = tmp[signal_offset];
|
|
|
|
ee->semaphore_seqno[idx] = engine->semaphore.sync_seqno[idx];
|
2014-07-01 00:53:41 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
static void gen6_record_semaphore_state(struct intel_engine_cs *engine,
|
|
|
|
struct drm_i915_error_engine *ee)
|
2014-07-01 00:53:40 +08:00
|
|
|
{
|
2016-07-27 16:07:28 +08:00
|
|
|
struct drm_i915_private *dev_priv = engine->i915;
|
|
|
|
|
|
|
|
ee->semaphore_mboxes[0] = I915_READ(RING_SYNC_0(engine->mmio_base));
|
|
|
|
ee->semaphore_mboxes[1] = I915_READ(RING_SYNC_1(engine->mmio_base));
|
|
|
|
ee->semaphore_seqno[0] = engine->semaphore.sync_seqno[0];
|
|
|
|
ee->semaphore_seqno[1] = engine->semaphore.sync_seqno[1];
|
2014-07-01 00:53:40 +08:00
|
|
|
|
2016-04-07 16:08:05 +08:00
|
|
|
if (HAS_VEBOX(dev_priv)) {
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->semaphore_mboxes[2] =
|
2016-03-16 19:00:37 +08:00
|
|
|
I915_READ(RING_SYNC_2(engine->mmio_base));
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->semaphore_seqno[2] = engine->semaphore.sync_seqno[2];
|
2014-07-01 00:53:40 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
static void error_record_engine_waiters(struct intel_engine_cs *engine,
|
|
|
|
struct drm_i915_error_engine *ee)
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the wait is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
{
|
|
|
|
struct intel_breadcrumbs *b = &engine->breadcrumbs;
|
|
|
|
struct drm_i915_error_waiter *waiter;
|
|
|
|
struct rb_node *rb;
|
|
|
|
int count;
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->num_waiters = 0;
|
|
|
|
ee->waiters = NULL;
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the wait is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
|
2016-09-06 15:38:44 +08:00
|
|
|
if (RB_EMPTY_ROOT(&b->waiters))
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!spin_trylock(&b->lock)) {
|
|
|
|
ee->waiters = ERR_PTR(-EDEADLK);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the wait is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
count = 0;
|
|
|
|
for (rb = rb_first(&b->waiters); rb != NULL; rb = rb_next(rb))
|
|
|
|
count++;
|
|
|
|
spin_unlock(&b->lock);
|
|
|
|
|
|
|
|
waiter = NULL;
|
|
|
|
if (count)
|
|
|
|
waiter = kmalloc_array(count,
|
|
|
|
sizeof(struct drm_i915_error_waiter),
|
|
|
|
GFP_ATOMIC);
|
|
|
|
if (!waiter)
|
|
|
|
return;
|
|
|
|
|
2016-09-06 15:38:44 +08:00
|
|
|
if (!spin_trylock(&b->lock)) {
|
|
|
|
kfree(waiter);
|
|
|
|
ee->waiters = ERR_PTR(-EDEADLK);
|
|
|
|
return;
|
|
|
|
}
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the wait is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
|
2016-09-06 15:38:44 +08:00
|
|
|
ee->waiters = waiter;
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the wait is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
|
|
|
|
struct intel_wait *w = container_of(rb, typeof(*w), node);
|
|
|
|
|
|
|
|
strcpy(waiter->comm, w->tsk->comm);
|
|
|
|
waiter->pid = w->tsk->pid;
|
|
|
|
waiter->seqno = w->seqno;
|
|
|
|
waiter++;
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
if (++ee->num_waiters == count)
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the wait is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
spin_unlock(&b->lock);
|
|
|
|
}
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
static void error_record_engine_registers(struct drm_i915_error_state *error,
|
|
|
|
struct intel_engine_cs *engine,
|
|
|
|
struct drm_i915_error_engine *ee)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2016-07-27 16:07:28 +08:00
|
|
|
struct drm_i915_private *dev_priv = engine->i915;
|
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
if (INTEL_GEN(dev_priv) >= 6) {
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->rc_psmi = I915_READ(RING_PSMI_CTL(engine->mmio_base));
|
|
|
|
ee->fault_reg = I915_READ(RING_FAULT_REG(engine));
|
2016-05-06 22:40:21 +08:00
|
|
|
if (INTEL_GEN(dev_priv) >= 8)
|
2016-07-27 16:07:28 +08:00
|
|
|
gen8_record_semaphore_state(error, engine, ee);
|
2014-07-01 00:53:41 +08:00
|
|
|
else
|
2016-07-27 16:07:28 +08:00
|
|
|
gen6_record_semaphore_state(engine, ee);
|
2013-08-13 07:53:04 +08:00
|
|
|
}
|
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
if (INTEL_GEN(dev_priv) >= 4) {
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->faddr = I915_READ(RING_DMA_FADD(engine->mmio_base));
|
|
|
|
ee->ipeir = I915_READ(RING_IPEIR(engine->mmio_base));
|
|
|
|
ee->ipehr = I915_READ(RING_IPEHR(engine->mmio_base));
|
|
|
|
ee->instps = I915_READ(RING_INSTPS(engine->mmio_base));
|
|
|
|
ee->bbaddr = I915_READ(RING_BBADDR(engine->mmio_base));
|
2016-05-06 22:40:21 +08:00
|
|
|
if (INTEL_GEN(dev_priv) >= 8) {
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->faddr |= (u64) I915_READ(RING_DMA_FADD_UDW(engine->mmio_base)) << 32;
|
|
|
|
ee->bbaddr |= (u64) I915_READ(RING_BBADDR_UDW(engine->mmio_base)) << 32;
|
2014-04-02 07:31:07 +08:00
|
|
|
}
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->bbstate = I915_READ(RING_BBSTATE(engine->mmio_base));
|
2013-07-12 21:50:57 +08:00
|
|
|
} else {
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->faddr = I915_READ(DMA_FADD_I8XX);
|
|
|
|
ee->ipeir = I915_READ(IPEIR);
|
|
|
|
ee->ipehr = I915_READ(IPEHR);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2016-10-12 17:05:17 +08:00
|
|
|
intel_engine_get_instdone(engine, &ee->instdone);
|
2016-09-20 21:54:32 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->waiting = intel_engine_has_waiter(engine);
|
|
|
|
ee->instpm = I915_READ(RING_INSTPM(engine->mmio_base));
|
2016-08-03 05:50:21 +08:00
|
|
|
ee->acthd = intel_engine_get_active_head(engine);
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->seqno = intel_engine_get_seqno(engine);
|
|
|
|
ee->last_seqno = engine->last_submitted_seqno;
|
|
|
|
ee->start = I915_READ_START(engine);
|
|
|
|
ee->head = I915_READ_HEAD(engine);
|
|
|
|
ee->tail = I915_READ_TAIL(engine);
|
|
|
|
ee->ctl = I915_READ_CTL(engine);
|
2016-08-15 17:49:11 +08:00
|
|
|
if (INTEL_GEN(dev_priv) > 2)
|
|
|
|
ee->mode = I915_READ_MODE(engine);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-08-18 03:30:56 +08:00
|
|
|
if (!HWS_NEEDS_PHYSICAL(dev_priv)) {
|
drm/i915: Type safe register read/write
Make I915_READ and I915_WRITE more type safe by wrapping the register
offset in a struct. This should eliminate most of the fumbles we've had
with misplaced parens.
This only takes care of normal mmio registers. We could extend the idea
to other register types and define each with its own struct. That way
you wouldn't be able to accidentally pass the wrong thing to a specific
register access function.
The gpio_reg setup is probably the ugliest thing left. But I figure I'd
just leave it for now, and wait for some divine inspiration to strike
before making it nice.
As for the generated code, it's actually a bit better sometimes. Eg.
looking at i915_irq_handler(), we can see the following change:
lea 0x70024(%rdx,%rax,1),%r9d
mov $0x1,%edx
- movslq %r9d,%r9
- mov %r9,%rsi
- mov %r9,-0x58(%rbp)
- callq *0xd8(%rbx)
+ mov %r9d,%esi
+ mov %r9d,-0x48(%rbp)
callq *0xd8(%rbx)
So previously gcc thought the register offset might be signed and
decided to sign extend it, just in case. The rest appears to be
mostly just minor shuffling of instructions.
v2: i915_mmio_reg_{offset,equal,valid}() helpers added
s/_REG/_MMIO/ in the register defines
mo more switch statements left to worry about
ring_emit stuff got sorted in a prep patch
cmd parser, lrc context and w/a batch buildup also in prep patch
vgpu stuff cleaned up and moved to a prep patch
all other unrelated changes split out
v3: Rebased due to BXT DSI/BLC, MOCS, etc.
v4: Rebased due to churn, s/i915_mmio_reg_t/i915_reg_t/
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: http://patchwork.freedesktop.org/patch/msgid/1447853606-2751-1-git-send-email-ville.syrjala@linux.intel.com
2015-11-18 21:33:26 +08:00
|
|
|
i915_reg_t mmio;
|
2014-01-24 06:40:36 +08:00
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
if (IS_GEN7(dev_priv)) {
|
2016-03-16 19:00:37 +08:00
|
|
|
switch (engine->id) {
|
2014-01-24 06:40:36 +08:00
|
|
|
default:
|
|
|
|
case RCS:
|
|
|
|
mmio = RENDER_HWS_PGA_GEN7;
|
|
|
|
break;
|
|
|
|
case BCS:
|
|
|
|
mmio = BLT_HWS_PGA_GEN7;
|
|
|
|
break;
|
|
|
|
case VCS:
|
|
|
|
mmio = BSD_HWS_PGA_GEN7;
|
|
|
|
break;
|
|
|
|
case VECS:
|
|
|
|
mmio = VEBOX_HWS_PGA_GEN7;
|
|
|
|
break;
|
|
|
|
}
|
2016-05-06 22:40:21 +08:00
|
|
|
} else if (IS_GEN6(engine->i915)) {
|
2016-03-16 19:00:37 +08:00
|
|
|
mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
|
2014-01-24 06:40:36 +08:00
|
|
|
} else {
|
|
|
|
/* XXX: gen8 returns to sanity */
|
2016-03-16 19:00:37 +08:00
|
|
|
mmio = RING_HWS_PGA(engine->mmio_base);
|
2014-01-24 06:40:36 +08:00
|
|
|
}
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->hws = I915_READ(mmio);
|
2014-01-24 06:40:36 +08:00
|
|
|
}
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->hangcheck_score = engine->hangcheck.score;
|
|
|
|
ee->hangcheck_action = engine->hangcheck.action;
|
2014-01-30 16:19:40 +08:00
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
if (USES_PPGTT(dev_priv)) {
|
2014-01-30 16:19:40 +08:00
|
|
|
int i;
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->vm_info.gfx_mode = I915_READ(RING_MODE_GEN7(engine));
|
2014-01-30 16:19:40 +08:00
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
if (IS_GEN6(dev_priv))
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->vm_info.pp_dir_base =
|
2016-03-16 19:00:37 +08:00
|
|
|
I915_READ(RING_PP_DIR_BASE_READ(engine));
|
2016-05-06 22:40:21 +08:00
|
|
|
else if (IS_GEN7(dev_priv))
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->vm_info.pp_dir_base =
|
2016-03-16 19:00:37 +08:00
|
|
|
I915_READ(RING_PP_DIR_BASE(engine));
|
2016-05-06 22:40:21 +08:00
|
|
|
else if (INTEL_GEN(dev_priv) >= 8)
|
2014-01-30 16:19:40 +08:00
|
|
|
for (i = 0; i < 4; i++) {
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->vm_info.pdp[i] =
|
2016-03-16 19:00:37 +08:00
|
|
|
I915_READ(GEN8_RING_PDP_UDW(engine, i));
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->vm_info.pdp[i] <<= 32;
|
|
|
|
ee->vm_info.pdp[i] |=
|
2016-03-16 19:00:37 +08:00
|
|
|
I915_READ(GEN8_RING_PDP_LDW(engine, i));
|
2014-01-30 16:19:40 +08:00
|
|
|
}
|
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2016-10-13 18:18:14 +08:00
|
|
|
static void record_request(struct drm_i915_gem_request *request,
|
|
|
|
struct drm_i915_error_request *erq)
|
|
|
|
{
|
|
|
|
erq->context = request->ctx->hw_id;
|
|
|
|
erq->seqno = request->fence.seqno;
|
|
|
|
erq->jiffies = request->emitted_jiffies;
|
|
|
|
erq->head = request->head;
|
|
|
|
erq->tail = request->tail;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
erq->pid = request->ctx->pid ? pid_nr(request->ctx->pid) : 0;
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2016-08-15 17:49:10 +08:00
|
|
|
static void engine_record_requests(struct intel_engine_cs *engine,
|
|
|
|
struct drm_i915_gem_request *first,
|
|
|
|
struct drm_i915_error_engine *ee)
|
|
|
|
{
|
|
|
|
struct drm_i915_gem_request *request;
|
|
|
|
int count;
|
|
|
|
|
|
|
|
count = 0;
|
|
|
|
request = first;
|
|
|
|
list_for_each_entry_from(request, &engine->request_list, link)
|
|
|
|
count++;
|
|
|
|
if (!count)
|
|
|
|
return;
|
|
|
|
|
|
|
|
ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
|
|
|
|
if (!ee->requests)
|
|
|
|
return;
|
|
|
|
|
|
|
|
ee->num_requests = count;
|
|
|
|
|
|
|
|
count = 0;
|
|
|
|
request = first;
|
|
|
|
list_for_each_entry_from(request, &engine->request_list, link) {
|
|
|
|
if (count >= ee->num_requests) {
|
|
|
|
/*
|
|
|
|
* If the ring request list was changed in
|
|
|
|
* between the point where the error request
|
|
|
|
* list was created and dimensioned and this
|
|
|
|
* point then just exit early to avoid crashes.
|
|
|
|
*
|
|
|
|
* We don't need to communicate that the
|
|
|
|
* request list changed state during error
|
|
|
|
* state capture and that the error state is
|
|
|
|
* slightly incorrect as a consequence since we
|
|
|
|
* are typically only interested in the request
|
|
|
|
* list state at the point of error state
|
|
|
|
* capture, not in any changes happening during
|
|
|
|
* the capture.
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2016-10-13 18:18:14 +08:00
|
|
|
record_request(request, &ee->requests[count++]);
|
2016-08-15 17:49:10 +08:00
|
|
|
}
|
|
|
|
ee->num_requests = count;
|
|
|
|
}
|
|
|
|
|
2016-10-13 18:18:14 +08:00
|
|
|
static void error_record_engine_execlists(struct intel_engine_cs *engine,
|
|
|
|
struct drm_i915_error_engine *ee)
|
|
|
|
{
|
|
|
|
unsigned int n;
|
|
|
|
|
|
|
|
for (n = 0; n < ARRAY_SIZE(engine->execlist_port); n++)
|
|
|
|
if (engine->execlist_port[n].request)
|
|
|
|
record_request(engine->execlist_port[n].request,
|
|
|
|
&ee->execlist[n]);
|
|
|
|
}
|
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
|
2013-07-12 21:50:57 +08:00
|
|
|
struct drm_i915_error_state *error)
|
|
|
|
{
|
2016-03-30 21:57:10 +08:00
|
|
|
struct i915_ggtt *ggtt = &dev_priv->ggtt;
|
2016-08-15 17:49:10 +08:00
|
|
|
int i;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-08-15 17:49:02 +08:00
|
|
|
error->semaphore =
|
2016-08-15 17:49:06 +08:00
|
|
|
i915_error_object_create(dev_priv, dev_priv->semaphore);
|
2016-07-27 16:07:28 +08:00
|
|
|
|
2016-03-16 19:00:39 +08:00
|
|
|
for (i = 0; i < I915_NUM_ENGINES; i++) {
|
2016-03-16 19:00:38 +08:00
|
|
|
struct intel_engine_cs *engine = &dev_priv->engine[i];
|
2016-07-27 16:07:28 +08:00
|
|
|
struct drm_i915_error_engine *ee = &error->engine[i];
|
2016-08-15 17:49:10 +08:00
|
|
|
struct drm_i915_gem_request *request;
|
2014-01-27 21:52:34 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->pid = -1;
|
|
|
|
ee->engine_id = -1;
|
2014-06-10 19:09:29 +08:00
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
if (!intel_engine_initialized(engine))
|
2014-01-27 21:52:34 +08:00
|
|
|
continue;
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->engine_id = i;
|
2014-01-27 21:52:34 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
error_record_engine_registers(error, engine, ee);
|
|
|
|
error_record_engine_waiters(engine, ee);
|
2016-10-13 18:18:14 +08:00
|
|
|
error_record_engine_execlists(engine, ee);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-03-16 19:00:36 +08:00
|
|
|
request = i915_gem_find_active_request(engine);
|
2014-02-25 23:11:24 +08:00
|
|
|
if (request) {
|
2016-08-03 05:50:21 +08:00
|
|
|
struct intel_ring *ring;
|
2016-08-15 17:49:08 +08:00
|
|
|
struct pid *pid;
|
2014-08-06 21:04:53 +08:00
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
ee->vm = request->ctx->ppgtt ?
|
2016-07-04 15:08:39 +08:00
|
|
|
&request->ctx->ppgtt->base : &ggtt->base;
|
2014-08-06 21:04:53 +08:00
|
|
|
|
2014-02-25 23:11:24 +08:00
|
|
|
/* We need to copy these to an anonymous buffer
|
|
|
|
* as the simplest method to avoid being overwritten
|
|
|
|
* by userspace.
|
|
|
|
*/
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->batchbuffer =
|
2014-02-25 23:11:24 +08:00
|
|
|
i915_error_object_create(dev_priv,
|
2016-08-15 17:49:06 +08:00
|
|
|
request->batch);
|
2014-02-25 23:11:24 +08:00
|
|
|
|
2016-04-07 16:08:05 +08:00
|
|
|
if (HAS_BROKEN_CS_TLB(dev_priv))
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->wa_batchbuffer =
|
2016-08-15 17:49:06 +08:00
|
|
|
i915_error_object_create(dev_priv,
|
|
|
|
engine->scratch);
|
2014-02-25 23:11:24 +08:00
|
|
|
|
2016-08-15 17:49:06 +08:00
|
|
|
ee->ctx =
|
|
|
|
i915_error_object_create(dev_priv,
|
|
|
|
request->ctx->engine[i].state);
|
2016-08-15 17:48:42 +08:00
|
|
|
|
2016-08-15 17:49:08 +08:00
|
|
|
pid = request->ctx->pid;
|
|
|
|
if (pid) {
|
2014-02-25 23:11:24 +08:00
|
|
|
struct task_struct *task;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
2016-08-15 17:49:08 +08:00
|
|
|
task = pid_task(pid, PIDTYPE_PID);
|
2014-02-25 23:11:24 +08:00
|
|
|
if (task) {
|
2016-07-27 16:07:28 +08:00
|
|
|
strcpy(ee->comm, task->comm);
|
|
|
|
ee->pid = task->pid;
|
2014-02-25 23:11:24 +08:00
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-07-04 15:08:39 +08:00
|
|
|
error->simulated |=
|
|
|
|
request->ctx->flags & CONTEXT_NO_ERROR_CAPTURE;
|
|
|
|
|
2016-10-05 04:11:30 +08:00
|
|
|
ee->rq_head = request->head;
|
|
|
|
ee->rq_post = request->postfix;
|
|
|
|
ee->rq_tail = request->tail;
|
|
|
|
|
2016-08-03 05:50:19 +08:00
|
|
|
ring = request->ring;
|
|
|
|
ee->cpu_ring_head = ring->head;
|
|
|
|
ee->cpu_ring_tail = ring->tail;
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->ringbuffer =
|
2016-08-15 17:49:06 +08:00
|
|
|
i915_error_object_create(dev_priv, ring->vma);
|
2016-08-15 17:49:10 +08:00
|
|
|
|
|
|
|
engine_record_requests(engine, request, ee);
|
2016-07-04 15:08:38 +08:00
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->hws_page =
|
2016-08-15 17:49:06 +08:00
|
|
|
i915_error_object_create(dev_priv,
|
|
|
|
engine->status_page.vma);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-08-15 17:49:06 +08:00
|
|
|
ee->wa_ctx =
|
|
|
|
i915_error_object_create(dev_priv, engine->wa_ctx.vma);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-08-01 08:00:15 +08:00
|
|
|
static void i915_gem_capture_vm(struct drm_i915_private *dev_priv,
|
|
|
|
struct drm_i915_error_state *error,
|
|
|
|
struct i915_address_space *vm,
|
2016-08-15 17:48:41 +08:00
|
|
|
int idx)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2016-08-15 17:48:41 +08:00
|
|
|
struct drm_i915_error_buffer *active_bo;
|
2013-08-01 08:00:15 +08:00
|
|
|
struct i915_vma *vma;
|
2016-08-15 17:48:41 +08:00
|
|
|
int count;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
count = 0;
|
2016-02-26 19:03:19 +08:00
|
|
|
list_for_each_entry(vma, &vm->active_list, vm_link)
|
2016-08-15 17:48:41 +08:00
|
|
|
count++;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
active_bo = NULL;
|
|
|
|
if (count)
|
|
|
|
active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
|
2013-08-01 08:00:15 +08:00
|
|
|
if (active_bo)
|
2016-08-15 17:48:41 +08:00
|
|
|
count = capture_error_bo(active_bo, count, &vm->active_list, false);
|
|
|
|
else
|
|
|
|
count = 0;
|
|
|
|
|
|
|
|
error->active_vm[idx] = vm;
|
|
|
|
error->active_bo[idx] = active_bo;
|
|
|
|
error->active_bo_count[idx] = count;
|
2013-08-01 08:00:15 +08:00
|
|
|
}
|
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
static void i915_capture_active_buffers(struct drm_i915_private *dev_priv,
|
|
|
|
struct drm_i915_error_state *error)
|
2013-08-01 08:00:15 +08:00
|
|
|
{
|
2016-08-15 17:48:41 +08:00
|
|
|
int cnt = 0, i, j;
|
|
|
|
|
|
|
|
BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
|
|
|
|
BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
|
|
|
|
BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));
|
|
|
|
|
|
|
|
/* Scan each engine looking for unique active contexts/vm */
|
|
|
|
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
|
|
|
|
struct drm_i915_error_engine *ee = &error->engine[i];
|
|
|
|
bool found;
|
|
|
|
|
|
|
|
if (!ee->vm)
|
|
|
|
continue;
|
2014-08-13 03:05:47 +08:00
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
found = false;
|
|
|
|
for (j = 0; j < i && !found; j++)
|
|
|
|
found = error->engine[j].vm == ee->vm;
|
|
|
|
if (!found)
|
|
|
|
i915_gem_capture_vm(dev_priv, error, ee->vm, cnt++);
|
2014-08-13 03:05:47 +08:00
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
static void i915_capture_pinned_buffers(struct drm_i915_private *dev_priv,
|
|
|
|
struct drm_i915_error_state *error)
|
|
|
|
{
|
|
|
|
struct i915_address_space *vm = &dev_priv->ggtt.base;
|
|
|
|
struct drm_i915_error_buffer *bo;
|
|
|
|
struct i915_vma *vma;
|
|
|
|
int count_inactive, count_active;
|
|
|
|
|
|
|
|
count_inactive = 0;
|
|
|
|
list_for_each_entry(vma, &vm->active_list, vm_link)
|
|
|
|
count_inactive++;
|
|
|
|
|
|
|
|
count_active = 0;
|
|
|
|
list_for_each_entry(vma, &vm->inactive_list, vm_link)
|
|
|
|
count_active++;
|
|
|
|
|
|
|
|
bo = NULL;
|
|
|
|
if (count_inactive + count_active)
|
|
|
|
bo = kcalloc(count_inactive + count_active,
|
|
|
|
sizeof(*bo), GFP_ATOMIC);
|
|
|
|
if (!bo)
|
|
|
|
return;
|
|
|
|
|
|
|
|
count_inactive = capture_error_bo(bo, count_inactive,
|
|
|
|
&vm->active_list, true);
|
|
|
|
count_active = capture_error_bo(bo + count_inactive, count_active,
|
|
|
|
&vm->inactive_list, true);
|
|
|
|
error->pinned_bo_count = count_inactive + count_active;
|
|
|
|
error->pinned_bo = bo;
|
|
|
|
}
|
|
|
|
|
2014-01-30 16:19:35 +08:00
|
|
|
/* Capture all registers which don't fit into another category. */
|
|
|
|
static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
|
|
|
|
struct drm_i915_error_state *error)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2016-07-05 17:40:23 +08:00
|
|
|
struct drm_device *dev = &dev_priv->drm;
|
2014-08-06 01:07:13 +08:00
|
|
|
int i;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2014-01-30 16:19:36 +08:00
|
|
|
/* General organization
|
|
|
|
* 1. Registers specific to a single generation
|
|
|
|
* 2. Registers which belong to multiple generations
|
|
|
|
* 3. Feature specific registers.
|
|
|
|
* 4. Everything else
|
|
|
|
* Please try to follow the order.
|
|
|
|
*/
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2014-01-30 16:19:36 +08:00
|
|
|
/* 1: Registers specific to a single generation */
|
|
|
|
if (IS_VALLEYVIEW(dev)) {
|
2014-08-06 01:07:13 +08:00
|
|
|
error->gtier[0] = I915_READ(GTIER);
|
2014-08-02 00:12:27 +08:00
|
|
|
error->ier = I915_READ(VLV_IER);
|
2015-10-22 20:34:57 +08:00
|
|
|
error->forcewake = I915_READ_FW(FORCEWAKE_VLV);
|
2014-01-30 16:19:36 +08:00
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2014-01-30 16:19:36 +08:00
|
|
|
if (IS_GEN7(dev))
|
|
|
|
error->err_int = I915_READ(GEN7_ERR_INT);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2015-03-24 20:54:19 +08:00
|
|
|
if (INTEL_INFO(dev)->gen >= 8) {
|
|
|
|
error->fault_data0 = I915_READ(GEN8_FAULT_TLB_DATA0);
|
|
|
|
error->fault_data1 = I915_READ(GEN8_FAULT_TLB_DATA1);
|
|
|
|
}
|
|
|
|
|
2014-01-30 16:19:39 +08:00
|
|
|
if (IS_GEN6(dev)) {
|
2015-10-22 20:34:57 +08:00
|
|
|
error->forcewake = I915_READ_FW(FORCEWAKE);
|
2014-01-30 16:19:39 +08:00
|
|
|
error->gab_ctl = I915_READ(GAB_CTL);
|
|
|
|
error->gfx_mode = I915_READ(GFX_MODE);
|
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2014-01-30 16:19:36 +08:00
|
|
|
/* 2: Registers which belong to multiple generations */
|
|
|
|
if (INTEL_INFO(dev)->gen >= 7)
|
2015-10-22 20:34:57 +08:00
|
|
|
error->forcewake = I915_READ_FW(FORCEWAKE_MT);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
|
|
|
if (INTEL_INFO(dev)->gen >= 6) {
|
2014-01-30 16:19:36 +08:00
|
|
|
error->derrmr = I915_READ(DERRMR);
|
2013-07-12 21:50:57 +08:00
|
|
|
error->error = I915_READ(ERROR_GEN6);
|
|
|
|
error->done_reg = I915_READ(DONE_REG);
|
|
|
|
}
|
|
|
|
|
2014-01-30 16:19:36 +08:00
|
|
|
/* 3: Feature specific registers */
|
2014-01-30 16:19:39 +08:00
|
|
|
if (IS_GEN6(dev) || IS_GEN7(dev)) {
|
|
|
|
error->gam_ecochk = I915_READ(GAM_ECOCHK);
|
|
|
|
error->gac_eco = I915_READ(GAC_ECO_BITS);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* 4: Everything else */
|
2014-01-30 16:19:36 +08:00
|
|
|
if (HAS_HW_CONTEXTS(dev))
|
|
|
|
error->ccid = I915_READ(CCID);
|
|
|
|
|
2014-08-06 01:07:13 +08:00
|
|
|
if (INTEL_INFO(dev)->gen >= 8) {
|
|
|
|
error->ier = I915_READ(GEN8_DE_MISC_IER);
|
|
|
|
for (i = 0; i < 4; i++)
|
|
|
|
error->gtier[i] = I915_READ(GEN8_GT_IER(i));
|
|
|
|
} else if (HAS_PCH_SPLIT(dev)) {
|
2014-08-02 00:12:27 +08:00
|
|
|
error->ier = I915_READ(DEIER);
|
2014-08-06 01:07:13 +08:00
|
|
|
error->gtier[0] = I915_READ(GTIER);
|
2014-08-02 00:12:27 +08:00
|
|
|
} else if (IS_GEN2(dev)) {
|
|
|
|
error->ier = I915_READ16(IER);
|
|
|
|
} else if (!IS_VALLEYVIEW(dev)) {
|
|
|
|
error->ier = I915_READ(IER);
|
2014-01-30 16:19:36 +08:00
|
|
|
}
|
|
|
|
error->eir = I915_READ(EIR);
|
|
|
|
error->pgtbl_er = I915_READ(PGTBL_ER);
|
2014-01-30 16:19:35 +08:00
|
|
|
}
|
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
|
2014-02-25 23:11:26 +08:00
|
|
|
struct drm_i915_error_state *error,
|
2016-03-19 04:07:55 +08:00
|
|
|
u32 engine_mask,
|
2014-02-25 23:11:26 +08:00
|
|
|
const char *error_msg)
|
2014-02-25 23:11:25 +08:00
|
|
|
{
|
|
|
|
u32 ecode;
|
2016-07-27 16:07:28 +08:00
|
|
|
int engine_id = -1, len;
|
2014-02-25 23:11:25 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ecode = i915_error_generate_code(dev_priv, error, &engine_id);
|
2014-02-25 23:11:25 +08:00
|
|
|
|
2014-02-25 23:11:26 +08:00
|
|
|
len = scnprintf(error->error_msg, sizeof(error->error_msg),
|
2014-11-06 19:03:46 +08:00
|
|
|
"GPU HANG: ecode %d:%d:0x%08x",
|
2016-07-27 16:07:28 +08:00
|
|
|
INTEL_GEN(dev_priv), engine_id, ecode);
|
2014-02-25 23:11:26 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
if (engine_id != -1 && error->engine[engine_id].pid != -1)
|
2014-02-25 23:11:26 +08:00
|
|
|
len += scnprintf(error->error_msg + len,
|
|
|
|
sizeof(error->error_msg) - len,
|
|
|
|
", in %s [%d]",
|
2016-07-27 16:07:28 +08:00
|
|
|
error->engine[engine_id].comm,
|
|
|
|
error->engine[engine_id].pid);
|
2014-02-25 23:11:26 +08:00
|
|
|
|
|
|
|
scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
|
|
|
|
", reason: %s, action: %s",
|
|
|
|
error_msg,
|
2016-03-19 04:07:55 +08:00
|
|
|
engine_mask ? "reset" : "continue");
|
2014-02-25 23:11:25 +08:00
|
|
|
}
|
|
|
|
|
2014-02-25 23:11:27 +08:00
|
|
|
static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
|
|
|
|
struct drm_i915_error_state *error)
|
|
|
|
{
|
2015-08-08 03:24:15 +08:00
|
|
|
error->iommu = -1;
|
|
|
|
#ifdef CONFIG_INTEL_IOMMU
|
|
|
|
error->iommu = intel_iommu_gfx_mapped;
|
|
|
|
#endif
|
2014-02-25 23:11:27 +08:00
|
|
|
error->reset_count = i915_reset_count(&dev_priv->gpu_error);
|
2014-02-25 23:11:28 +08:00
|
|
|
error->suspend_count = dev_priv->suspend_count;
|
2016-08-15 17:48:45 +08:00
|
|
|
|
|
|
|
memcpy(&error->device_info,
|
|
|
|
INTEL_INFO(dev_priv),
|
|
|
|
sizeof(error->device_info));
|
2014-02-25 23:11:27 +08:00
|
|
|
}
|
|
|
|
|
2016-10-12 17:05:19 +08:00
|
|
|
static int capture(void *data)
|
|
|
|
{
|
|
|
|
struct drm_i915_error_state *error = data;
|
|
|
|
|
|
|
|
i915_capture_gen_state(error->i915, error);
|
|
|
|
i915_capture_reg_state(error->i915, error);
|
|
|
|
i915_gem_record_fences(error->i915, error);
|
|
|
|
i915_gem_record_rings(error->i915, error);
|
|
|
|
i915_capture_active_buffers(error->i915, error);
|
|
|
|
i915_capture_pinned_buffers(error->i915, error);
|
|
|
|
|
|
|
|
do_gettimeofday(&error->time);
|
|
|
|
|
|
|
|
error->overlay = intel_overlay_capture_error_state(error->i915);
|
|
|
|
error->display = intel_display_capture_error_state(error->i915);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2014-01-30 16:19:35 +08:00
|
|
|
/**
|
|
|
|
* i915_capture_error_state - capture an error record for later analysis
|
|
|
|
* @dev: drm device
|
|
|
|
*
|
|
|
|
* Should be called when an error is detected (either a hang or an error
|
|
|
|
* interrupt) to capture error state from the time of the error. Fills
|
|
|
|
* out a structure which becomes available in debugfs for user level tools
|
|
|
|
* to pick up.
|
|
|
|
*/
|
2016-05-06 22:40:21 +08:00
|
|
|
void i915_capture_error_state(struct drm_i915_private *dev_priv,
|
|
|
|
u32 engine_mask,
|
2014-02-25 23:11:26 +08:00
|
|
|
const char *error_msg)
|
2014-01-30 16:19:35 +08:00
|
|
|
{
|
2014-01-30 22:38:15 +08:00
|
|
|
static bool warned;
|
2014-01-30 16:19:35 +08:00
|
|
|
struct drm_i915_error_state *error;
|
|
|
|
unsigned long flags;
|
|
|
|
|
2016-10-12 17:05:18 +08:00
|
|
|
if (!i915.error_capture)
|
|
|
|
return;
|
|
|
|
|
2016-07-04 15:48:33 +08:00
|
|
|
if (READ_ONCE(dev_priv->gpu_error.first_error))
|
|
|
|
return;
|
|
|
|
|
2014-01-30 16:19:35 +08:00
|
|
|
/* Account for pipe specific data like PIPE*STAT */
|
|
|
|
error = kzalloc(sizeof(*error), GFP_ATOMIC);
|
|
|
|
if (!error) {
|
|
|
|
DRM_DEBUG_DRIVER("out of memory, not capturing error state\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2014-02-04 20:18:55 +08:00
|
|
|
kref_init(&error->ref);
|
2016-10-12 17:05:19 +08:00
|
|
|
error->i915 = dev_priv;
|
2014-02-04 20:18:55 +08:00
|
|
|
|
2016-10-12 17:05:19 +08:00
|
|
|
stop_machine(capture, error, NULL);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
i915_error_capture_msg(dev_priv, error, engine_mask, error_msg);
|
2014-02-25 23:11:25 +08:00
|
|
|
DRM_INFO("%s\n", error->error_msg);
|
|
|
|
|
2016-07-04 15:08:39 +08:00
|
|
|
if (!error->simulated) {
|
|
|
|
spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
|
|
|
|
if (!dev_priv->gpu_error.first_error) {
|
|
|
|
dev_priv->gpu_error.first_error = error;
|
|
|
|
error = NULL;
|
|
|
|
}
|
|
|
|
spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2014-02-25 23:11:25 +08:00
|
|
|
if (error) {
|
2013-07-12 21:50:57 +08:00
|
|
|
i915_error_state_free(&error->ref);
|
2014-02-25 23:11:25 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!warned) {
|
|
|
|
DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
|
|
|
|
DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
|
|
|
|
DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
|
|
|
|
DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
|
2016-07-05 17:40:23 +08:00
|
|
|
DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
|
|
|
|
dev_priv->drm.primary->index);
|
2014-02-25 23:11:25 +08:00
|
|
|
warned = true;
|
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void i915_error_state_get(struct drm_device *dev,
|
|
|
|
struct i915_error_state_file_priv *error_priv)
|
|
|
|
{
|
2016-07-04 18:34:36 +08:00
|
|
|
struct drm_i915_private *dev_priv = to_i915(dev);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2014-09-15 20:55:24 +08:00
|
|
|
spin_lock_irq(&dev_priv->gpu_error.lock);
|
2013-07-12 21:50:57 +08:00
|
|
|
error_priv->error = dev_priv->gpu_error.first_error;
|
|
|
|
if (error_priv->error)
|
|
|
|
kref_get(&error_priv->error->ref);
|
2014-09-15 20:55:24 +08:00
|
|
|
spin_unlock_irq(&dev_priv->gpu_error.lock);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
void i915_error_state_put(struct i915_error_state_file_priv *error_priv)
|
|
|
|
{
|
|
|
|
if (error_priv->error)
|
|
|
|
kref_put(&error_priv->error->ref, i915_error_state_free);
|
|
|
|
}
|
|
|
|
|
|
|
|
void i915_destroy_error_state(struct drm_device *dev)
|
|
|
|
{
|
2016-07-04 18:34:36 +08:00
|
|
|
struct drm_i915_private *dev_priv = to_i915(dev);
|
2013-07-12 21:50:57 +08:00
|
|
|
struct drm_i915_error_state *error;
|
|
|
|
|
2014-09-15 20:55:24 +08:00
|
|
|
spin_lock_irq(&dev_priv->gpu_error.lock);
|
2013-07-12 21:50:57 +08:00
|
|
|
error = dev_priv->gpu_error.first_error;
|
|
|
|
dev_priv->gpu_error.first_error = NULL;
|
2014-09-15 20:55:24 +08:00
|
|
|
spin_unlock_irq(&dev_priv->gpu_error.lock);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
|
|
|
if (error)
|
|
|
|
kref_put(&error->ref, i915_error_state_free);
|
|
|
|
}
|