2013-07-12 21:50:57 +08:00
|
|
|
/*
|
|
|
|
* Copyright (c) 2008 Intel Corporation
|
|
|
|
*
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
*
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
* Software.
|
|
|
|
*
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
* IN THE SOFTWARE.
|
|
|
|
*
|
|
|
|
* Authors:
|
|
|
|
* Eric Anholt <eric@anholt.net>
|
|
|
|
* Keith Packard <keithp@keithp.com>
|
|
|
|
* Mika Kuoppala <mika.kuoppala@intel.com>
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <generated/utsrelease.h>
|
2016-10-12 17:05:19 +08:00
|
|
|
#include <linux/stop_machine.h>
|
2016-10-12 17:05:22 +08:00
|
|
|
#include <linux/zlib.h>
|
2013-07-12 21:50:57 +08:00
|
|
|
#include "i915_drv.h"
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
static const char *engine_str(int engine)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2016-07-27 16:07:28 +08:00
|
|
|
switch (engine) {
|
2013-07-12 21:50:57 +08:00
|
|
|
case RCS: return "render";
|
|
|
|
case VCS: return "bsd";
|
|
|
|
case BCS: return "blt";
|
|
|
|
case VECS: return "vebox";
|
2014-04-17 10:37:37 +08:00
|
|
|
case VCS2: return "bsd2";
|
2013-07-12 21:50:57 +08:00
|
|
|
default: return "";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static const char *tiling_flag(int tiling)
|
|
|
|
{
|
|
|
|
switch (tiling) {
|
|
|
|
default:
|
|
|
|
case I915_TILING_NONE: return "";
|
|
|
|
case I915_TILING_X: return " X";
|
|
|
|
case I915_TILING_Y: return " Y";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Suffix appended to a buffer line when @dirty is non-zero. */
static const char *dirty_flag(int dirty)
{
	if (!dirty)
		return "";

	return " dirty";
}
|
|
|
|
|
|
|
|
/* Suffix appended to a buffer line when @purgeable is non-zero. */
static const char *purgeable_flag(int purgeable)
{
	if (!purgeable)
		return "";

	return " purgeable";
}
|
|
|
|
|
|
|
|
static bool __i915_error_ok(struct drm_i915_error_state_buf *e)
|
|
|
|
{
|
|
|
|
|
|
|
|
if (!e->err && WARN(e->bytes > (e->size - 1), "overflow")) {
|
|
|
|
e->err = -ENOSPC;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (e->bytes == e->size - 1 || e->err)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool __i915_error_seek(struct drm_i915_error_state_buf *e,
|
|
|
|
unsigned len)
|
|
|
|
{
|
|
|
|
if (e->pos + len <= e->start) {
|
|
|
|
e->pos += len;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* First vsnprintf needs to fit in its entirety for memmove */
|
|
|
|
if (len >= e->size) {
|
|
|
|
e->err = -EIO;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void __i915_error_advance(struct drm_i915_error_state_buf *e,
|
|
|
|
unsigned len)
|
|
|
|
{
|
|
|
|
/* If this is first printf in this window, adjust it so that
|
|
|
|
* start position matches start of the buffer
|
|
|
|
*/
|
|
|
|
|
|
|
|
if (e->pos < e->start) {
|
|
|
|
const size_t off = e->start - e->pos;
|
|
|
|
|
|
|
|
/* Should not happen but be paranoid */
|
|
|
|
if (off > len || e->bytes) {
|
|
|
|
e->err = -EIO;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
memmove(e->buf, e->buf + off, len - off);
|
|
|
|
e->bytes = len - off;
|
|
|
|
e->pos = e->start;
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
e->bytes += len;
|
|
|
|
e->pos += len;
|
|
|
|
}
|
|
|
|
|
2017-01-14 18:51:12 +08:00
|
|
|
/*
 * Format @f with @args and append the result to @e.
 *
 * While e->pos is before e->start the output is first measured with a
 * size-less vsnprintf so that chunks lying entirely before the start
 * position are skipped without being formatted into the buffer. Output
 * that does not fit in the remaining space is truncated.
 */
__printf(2, 0)
static void i915_error_vprintf(struct drm_i915_error_state_buf *e,
			       const char *f, va_list args)
{
	unsigned written;

	if (!__i915_error_ok(e))
		return;

	/* Seek the first printf which hits the start position */
	if (e->pos < e->start) {
		va_list probe;
		unsigned needed;

		/* Measure on a copy; @args is consumed by the real call below. */
		va_copy(probe, args);
		needed = vsnprintf(NULL, 0, f, probe);
		va_end(probe);

		if (!__i915_error_seek(e, needed))
			return;
	}

	written = vsnprintf(e->buf + e->bytes, e->size - e->bytes, f, args);

	/* vsnprintf reports the untruncated length; clamp to what was stored. */
	if (written >= e->size - e->bytes)
		written = e->size - e->bytes - 1;

	__i915_error_advance(e, written);
}
|
|
|
|
|
|
|
|
static void i915_error_puts(struct drm_i915_error_state_buf *e,
|
|
|
|
const char *str)
|
|
|
|
{
|
|
|
|
unsigned len;
|
|
|
|
|
|
|
|
if (!__i915_error_ok(e))
|
|
|
|
return;
|
|
|
|
|
|
|
|
len = strlen(str);
|
|
|
|
|
|
|
|
/* Seek the first printf which is hits start position */
|
|
|
|
if (e->pos < e->start) {
|
|
|
|
if (!__i915_error_seek(e, len))
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (len >= e->size - e->bytes)
|
|
|
|
len = e->size - e->bytes - 1;
|
|
|
|
memcpy(e->buf + e->bytes, str, len);
|
|
|
|
|
|
|
|
__i915_error_advance(e, len);
|
|
|
|
}
|
|
|
|
|
|
|
|
#define err_printf(e, ...) i915_error_printf(e, __VA_ARGS__)
|
|
|
|
#define err_puts(e, s) i915_error_puts(e, s)
|
|
|
|
|
2016-10-12 17:05:22 +08:00
|
|
|
#ifdef CONFIG_DRM_I915_COMPRESS_ERROR
|
|
|
|
|
2016-12-06 20:40:51 +08:00
|
|
|
struct compress {
|
|
|
|
struct z_stream_s zstream;
|
|
|
|
void *tmp;
|
|
|
|
};
|
|
|
|
|
|
|
|
static bool compress_init(struct compress *c)
|
2016-10-12 17:05:22 +08:00
|
|
|
{
|
2016-12-06 20:40:51 +08:00
|
|
|
struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));
|
2016-10-12 17:05:22 +08:00
|
|
|
|
|
|
|
zstream->workspace =
|
|
|
|
kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
|
|
|
|
GFP_ATOMIC | __GFP_NOWARN);
|
|
|
|
if (!zstream->workspace)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
|
|
|
|
kfree(zstream->workspace);
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2016-12-06 20:40:51 +08:00
|
|
|
c->tmp = NULL;
|
2017-01-06 23:20:09 +08:00
|
|
|
if (i915_has_memcpy_from_wc())
|
2016-12-06 20:40:51 +08:00
|
|
|
c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);
|
|
|
|
|
2016-10-12 17:05:22 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-12-06 20:40:51 +08:00
|
|
|
static int compress_page(struct compress *c,
|
2016-10-12 17:05:22 +08:00
|
|
|
void *src,
|
|
|
|
struct drm_i915_error_object *dst)
|
|
|
|
{
|
2016-12-06 20:40:51 +08:00
|
|
|
struct z_stream_s *zstream = &c->zstream;
|
|
|
|
|
2016-10-12 17:05:22 +08:00
|
|
|
zstream->next_in = src;
|
2016-12-06 20:40:51 +08:00
|
|
|
if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
|
|
|
|
zstream->next_in = c->tmp;
|
2016-10-12 17:05:22 +08:00
|
|
|
zstream->avail_in = PAGE_SIZE;
|
|
|
|
|
|
|
|
do {
|
|
|
|
if (zstream->avail_out == 0) {
|
|
|
|
unsigned long page;
|
|
|
|
|
|
|
|
page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
|
|
|
|
if (!page)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
dst->pages[dst->page_count++] = (void *)page;
|
|
|
|
|
|
|
|
zstream->next_out = (void *)page;
|
|
|
|
zstream->avail_out = PAGE_SIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (zlib_deflate(zstream, Z_SYNC_FLUSH) != Z_OK)
|
|
|
|
return -EIO;
|
|
|
|
} while (zstream->avail_in);
|
|
|
|
|
|
|
|
/* Fallback to uncompressed if we increase size? */
|
|
|
|
if (0 && zstream->total_out > zstream->total_in)
|
|
|
|
return -E2BIG;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-12-06 20:40:51 +08:00
|
|
|
static void compress_fini(struct compress *c,
|
2016-10-12 17:05:22 +08:00
|
|
|
struct drm_i915_error_object *dst)
|
|
|
|
{
|
2016-12-06 20:40:51 +08:00
|
|
|
struct z_stream_s *zstream = &c->zstream;
|
|
|
|
|
2016-10-12 17:05:22 +08:00
|
|
|
if (dst) {
|
|
|
|
zlib_deflate(zstream, Z_FINISH);
|
|
|
|
dst->unused = zstream->avail_out;
|
|
|
|
}
|
|
|
|
|
|
|
|
zlib_deflateEnd(zstream);
|
|
|
|
kfree(zstream->workspace);
|
2016-12-06 20:40:51 +08:00
|
|
|
|
|
|
|
if (c->tmp)
|
|
|
|
free_page((unsigned long)c->tmp);
|
2016-10-12 17:05:22 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Emit the one-character marker that precedes an object's encoded pages;
 * ":" is the marker used by the zlib-enabled build.
 */
static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, ":");
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
2016-12-06 20:40:51 +08:00
|
|
|
/* No state is needed when pages are stored without compression. */
struct compress {
};
|
|
|
|
|
|
|
|
/* Nothing to set up in the uncompressed build; always succeeds. */
static bool compress_init(struct compress *c)
{
	return true;
}
|
|
|
|
|
2016-12-06 20:40:51 +08:00
|
|
|
static int compress_page(struct compress *c,
|
2016-10-12 17:05:22 +08:00
|
|
|
void *src,
|
|
|
|
struct drm_i915_error_object *dst)
|
|
|
|
{
|
|
|
|
unsigned long page;
|
2016-12-06 20:40:51 +08:00
|
|
|
void *ptr;
|
2016-10-12 17:05:22 +08:00
|
|
|
|
|
|
|
page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
|
|
|
|
if (!page)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2016-12-06 20:40:51 +08:00
|
|
|
ptr = (void *)page;
|
|
|
|
if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
|
|
|
|
memcpy(ptr, src, PAGE_SIZE);
|
|
|
|
dst->pages[dst->page_count++] = ptr;
|
2016-10-12 17:05:22 +08:00
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-12-06 20:40:51 +08:00
|
|
|
/* Nothing to release in the uncompressed build. */
static void compress_fini(struct compress *c,
			  struct drm_i915_error_object *dst)
{
}
|
|
|
|
|
|
|
|
/*
 * Emit the one-character marker that precedes an object's encoded pages;
 * "~" is the marker used by the build without zlib compression.
 */
static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
	err_puts(m, "~");
}
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
static void print_error_buffers(struct drm_i915_error_state_buf *m,
|
|
|
|
const char *name,
|
|
|
|
struct drm_i915_error_buffer *err,
|
|
|
|
int count)
|
|
|
|
{
|
2015-04-27 20:41:17 +08:00
|
|
|
int i;
|
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
err_printf(m, "%s [%d]:\n", name, count);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
|
|
|
while (count--) {
|
2015-07-30 00:23:56 +08:00
|
|
|
err_printf(m, " %08x_%08x %8u %02x %02x [ ",
|
|
|
|
upper_32_bits(err->gtt_offset),
|
|
|
|
lower_32_bits(err->gtt_offset),
|
2013-07-12 21:50:57 +08:00
|
|
|
err->size,
|
|
|
|
err->read_domains,
|
2015-04-27 20:41:17 +08:00
|
|
|
err->write_domain);
|
2016-03-16 19:00:39 +08:00
|
|
|
for (i = 0; i < I915_NUM_ENGINES; i++)
|
2015-04-27 20:41:17 +08:00
|
|
|
err_printf(m, "%02x ", err->rseqno[i]);
|
|
|
|
|
|
|
|
err_printf(m, "] %02x", err->wseqno);
|
2013-07-12 21:50:57 +08:00
|
|
|
err_puts(m, tiling_flag(err->tiling));
|
|
|
|
err_puts(m, dirty_flag(err->dirty));
|
|
|
|
err_puts(m, purgeable_flag(err->purgeable));
|
drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl
By exporting the ability to map user address and inserting PTEs
representing their backing pages into the GTT, we can exploit UMA in order
to utilize normal application data as a texture source or even as a
render target (depending upon the capabilities of the chipset). This has
a number of uses, with zero-copy downloads to the GPU and efficient
readback making the intermixed streaming of CPU and GPU operations
fairly efficient. This ability has many widespread implications from
faster rendering of client-side software rasterisers (chromium),
mitigation of stalls due to read back (firefox) and to faster pipelining
of texture data (such as pixel buffer objects in GL or data blobs in CL).
v2: Compile with CONFIG_MMU_NOTIFIER
v3: We can sleep while performing invalidate-range, which we can utilise
to drop our page references prior to the kernel manipulating the vma
(for either discard or cloning) and so protect normal users.
v4: Only run the invalidate notifier if the range intercepts the bo.
v5: Prevent userspace from attempting to GTT mmap non-page aligned buffers
v6: Recheck after reacquire mutex for lost mmu.
v7: Fix implicit padding of ioctl struct by rounding to next 64bit boundary.
v8: Fix rebasing error after forwarding porting the back port.
v9: Limit the userptr to page aligned entries. We now expect userspace
to handle all the offset-in-page adjustments itself.
v10: Prevent vma from being copied across fork to avoid issues with cow.
v11: Drop vma behaviour changes -- locking is nigh on impossible.
Use a worker to load user pages to avoid lock inversions.
v12: Use get_task_mm()/mmput() for correct refcounting of mm.
v13: Use a worker to release the mmu_notifier to avoid lock inversion
v14: Decouple mmu_notifier from struct_mutex using a custom mmu_notifer
with its own locking and tree of objects for each mm/mmu_notifier.
v15: Prevent overlapping userptr objects, and invalidate all objects
within the mmu_notifier range
v16: Fix a typo for iterating over multiple objects in the range and
rearrange error path to destroy the mmu_notifier locklessly.
Also close a race between invalidate_range and the get_pages_worker.
v17: Close a race between get_pages_worker/invalidate_range and fresh
allocations of the same userptr range - and notice that
struct_mutex was presumed to be held when during creation it wasn't.
v18: Sigh. Fix the refactor of st_set_pages() to allocate enough memory
for the struct sg_table and to clear it before reporting an error.
v19: Always error out on read-only userptr requests as we don't have the
hardware infrastructure to support them at the moment.
v20: Refuse to implement read-only support until we have the required
infrastructure - but reserve the bit in flags for future use.
v21: use_mm() is not required for get_user_pages(). It is only meant to
be used to fix up the kernel thread's current->mm for use with
copy_user().
v22: Use sg_alloc_table_from_pages for that chunky feeling
v23: Export a function for sanity checking dma-buf rather than encode
userptr details elsewhere, and clean up comments based on
suggestions by Bradley.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Akash Goel <akash.goel@intel.com>
Cc: "Volkin, Bradley D" <bradley.d.volkin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Reviewed-by: Brad Volkin <bradley.d.volkin@intel.com>
[danvet: Frob ioctl allocation to pick the next one - will cause a bit
of fuss with create2 apparently, but such are the rules.]
[danvet2: oops, forgot to git add after manual patch application]
[danvet3: Appease sparse.]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-05-16 21:22:37 +08:00
|
|
|
err_puts(m, err->userptr ? " userptr" : "");
|
2016-07-27 16:07:28 +08:00
|
|
|
err_puts(m, err->engine != -1 ? " " : "");
|
|
|
|
err_puts(m, engine_str(err->engine));
|
2014-08-22 21:41:39 +08:00
|
|
|
err_puts(m, i915_cache_level_str(m->i915, err->cache_level));
|
2013-07-12 21:50:57 +08:00
|
|
|
|
|
|
|
if (err->name)
|
|
|
|
err_printf(m, " (name: %d)", err->name);
|
|
|
|
if (err->fence_reg != I915_FENCE_REG_NONE)
|
|
|
|
err_printf(m, " (fence: %d)", err->fence_reg);
|
|
|
|
|
|
|
|
err_puts(m, "\n");
|
|
|
|
err++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-09-20 21:54:32 +08:00
|
|
|
static void error_print_instdone(struct drm_i915_error_state_buf *m,
|
2017-02-15 00:46:11 +08:00
|
|
|
const struct drm_i915_error_engine *ee)
|
2016-09-20 21:54:32 +08:00
|
|
|
{
|
2016-09-20 21:54:33 +08:00
|
|
|
int slice;
|
|
|
|
int subslice;
|
|
|
|
|
2016-09-20 21:54:32 +08:00
|
|
|
err_printf(m, " INSTDONE: 0x%08x\n",
|
|
|
|
ee->instdone.instdone);
|
|
|
|
|
|
|
|
if (ee->engine_id != RCS || INTEL_GEN(m->i915) <= 3)
|
|
|
|
return;
|
|
|
|
|
|
|
|
err_printf(m, " SC_INSTDONE: 0x%08x\n",
|
|
|
|
ee->instdone.slice_common);
|
|
|
|
|
|
|
|
if (INTEL_GEN(m->i915) <= 6)
|
|
|
|
return;
|
|
|
|
|
2016-09-20 21:54:33 +08:00
|
|
|
for_each_instdone_slice_subslice(m->i915, slice, subslice)
|
|
|
|
err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
|
|
|
|
slice, subslice,
|
|
|
|
ee->instdone.sampler[slice][subslice]);
|
|
|
|
|
|
|
|
for_each_instdone_slice_subslice(m->i915, slice, subslice)
|
|
|
|
err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n",
|
|
|
|
slice, subslice,
|
|
|
|
ee->instdone.row[slice][subslice]);
|
2016-09-20 21:54:32 +08:00
|
|
|
}
|
|
|
|
|
2016-10-13 18:18:14 +08:00
|
|
|
static void error_print_request(struct drm_i915_error_state_buf *m,
|
|
|
|
const char *prefix,
|
2017-02-15 00:46:11 +08:00
|
|
|
const struct drm_i915_error_request *erq)
|
2016-10-13 18:18:14 +08:00
|
|
|
{
|
|
|
|
if (!erq->seqno)
|
|
|
|
return;
|
|
|
|
|
2016-11-16 23:20:32 +08:00
|
|
|
err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x, emitted %dms ago, head %08x, tail %08x\n",
|
|
|
|
prefix, erq->pid, erq->ban_score,
|
2016-10-13 18:18:14 +08:00
|
|
|
erq->context, erq->seqno,
|
|
|
|
jiffies_to_msecs(jiffies - erq->jiffies),
|
|
|
|
erq->head, erq->tail);
|
|
|
|
}
|
|
|
|
|
2017-01-29 17:24:33 +08:00
|
|
|
static void error_print_context(struct drm_i915_error_state_buf *m,
|
|
|
|
const char *header,
|
2017-02-15 00:46:11 +08:00
|
|
|
const struct drm_i915_error_context *ctx)
|
2017-01-29 17:24:33 +08:00
|
|
|
{
|
|
|
|
err_printf(m, "%s%s[%d] user_handle %d hw_id %d, ban score %d guilty %d active %d\n",
|
|
|
|
header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
|
|
|
|
ctx->ban_score, ctx->guilty, ctx->active);
|
|
|
|
}
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
static void error_print_engine(struct drm_i915_error_state_buf *m,
|
2017-02-15 00:46:11 +08:00
|
|
|
const struct drm_i915_error_engine *ee)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2017-09-22 20:43:07 +08:00
|
|
|
int n;
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
err_printf(m, "%s command stream:\n", engine_str(ee->engine_id));
|
|
|
|
err_printf(m, " START: 0x%08x\n", ee->start);
|
2016-10-13 18:18:15 +08:00
|
|
|
err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head);
|
2016-10-05 04:11:30 +08:00
|
|
|
err_printf(m, " TAIL: 0x%08x [0x%08x, 0x%08x]\n",
|
|
|
|
ee->tail, ee->rq_post, ee->rq_tail);
|
2016-07-27 16:07:28 +08:00
|
|
|
err_printf(m, " CTL: 0x%08x\n", ee->ctl);
|
2016-08-15 17:49:11 +08:00
|
|
|
err_printf(m, " MODE: 0x%08x\n", ee->mode);
|
2016-07-27 16:07:28 +08:00
|
|
|
err_printf(m, " HWS: 0x%08x\n", ee->hws);
|
|
|
|
err_printf(m, " ACTHD: 0x%08x %08x\n",
|
|
|
|
(u32)(ee->acthd>>32), (u32)ee->acthd);
|
|
|
|
err_printf(m, " IPEIR: 0x%08x\n", ee->ipeir);
|
|
|
|
err_printf(m, " IPEHR: 0x%08x\n", ee->ipehr);
|
2016-09-20 21:54:32 +08:00
|
|
|
|
|
|
|
error_print_instdone(m, ee);
|
|
|
|
|
2016-08-15 17:49:09 +08:00
|
|
|
if (ee->batchbuffer) {
|
|
|
|
u64 start = ee->batchbuffer->gtt_offset;
|
|
|
|
u64 end = start + ee->batchbuffer->gtt_size;
|
|
|
|
|
|
|
|
err_printf(m, " batch: [0x%08x_%08x, 0x%08x_%08x]\n",
|
|
|
|
upper_32_bits(start), lower_32_bits(start),
|
|
|
|
upper_32_bits(end), lower_32_bits(end));
|
|
|
|
}
|
2016-07-27 16:07:28 +08:00
|
|
|
if (INTEL_GEN(m->i915) >= 4) {
|
2016-08-15 17:49:09 +08:00
|
|
|
err_printf(m, " BBADDR: 0x%08x_%08x\n",
|
2016-07-27 16:07:28 +08:00
|
|
|
(u32)(ee->bbaddr>>32), (u32)ee->bbaddr);
|
|
|
|
err_printf(m, " BB_STATE: 0x%08x\n", ee->bbstate);
|
|
|
|
err_printf(m, " INSTPS: 0x%08x\n", ee->instps);
|
2013-12-11 03:44:43 +08:00
|
|
|
}
|
2016-07-27 16:07:28 +08:00
|
|
|
err_printf(m, " INSTPM: 0x%08x\n", ee->instpm);
|
|
|
|
err_printf(m, " FADDR: 0x%08x %08x\n", upper_32_bits(ee->faddr),
|
|
|
|
lower_32_bits(ee->faddr));
|
|
|
|
if (INTEL_GEN(m->i915) >= 6) {
|
|
|
|
err_printf(m, " RC PSMI: 0x%08x\n", ee->rc_psmi);
|
|
|
|
err_printf(m, " FAULT_REG: 0x%08x\n", ee->fault_reg);
|
2016-10-28 20:58:53 +08:00
|
|
|
err_printf(m, " SYNC_0: 0x%08x\n",
|
|
|
|
ee->semaphore_mboxes[0]);
|
|
|
|
err_printf(m, " SYNC_1: 0x%08x\n",
|
|
|
|
ee->semaphore_mboxes[1]);
|
|
|
|
if (HAS_VEBOX(m->i915))
|
|
|
|
err_printf(m, " SYNC_2: 0x%08x\n",
|
|
|
|
ee->semaphore_mboxes[2]);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
2016-07-27 16:07:28 +08:00
|
|
|
if (USES_PPGTT(m->i915)) {
|
|
|
|
err_printf(m, " GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);
|
2014-01-30 16:19:40 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
if (INTEL_GEN(m->i915) >= 8) {
|
2014-01-30 16:19:40 +08:00
|
|
|
int i;
|
|
|
|
for (i = 0; i < 4; i++)
|
|
|
|
err_printf(m, " PDP%d: 0x%016llx\n",
|
2016-07-27 16:07:28 +08:00
|
|
|
i, ee->vm_info.pdp[i]);
|
2014-01-30 16:19:40 +08:00
|
|
|
} else {
|
|
|
|
err_printf(m, " PP_DIR_BASE: 0x%08x\n",
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->vm_info.pp_dir_base);
|
2014-01-30 16:19:40 +08:00
|
|
|
}
|
|
|
|
}
|
2016-07-27 16:07:28 +08:00
|
|
|
err_printf(m, " seqno: 0x%08x\n", ee->seqno);
|
|
|
|
err_printf(m, " last_seqno: 0x%08x\n", ee->last_seqno);
|
|
|
|
err_printf(m, " waiting: %s\n", yesno(ee->waiting));
|
|
|
|
err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head);
|
|
|
|
err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail);
|
2016-11-18 21:09:04 +08:00
|
|
|
err_printf(m, " hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
|
|
|
|
err_printf(m, " hangcheck action: %s\n",
|
|
|
|
hangcheck_action_to_str(ee->hangcheck_action));
|
|
|
|
err_printf(m, " hangcheck action timestamp: %lu, %u ms ago\n",
|
|
|
|
ee->hangcheck_timestamp,
|
|
|
|
jiffies_to_msecs(jiffies - ee->hangcheck_timestamp));
|
2017-06-20 17:57:48 +08:00
|
|
|
err_printf(m, " engine reset count: %u\n", ee->reset_count);
|
2016-11-18 21:09:04 +08:00
|
|
|
|
2017-09-22 20:43:07 +08:00
|
|
|
for (n = 0; n < ee->num_ports; n++) {
|
|
|
|
err_printf(m, " ELSP[%d]:", n);
|
|
|
|
error_print_request(m, " ", &ee->execlist[n]);
|
|
|
|
}
|
|
|
|
|
2017-01-29 17:24:33 +08:00
|
|
|
error_print_context(m, " Active context: ", &ee->context);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * printf-style front end for appending formatted text to @e; collects
 * the variadic arguments and hands them to i915_error_vprintf().
 */
void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
{
	va_list ap;

	va_start(ap, f);
	i915_error_vprintf(e, f, ap);
	va_end(ap);
}
|
|
|
|
|
2016-10-12 17:05:22 +08:00
|
|
|
/*
 * Number of 4-byte groups needed to cover @len bytes, i.e. @len divided
 * by four and rounded up.
 */
static int
ascii85_encode_len(int len)
{
	const int group = 4;

	return (len + group - 1) / group;
}
|
|
|
|
|
|
|
|
static bool
|
|
|
|
ascii85_encode(u32 in, char *out)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
if (in == 0)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
out[5] = '\0';
|
|
|
|
for (i = 5; i--; ) {
|
|
|
|
out[i] = '!' + in % 85;
|
|
|
|
in /= 85;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2014-02-25 23:11:24 +08:00
|
|
|
static void print_error_obj(struct drm_i915_error_state_buf *m,
|
2016-10-12 17:05:21 +08:00
|
|
|
struct intel_engine_cs *engine,
|
|
|
|
const char *name,
|
2014-02-25 23:11:24 +08:00
|
|
|
struct drm_i915_error_object *obj)
|
|
|
|
{
|
2016-10-12 17:05:22 +08:00
|
|
|
char out[6];
|
|
|
|
int page;
|
2014-02-25 23:11:24 +08:00
|
|
|
|
2016-10-12 17:05:21 +08:00
|
|
|
if (!obj)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (name) {
|
|
|
|
err_printf(m, "%s --- %s = 0x%08x %08x\n",
|
|
|
|
engine ? engine->name : "global", name,
|
|
|
|
upper_32_bits(obj->gtt_offset),
|
|
|
|
lower_32_bits(obj->gtt_offset));
|
|
|
|
}
|
|
|
|
|
2016-10-12 17:05:22 +08:00
|
|
|
err_compression_marker(m);
|
|
|
|
for (page = 0; page < obj->page_count; page++) {
|
|
|
|
int i, len;
|
|
|
|
|
|
|
|
len = PAGE_SIZE;
|
|
|
|
if (page == obj->page_count - 1)
|
|
|
|
len -= obj->unused;
|
|
|
|
len = ascii85_encode_len(len);
|
|
|
|
|
|
|
|
for (i = 0; i < len; i++) {
|
|
|
|
if (ascii85_encode(obj->pages[page][i], out))
|
|
|
|
err_puts(m, out);
|
|
|
|
else
|
|
|
|
err_puts(m, "z");
|
2014-02-25 23:11:24 +08:00
|
|
|
}
|
|
|
|
}
|
2016-10-12 17:05:22 +08:00
|
|
|
err_puts(m, "\n");
|
2014-02-25 23:11:24 +08:00
|
|
|
}
|
|
|
|
|
2016-08-15 17:48:45 +08:00
|
|
|
static void err_print_capabilities(struct drm_i915_error_state_buf *m,
|
|
|
|
const struct intel_device_info *info)
|
|
|
|
{
|
|
|
|
#define PRINT_FLAG(x) err_printf(m, #x ": %s\n", yesno(info->x))
|
2016-10-05 18:50:16 +08:00
|
|
|
DEV_INFO_FOR_EACH_FLAG(PRINT_FLAG);
|
2016-08-15 17:48:45 +08:00
|
|
|
#undef PRINT_FLAG
|
|
|
|
}
|
|
|
|
|
2017-02-07 05:36:07 +08:00
|
|
|
static __always_inline void err_print_param(struct drm_i915_error_state_buf *m,
|
|
|
|
const char *name,
|
|
|
|
const char *type,
|
|
|
|
const void *x)
|
|
|
|
{
|
|
|
|
if (!__builtin_strcmp(type, "bool"))
|
|
|
|
err_printf(m, "i915.%s=%s\n", name, yesno(*(const bool *)x));
|
|
|
|
else if (!__builtin_strcmp(type, "int"))
|
|
|
|
err_printf(m, "i915.%s=%d\n", name, *(const int *)x);
|
|
|
|
else if (!__builtin_strcmp(type, "unsigned int"))
|
|
|
|
err_printf(m, "i915.%s=%u\n", name, *(const unsigned int *)x);
|
2017-02-22 00:26:19 +08:00
|
|
|
else if (!__builtin_strcmp(type, "char *"))
|
|
|
|
err_printf(m, "i915.%s=%s\n", name, *(const char **)x);
|
2017-02-07 05:36:07 +08:00
|
|
|
else
|
|
|
|
BUILD_BUG();
|
|
|
|
}
|
|
|
|
|
|
|
|
static void err_print_params(struct drm_i915_error_state_buf *m,
|
|
|
|
const struct i915_params *p)
|
|
|
|
{
|
|
|
|
#define PRINT(T, x) err_print_param(m, #x, #T, &p->x);
|
|
|
|
I915_PARAMS_FOR_EACH(PRINT);
|
|
|
|
#undef PRINT
|
|
|
|
}
|
|
|
|
|
2017-02-15 00:46:11 +08:00
|
|
|
static void err_print_pciid(struct drm_i915_error_state_buf *m,
|
|
|
|
struct drm_i915_private *i915)
|
|
|
|
{
|
|
|
|
struct pci_dev *pdev = i915->drm.pdev;
|
|
|
|
|
|
|
|
err_printf(m, "PCI ID: 0x%04x\n", pdev->device);
|
|
|
|
err_printf(m, "PCI Revision: 0x%02x\n", pdev->revision);
|
|
|
|
err_printf(m, "PCI Subsystem: %04x:%04x\n",
|
|
|
|
pdev->subsystem_vendor,
|
|
|
|
pdev->subsystem_device);
|
|
|
|
}
|
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
int i915_error_state_to_str(struct drm_i915_error_state_buf *m,
|
2017-02-15 00:46:11 +08:00
|
|
|
const struct i915_gpu_state *error)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2017-02-15 00:46:11 +08:00
|
|
|
struct drm_i915_private *dev_priv = m->i915;
|
2014-07-01 00:53:41 +08:00
|
|
|
struct drm_i915_error_object *obj;
|
2016-10-12 17:05:21 +08:00
|
|
|
int i, j;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
|
|
|
if (!error) {
|
2017-02-15 00:46:11 +08:00
|
|
|
err_printf(m, "No error state collected\n");
|
|
|
|
return 0;
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2017-02-15 00:46:11 +08:00
|
|
|
if (*error->error_msg)
|
|
|
|
err_printf(m, "%s\n", error->error_msg);
|
2013-07-12 21:50:57 +08:00
|
|
|
err_printf(m, "Kernel: " UTS_RELEASE "\n");
|
2016-10-25 20:16:02 +08:00
|
|
|
err_printf(m, "Time: %ld s %ld us\n",
|
|
|
|
error->time.tv_sec, error->time.tv_usec);
|
|
|
|
err_printf(m, "Boottime: %ld s %ld us\n",
|
|
|
|
error->boottime.tv_sec, error->boottime.tv_usec);
|
|
|
|
err_printf(m, "Uptime: %ld s %ld us\n",
|
|
|
|
error->uptime.tv_sec, error->uptime.tv_usec);
|
2016-11-18 21:09:04 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
|
2016-11-18 21:09:04 +08:00
|
|
|
if (error->engine[i].hangcheck_stalled &&
|
2017-01-29 17:24:33 +08:00
|
|
|
error->engine[i].context.pid) {
|
|
|
|
err_printf(m, "Active process (on ring %s): %s [%d], score %d\n",
|
2016-07-27 16:07:28 +08:00
|
|
|
engine_str(i),
|
2017-01-29 17:24:33 +08:00
|
|
|
error->engine[i].context.comm,
|
|
|
|
error->engine[i].context.pid,
|
|
|
|
error->engine[i].context.ban_score);
|
2014-02-25 23:11:24 +08:00
|
|
|
}
|
|
|
|
}
|
2014-02-25 23:11:27 +08:00
|
|
|
err_printf(m, "Reset count: %u\n", error->reset_count);
|
2014-02-25 23:11:28 +08:00
|
|
|
err_printf(m, "Suspend count: %u\n", error->suspend_count);
|
2016-12-01 20:49:55 +08:00
|
|
|
err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
|
2017-02-15 00:46:11 +08:00
|
|
|
err_print_pciid(m, error->i915);
|
2017-02-07 05:36:07 +08:00
|
|
|
|
2015-08-08 03:24:15 +08:00
|
|
|
err_printf(m, "IOMMU enabled?: %d\n", error->iommu);
|
2015-10-29 21:21:19 +08:00
|
|
|
|
2016-11-07 17:29:20 +08:00
|
|
|
if (HAS_CSR(dev_priv)) {
|
2015-10-29 21:21:19 +08:00
|
|
|
struct intel_csr *csr = &dev_priv->csr;
|
|
|
|
|
|
|
|
err_printf(m, "DMC loaded: %s\n",
|
|
|
|
yesno(csr->dmc_payload != NULL));
|
|
|
|
err_printf(m, "DMC fw version: %d.%d\n",
|
|
|
|
CSR_VERSION_MAJOR(csr->version),
|
|
|
|
CSR_VERSION_MINOR(csr->version));
|
|
|
|
}
|
|
|
|
|
2017-03-02 23:03:56 +08:00
|
|
|
err_printf(m, "GT awake: %s\n", yesno(error->awake));
|
2017-03-02 23:15:44 +08:00
|
|
|
err_printf(m, "RPM wakelock: %s\n", yesno(error->wakelock));
|
|
|
|
err_printf(m, "PM suspended: %s\n", yesno(error->suspended));
|
2013-07-12 21:50:57 +08:00
|
|
|
err_printf(m, "EIR: 0x%08x\n", error->eir);
|
|
|
|
err_printf(m, "IER: 0x%08x\n", error->ier);
|
2017-02-15 00:46:11 +08:00
|
|
|
for (i = 0; i < error->ngtier; i++)
|
|
|
|
err_printf(m, "GTIER[%d]: 0x%08x\n", i, error->gtier[i]);
|
2013-07-12 21:50:57 +08:00
|
|
|
err_printf(m, "PGTBL_ER: 0x%08x\n", error->pgtbl_er);
|
|
|
|
err_printf(m, "FORCEWAKE: 0x%08x\n", error->forcewake);
|
|
|
|
err_printf(m, "DERRMR: 0x%08x\n", error->derrmr);
|
|
|
|
err_printf(m, "CCID: 0x%08x\n", error->ccid);
|
2013-09-26 00:34:55 +08:00
|
|
|
err_printf(m, "Missed interrupts: 0x%08lx\n", dev_priv->gpu_error.missed_irq_rings);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2017-02-15 00:46:11 +08:00
|
|
|
for (i = 0; i < error->nfence; i++)
|
2013-07-12 21:50:57 +08:00
|
|
|
err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]);
|
|
|
|
|
2016-11-16 16:55:37 +08:00
|
|
|
if (INTEL_GEN(dev_priv) >= 6) {
|
2013-07-12 21:50:57 +08:00
|
|
|
err_printf(m, "ERROR: 0x%08x\n", error->error);
|
2015-03-24 20:54:19 +08:00
|
|
|
|
2016-11-16 16:55:37 +08:00
|
|
|
if (INTEL_GEN(dev_priv) >= 8)
|
2015-03-24 20:54:19 +08:00
|
|
|
err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
|
|
|
|
error->fault_data1, error->fault_data0);
|
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
|
|
|
|
}
|
|
|
|
|
2016-10-13 18:03:10 +08:00
|
|
|
if (IS_GEN7(dev_priv))
|
2013-07-12 21:50:57 +08:00
|
|
|
err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
|
|
|
|
if (error->engine[i].engine_id != -1)
|
|
|
|
error_print_engine(m, &error->engine[i]);
|
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
|
|
|
|
char buf[128];
|
|
|
|
int len, first = 1;
|
2014-08-13 03:05:47 +08:00
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
if (!error->active_vm[i])
|
|
|
|
break;
|
|
|
|
|
|
|
|
len = scnprintf(buf, sizeof(buf), "Active (");
|
|
|
|
for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
|
|
|
|
if (error->engine[j].vm != error->active_vm[i])
|
|
|
|
continue;
|
|
|
|
|
|
|
|
len += scnprintf(buf + len, sizeof(buf), "%s%s",
|
|
|
|
first ? "" : ", ",
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of addition of many more number of rings in future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
the i915.o file (but for the i915.ko file the text size remains the same, at 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-14 01:14:48 +08:00
|
|
|
dev_priv->engine[j]->name);
|
2016-08-15 17:48:41 +08:00
|
|
|
first = 0;
|
|
|
|
}
|
|
|
|
scnprintf(buf + len, sizeof(buf), ")");
|
|
|
|
print_error_buffers(m, buf,
|
2014-08-13 03:05:47 +08:00
|
|
|
error->active_bo[i],
|
|
|
|
error->active_bo_count[i]);
|
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
print_error_buffers(m, "Pinned (global)",
|
|
|
|
error->pinned_bo,
|
|
|
|
error->pinned_bo_count);
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
|
2017-02-15 00:46:11 +08:00
|
|
|
const struct drm_i915_error_engine *ee = &error->engine[i];
|
2016-07-27 16:07:28 +08:00
|
|
|
|
|
|
|
obj = ee->batchbuffer;
|
2014-02-25 23:11:24 +08:00
|
|
|
if (obj) {
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of many more rings being added in the future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
the i915.o file (but for the i915.ko file the text size remains the same, at 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-14 01:14:48 +08:00
|
|
|
err_puts(m, dev_priv->engine[i]->name);
|
2017-01-29 17:24:33 +08:00
|
|
|
if (ee->context.pid)
|
|
|
|
err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d)",
|
|
|
|
ee->context.comm,
|
|
|
|
ee->context.pid,
|
|
|
|
ee->context.handle,
|
|
|
|
ee->context.hw_id,
|
|
|
|
ee->context.ban_score);
|
2015-07-30 00:23:56 +08:00
|
|
|
err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
|
|
|
|
upper_32_bits(obj->gtt_offset),
|
|
|
|
lower_32_bits(obj->gtt_offset));
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of many more rings being added in the future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
the i915.o file (but for the i915.ko file the text size remains the same, at 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-14 01:14:48 +08:00
|
|
|
print_error_obj(m, dev_priv->engine[i], NULL, obj);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2017-04-15 17:39:02 +08:00
|
|
|
for (j = 0; j < ee->user_bo_count; j++)
|
|
|
|
print_error_obj(m, dev_priv->engine[i],
|
|
|
|
"user", ee->user_bo[j]);
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
if (ee->num_requests) {
|
2013-07-12 21:50:57 +08:00
|
|
|
err_printf(m, "%s --- %d requests\n",
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of many more rings being added in the future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
the i915.o file (but for the i915.ko file the text size remains the same, at 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-14 01:14:48 +08:00
|
|
|
dev_priv->engine[i]->name,
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->num_requests);
|
2016-10-13 18:18:14 +08:00
|
|
|
for (j = 0; j < ee->num_requests; j++)
|
|
|
|
error_print_request(m, " ", &ee->requests[j]);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2016-09-06 15:38:44 +08:00
|
|
|
if (IS_ERR(ee->waiters)) {
|
|
|
|
err_printf(m, "%s --- ? waiters [unable to acquire spinlock]\n",
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of many more rings being added in the future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
the i915.o file (but for the i915.ko file the text size remains the same, at 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-14 01:14:48 +08:00
|
|
|
dev_priv->engine[i]->name);
|
2016-09-06 15:38:44 +08:00
|
|
|
} else if (ee->num_waiters) {
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the request is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
err_printf(m, "%s --- %d waiters\n",
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of many more rings being added in the future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
the i915.o file (but for the i915.ko file the text size remains the same, at 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-14 01:14:48 +08:00
|
|
|
dev_priv->engine[i]->name,
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->num_waiters);
|
|
|
|
for (j = 0; j < ee->num_waiters; j++) {
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the request is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
err_printf(m, " seqno 0x%08x for %s [%d]\n",
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->waiters[j].seqno,
|
|
|
|
ee->waiters[j].comm,
|
|
|
|
ee->waiters[j].pid);
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the request is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of many more rings being added in the future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
i915.o file (but for i915.ko file text size remain same as 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-14 01:14:48 +08:00
|
|
|
print_error_obj(m, dev_priv->engine[i],
|
2016-10-12 17:05:21 +08:00
|
|
|
"ringbuffer", ee->ringbuffer);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of addition of many more number of rings in future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
i915.o file (but for i915.ko file text size remain same as 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-14 01:14:48 +08:00
|
|
|
print_error_obj(m, dev_priv->engine[i],
|
2016-10-12 17:05:21 +08:00
|
|
|
"HW Status", ee->hws_page);
|
2015-09-16 01:03:01 +08:00
|
|
|
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of addition of many more number of rings in future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
i915.o file (but for i915.ko file text size remain same as 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-14 01:14:48 +08:00
|
|
|
print_error_obj(m, dev_priv->engine[i],
|
2016-10-12 17:05:21 +08:00
|
|
|
"HW context", ee->ctx);
|
2014-01-24 06:40:36 +08:00
|
|
|
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of addition of many more number of rings in future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
i915.o file (but for i915.ko file text size remain same as 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-14 01:14:48 +08:00
|
|
|
print_error_obj(m, dev_priv->engine[i],
|
2016-10-12 17:05:21 +08:00
|
|
|
"WA context", ee->wa_ctx);
|
2016-03-01 19:24:36 +08:00
|
|
|
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of addition of many more number of rings in future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
i915.o file (but for i915.ko file text size remain same as 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-14 01:14:48 +08:00
|
|
|
print_error_obj(m, dev_priv->engine[i],
|
2016-10-12 17:05:21 +08:00
|
|
|
"WA batchbuffer", ee->wa_batchbuffer);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2016-10-12 17:05:21 +08:00
|
|
|
print_error_obj(m, NULL, "Semaphores", error->semaphore);
|
2014-07-01 00:53:41 +08:00
|
|
|
|
2016-10-13 00:24:39 +08:00
|
|
|
print_error_obj(m, NULL, "GuC log buffer", error->guc_log);
|
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
if (error->overlay)
|
|
|
|
intel_overlay_print_error_state(m, error->overlay);
|
|
|
|
|
|
|
|
if (error->display)
|
2017-02-15 00:46:11 +08:00
|
|
|
intel_display_print_error_state(m, error->display);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2017-02-07 05:36:07 +08:00
|
|
|
err_print_capabilities(m, &error->device_info);
|
|
|
|
err_print_params(m, &error->params);
|
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
if (m->bytes == 0 && m->err)
|
|
|
|
return m->err;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int i915_error_state_buf_init(struct drm_i915_error_state_buf *ebuf,
|
2014-08-22 21:41:39 +08:00
|
|
|
struct drm_i915_private *i915,
|
2013-07-12 21:50:57 +08:00
|
|
|
size_t count, loff_t pos)
|
|
|
|
{
|
|
|
|
memset(ebuf, 0, sizeof(*ebuf));
|
2014-08-22 21:41:39 +08:00
|
|
|
ebuf->i915 = i915;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
|
|
|
/* We need to have enough room to store any i915_error_state printf
|
|
|
|
* so that we can move it to start position.
|
|
|
|
*/
|
|
|
|
ebuf->size = count + 1 > PAGE_SIZE ? count + 1 : PAGE_SIZE;
|
|
|
|
ebuf->buf = kmalloc(ebuf->size,
|
|
|
|
GFP_TEMPORARY | __GFP_NORETRY | __GFP_NOWARN);
|
|
|
|
|
|
|
|
if (ebuf->buf == NULL) {
|
|
|
|
ebuf->size = PAGE_SIZE;
|
|
|
|
ebuf->buf = kmalloc(ebuf->size, GFP_TEMPORARY);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ebuf->buf == NULL) {
|
|
|
|
ebuf->size = 128;
|
|
|
|
ebuf->buf = kmalloc(ebuf->size, GFP_TEMPORARY);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ebuf->buf == NULL)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
ebuf->start = pos;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void i915_error_object_free(struct drm_i915_error_object *obj)
|
|
|
|
{
|
|
|
|
int page;
|
|
|
|
|
|
|
|
if (obj == NULL)
|
|
|
|
return;
|
|
|
|
|
|
|
|
for (page = 0; page < obj->page_count; page++)
|
2016-10-12 17:05:20 +08:00
|
|
|
free_page((unsigned long)obj->pages[page]);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
|
|
|
kfree(obj);
|
|
|
|
}
|
|
|
|
|
2017-02-22 00:26:19 +08:00
|
|
|
/*
 * Free the value held in one parameter slot, if it owns memory.
 *
 * @type is the slot's C type spelled as a string and @x points at the slot.
 * Only slots whose type string is exactly "char *" are treated as holding an
 * allocated pointer, which is handed to kfree(); all other types are ignored.
 */
static __always_inline void free_param(const char *type, void *x)
{
	void **slot;

	if (__builtin_strcmp(type, "char *") != 0)
		return;

	slot = x;
	kfree(*slot);
}
|
|
|
|
|
2017-02-15 00:46:11 +08:00
|
|
|
void __i915_gpu_state_free(struct kref *error_ref)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *error =
|
|
|
|
container_of(error_ref, typeof(*error), ref);
|
2017-04-15 17:39:02 +08:00
|
|
|
long i, j;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
|
|
|
|
struct drm_i915_error_engine *ee = &error->engine[i];
|
|
|
|
|
2017-04-15 17:39:02 +08:00
|
|
|
for (j = 0; j < ee->user_bo_count; j++)
|
|
|
|
i915_error_object_free(ee->user_bo[j]);
|
|
|
|
kfree(ee->user_bo);
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
i915_error_object_free(ee->batchbuffer);
|
|
|
|
i915_error_object_free(ee->wa_batchbuffer);
|
|
|
|
i915_error_object_free(ee->ringbuffer);
|
|
|
|
i915_error_object_free(ee->hws_page);
|
|
|
|
i915_error_object_free(ee->ctx);
|
|
|
|
i915_error_object_free(ee->wa_ctx);
|
|
|
|
|
|
|
|
kfree(ee->requests);
|
2016-09-06 15:38:44 +08:00
|
|
|
if (!IS_ERR_OR_NULL(ee->waiters))
|
|
|
|
kfree(ee->waiters);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2016-08-15 17:49:02 +08:00
|
|
|
i915_error_object_free(error->semaphore);
|
2016-10-13 00:24:39 +08:00
|
|
|
i915_error_object_free(error->guc_log);
|
2015-03-20 17:41:03 +08:00
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
|
2015-03-20 17:41:03 +08:00
|
|
|
kfree(error->active_bo[i]);
|
|
|
|
kfree(error->pinned_bo);
|
2016-08-15 17:48:41 +08:00
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
kfree(error->overlay);
|
|
|
|
kfree(error->display);
|
2017-02-22 00:26:19 +08:00
|
|
|
|
|
|
|
#define FREE(T, x) free_param(#T, &error->params.x);
|
|
|
|
I915_PARAMS_FOR_EACH(FREE);
|
|
|
|
#undef FREE
|
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
kfree(error);
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct drm_i915_error_object *
|
2016-10-12 17:05:20 +08:00
|
|
|
i915_error_object_create(struct drm_i915_private *i915,
|
2016-08-15 17:49:06 +08:00
|
|
|
struct i915_vma *vma)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2016-10-12 17:05:20 +08:00
|
|
|
struct i915_ggtt *ggtt = &i915->ggtt;
|
|
|
|
const u64 slot = ggtt->error_capture.start;
|
2013-07-12 21:50:57 +08:00
|
|
|
struct drm_i915_error_object *dst;
|
2016-12-06 20:40:51 +08:00
|
|
|
struct compress compress;
|
2016-10-12 17:05:20 +08:00
|
|
|
unsigned long num_pages;
|
|
|
|
struct sgt_iter iter;
|
|
|
|
dma_addr_t dma;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-08-15 17:49:06 +08:00
|
|
|
if (!vma)
|
|
|
|
return NULL;
|
|
|
|
|
2016-10-12 17:05:20 +08:00
|
|
|
num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
|
2016-10-12 17:05:22 +08:00
|
|
|
num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
|
2016-10-12 17:05:20 +08:00
|
|
|
dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
|
|
|
|
GFP_ATOMIC | __GFP_NOWARN);
|
2016-08-15 17:49:06 +08:00
|
|
|
if (!dst)
|
2013-07-12 21:50:57 +08:00
|
|
|
return NULL;
|
|
|
|
|
2016-08-15 17:49:09 +08:00
|
|
|
dst->gtt_offset = vma->node.start;
|
|
|
|
dst->gtt_size = vma->node.size;
|
2016-10-12 17:05:20 +08:00
|
|
|
dst->page_count = 0;
|
2016-10-12 17:05:22 +08:00
|
|
|
dst->unused = 0;
|
|
|
|
|
2016-12-06 20:40:51 +08:00
|
|
|
if (!compress_init(&compress)) {
|
2016-10-12 17:05:22 +08:00
|
|
|
kfree(dst);
|
|
|
|
return NULL;
|
|
|
|
}
|
2016-08-15 17:49:09 +08:00
|
|
|
|
2016-10-12 17:05:20 +08:00
|
|
|
for_each_sgt_dma(dma, iter, vma->pages) {
|
|
|
|
void __iomem *s;
|
|
|
|
int ret;
|
2014-08-13 03:05:48 +08:00
|
|
|
|
2016-10-12 17:05:20 +08:00
|
|
|
ggtt->base.insert_page(&ggtt->base, dma, slot,
|
|
|
|
I915_CACHE_NONE, 0);
|
2014-08-13 03:05:48 +08:00
|
|
|
|
2016-10-12 17:05:20 +08:00
|
|
|
s = io_mapping_map_atomic_wc(&ggtt->mappable, slot);
|
2016-12-06 20:40:51 +08:00
|
|
|
ret = compress_page(&compress, (void __force *)s, dst);
|
2016-10-12 17:05:20 +08:00
|
|
|
io_mapping_unmap_atomic(s);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-10-12 17:05:20 +08:00
|
|
|
if (ret)
|
2013-07-12 21:50:57 +08:00
|
|
|
goto unwind;
|
|
|
|
}
|
2016-10-12 17:05:20 +08:00
|
|
|
goto out;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
|
|
|
unwind:
|
2016-10-12 17:05:20 +08:00
|
|
|
while (dst->page_count--)
|
|
|
|
free_page((unsigned long)dst->pages[dst->page_count]);
|
2013-07-12 21:50:57 +08:00
|
|
|
kfree(dst);
|
2016-10-12 17:05:20 +08:00
|
|
|
dst = NULL;
|
|
|
|
|
|
|
|
out:
|
2016-12-06 20:40:51 +08:00
|
|
|
compress_fini(&compress, dst);
|
2016-10-13 20:02:40 +08:00
|
|
|
ggtt->base.clear_range(&ggtt->base, slot, PAGE_SIZE);
|
2016-10-12 17:05:20 +08:00
|
|
|
return dst;
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2016-08-04 14:52:31 +08:00
|
|
|
/* The error capture is special as tries to run underneath the normal
|
|
|
|
* locking rules - so we use the raw version of the i915_gem_active lookup.
|
|
|
|
*/
|
|
|
|
static inline uint32_t
|
|
|
|
__active_get_seqno(struct i915_gem_active *active)
|
|
|
|
{
|
2016-11-08 15:11:48 +08:00
|
|
|
struct drm_i915_gem_request *request;
|
|
|
|
|
|
|
|
request = __i915_gem_active_peek(active);
|
|
|
|
return request ? request->global_seqno : 0;
|
2016-08-04 14:52:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int
|
|
|
|
__active_get_engine_id(struct i915_gem_active *active)
|
|
|
|
{
|
2016-11-08 15:11:48 +08:00
|
|
|
struct drm_i915_gem_request *request;
|
2016-08-04 14:52:31 +08:00
|
|
|
|
2016-11-08 15:11:48 +08:00
|
|
|
request = __i915_gem_active_peek(active);
|
|
|
|
return request ? request->engine->id : -1;
|
2016-08-04 14:52:31 +08:00
|
|
|
}
|
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
static void capture_bo(struct drm_i915_error_buffer *err,
|
2014-08-13 03:05:47 +08:00
|
|
|
struct i915_vma *vma)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2014-08-13 03:05:47 +08:00
|
|
|
struct drm_i915_gem_object *obj = vma->obj;
|
2015-04-27 20:41:17 +08:00
|
|
|
int i;
|
2014-08-13 03:05:47 +08:00
|
|
|
|
2013-07-12 21:50:57 +08:00
|
|
|
err->size = obj->base.size;
|
|
|
|
err->name = obj->base.name;
|
2016-08-04 14:52:31 +08:00
|
|
|
|
2016-03-16 19:00:39 +08:00
|
|
|
for (i = 0; i < I915_NUM_ENGINES; i++)
|
drm/i915: Move GEM activity tracking into a common struct reservation_object
In preparation to support many distinct timelines, we need to expand the
activity tracking on the GEM object to handle more than just a request
per engine. We already use the struct reservation_object on the dma-buf
to handle many fence contexts, so integrating that into the GEM object
itself is the preferred solution. (For example, we can now share the same
reservation_object between every consumer/producer using this buffer and
skip the manual import/export via dma-buf.)
v2: Reimplement busy-ioctl (by walking the reservation object), postpone
the ABI change for another day. Similarly use the reservation object to
find the last_write request (if active and from i915) for choosing
display CS flips.
Caveats:
* busy-ioctl: busy-ioctl only reports on the native fences, it will not
warn of stalls (in set-domain-ioctl, pread/pwrite etc) if the object is
being rendered to by external fences. It also will not report the same
busy state as wait-ioctl (or polling on the dma-buf) in the same
circumstances. On the plus side, it does retain reporting of which
*i915* engines are engaged with this object.
* non-blocking atomic modesets take a step backwards as the wait for
render completion blocks the ioctl. This is fixed in a subsequent
patch to use a fence instead for awaiting on the rendering, see
"drm/i915: Restore nonblocking awaits for modesetting"
* dynamic array manipulation for shared-fences in reservation is slower
than the previous lockless static assignment (e.g. gem_exec_lut_handle
runtime on ivb goes from 42s to 66s), mainly due to atomic operations
(maintaining the fence refcounts).
* loss of object-level retirement callbacks, emulated by VMA retirement
tracking.
* minor loss of object-level last activity information from debugfs,
could be replaced with per-vma information if desired
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-21-chris@chris-wilson.co.uk
2016-10-28 20:58:44 +08:00
|
|
|
err->rseqno[i] = __active_get_seqno(&vma->last_read[i]);
|
2016-11-17 03:07:04 +08:00
|
|
|
err->wseqno = __active_get_seqno(&obj->frontbuffer_write);
|
|
|
|
err->engine = __active_get_engine_id(&obj->frontbuffer_write);
|
2016-08-04 14:52:31 +08:00
|
|
|
|
2014-08-13 03:05:47 +08:00
|
|
|
err->gtt_offset = vma->node.start;
|
2013-07-12 21:50:57 +08:00
|
|
|
err->read_domains = obj->base.read_domains;
|
|
|
|
err->write_domain = obj->base.write_domain;
|
2016-08-19 00:17:00 +08:00
|
|
|
err->fence_reg = vma->fence ? vma->fence->id : -1;
|
2016-08-05 17:14:23 +08:00
|
|
|
err->tiling = i915_gem_object_get_tiling(obj);
|
2016-10-28 20:58:35 +08:00
|
|
|
err->dirty = obj->mm.dirty;
|
|
|
|
err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
|
drm/i915: Introduce mapping of user pages into video memory (userptr) ioctl
By exporting the ability to map user address and inserting PTEs
representing their backing pages into the GTT, we can exploit UMA in order
to utilize normal application data as a texture source or even as a
render target (depending upon the capabilities of the chipset). This has
a number of uses, with zero-copy downloads to the GPU and efficient
readback making the intermixed streaming of CPU and GPU operations
fairly efficient. This ability has many widespread implications from
faster rendering of client-side software rasterisers (chromium),
mitigation of stalls due to read back (firefox) and to faster pipelining
of texture data (such as pixel buffer objects in GL or data blobs in CL).
v2: Compile with CONFIG_MMU_NOTIFIER
v3: We can sleep while performing invalidate-range, which we can utilise
to drop our page references prior to the kernel manipulating the vma
(for either discard or cloning) and so protect normal users.
v4: Only run the invalidate notifier if the range intercepts the bo.
v5: Prevent userspace from attempting to GTT mmap non-page aligned buffers
v6: Recheck after reacquire mutex for lost mmu.
v7: Fix implicit padding of ioctl struct by rounding to next 64bit boundary.
v8: Fix rebasing error after forwarding porting the back port.
v9: Limit the userptr to page aligned entries. We now expect userspace
to handle all the offset-in-page adjustments itself.
v10: Prevent vma from being copied across fork to avoid issues with cow.
v11: Drop vma behaviour changes -- locking is nigh on impossible.
Use a worker to load user pages to avoid lock inversions.
v12: Use get_task_mm()/mmput() for correct refcounting of mm.
v13: Use a worker to release the mmu_notifier to avoid lock inversion
v14: Decouple mmu_notifier from struct_mutex using a custom mmu_notifer
with its own locking and tree of objects for each mm/mmu_notifier.
v15: Prevent overlapping userptr objects, and invalidate all objects
within the mmu_notifier range
v16: Fix a typo for iterating over multiple objects in the range and
rearrange error path to destroy the mmu_notifier locklessly.
Also close a race between invalidate_range and the get_pages_worker.
v17: Close a race between get_pages_worker/invalidate_range and fresh
allocations of the same userptr range - and notice that
struct_mutex was presumed to be held when during creation it wasn't.
v18: Sigh. Fix the refactor of st_set_pages() to allocate enough memory
for the struct sg_table and to clear it before reporting an error.
v19: Always error out on read-only userptr requests as we don't have the
hardware infrastructure to support them at the moment.
v20: Refuse to implement read-only support until we have the required
infrastructure - but reserve the bit in flags for future use.
v21: use_mm() is not required for get_user_pages(). It is only meant to
be used to fix up the kernel thread's current->mm for use with
copy_user().
v22: Use sg_alloc_table_from_pages for that chunky feeling
v23: Export a function for sanity checking dma-buf rather than encode
userptr details elsewhere, and clean up comments based on
suggestions by Bradley.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Akash Goel <akash.goel@intel.com>
Cc: "Volkin, Bradley D" <bradley.d.volkin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Reviewed-by: Brad Volkin <bradley.d.volkin@intel.com>
[danvet: Frob ioctl allocation to pick the next one - will cause a bit
of fuss with create2 apparently, but such are the rules.]
[danvet2: oops, forgot to git add after manual patch application]
[danvet3: Appease sparse.]
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-05-16 21:22:37 +08:00
|
|
|
err->userptr = obj->userptr.mm != NULL;
|
2013-07-12 21:50:57 +08:00
|
|
|
err->cache_level = obj->cache_level;
|
|
|
|
}
|
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
static u32 capture_error_bo(struct drm_i915_error_buffer *err,
|
|
|
|
int count, struct list_head *head,
|
|
|
|
bool pinned_only)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2013-08-01 08:00:14 +08:00
|
|
|
struct i915_vma *vma;
|
2013-07-12 21:50:57 +08:00
|
|
|
int i = 0;
|
|
|
|
|
2016-02-26 19:03:19 +08:00
|
|
|
list_for_each_entry(vma, head, vm_link) {
|
2016-08-15 17:48:41 +08:00
|
|
|
if (pinned_only && !i915_vma_is_pinned(vma))
|
|
|
|
continue;
|
|
|
|
|
2014-08-13 03:05:47 +08:00
|
|
|
capture_bo(err++, vma);
|
2013-07-12 21:50:57 +08:00
|
|
|
if (++i == count)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return i;
|
|
|
|
}
|
|
|
|
|
2014-02-04 20:18:55 +08:00
|
|
|
/* Generate a semi-unique error code. The code is not meant to have meaning, The
|
|
|
|
* code's only purpose is to try to prevent false duplicated bug reports by
|
|
|
|
* grossly estimating a GPU error state.
|
|
|
|
*
|
|
|
|
* TODO Ideally, hashing the batchbuffer would be a very nice way to determine
|
|
|
|
* the hang if we could strip the GTT offset information from it.
|
|
|
|
*
|
|
|
|
* It's only a small step better than a random number in its current form.
|
|
|
|
*/
|
|
|
|
static uint32_t i915_error_generate_code(struct drm_i915_private *dev_priv,
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *error,
|
2016-07-27 16:07:28 +08:00
|
|
|
int *engine_id)
|
2014-02-04 20:18:55 +08:00
|
|
|
{
|
|
|
|
uint32_t error_code = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
/* IPEHR would be an ideal way to detect errors, as it's the gross
|
|
|
|
* measure of "the command that hung." However, has some very common
|
|
|
|
* synchronization commands which almost always appear in the case
|
|
|
|
* strictly a client bug. Use instdone to differentiate those some.
|
|
|
|
*/
|
2016-03-16 19:00:39 +08:00
|
|
|
for (i = 0; i < I915_NUM_ENGINES; i++) {
|
2016-11-18 21:09:04 +08:00
|
|
|
if (error->engine[i].hangcheck_stalled) {
|
2016-07-27 16:07:28 +08:00
|
|
|
if (engine_id)
|
|
|
|
*engine_id = i;
|
2014-02-25 23:11:25 +08:00
|
|
|
|
2016-09-20 21:54:32 +08:00
|
|
|
return error->engine[i].ipehr ^
|
|
|
|
error->engine[i].instdone.instdone;
|
2014-02-25 23:11:25 +08:00
|
|
|
}
|
|
|
|
}
|
2014-02-04 20:18:55 +08:00
|
|
|
|
|
|
|
return error_code;
|
|
|
|
}
|
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
static void i915_gem_record_fences(struct drm_i915_private *dev_priv,
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *error)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2017-02-15 00:46:11 +08:00
|
|
|
if (INTEL_GEN(dev_priv) >= 6) {
|
2014-12-04 22:48:10 +08:00
|
|
|
for (i = 0; i < dev_priv->num_fence_regs; i++)
|
2017-02-15 00:46:11 +08:00
|
|
|
error->fence[i] = I915_READ64(FENCE_REG_GEN6_LO(i));
|
|
|
|
} else if (INTEL_GEN(dev_priv) >= 4) {
|
2015-09-21 23:05:14 +08:00
|
|
|
for (i = 0; i < dev_priv->num_fence_regs; i++)
|
|
|
|
error->fence[i] = I915_READ64(FENCE_REG_965_LO(i));
|
2017-02-15 00:46:11 +08:00
|
|
|
} else {
|
2015-09-21 23:05:14 +08:00
|
|
|
for (i = 0; i < dev_priv->num_fence_regs; i++)
|
2017-02-15 00:46:11 +08:00
|
|
|
error->fence[i] = I915_READ(FENCE_REG(i));
|
2015-09-21 23:05:14 +08:00
|
|
|
}
|
2017-02-15 00:46:11 +08:00
|
|
|
error->nfence = i;
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2016-10-28 20:58:53 +08:00
|
|
|
static inline u32
|
|
|
|
gen8_engine_sync_index(struct intel_engine_cs *engine,
|
|
|
|
struct intel_engine_cs *other)
|
|
|
|
{
|
|
|
|
int idx;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* rcs -> 0 = vcs, 1 = bcs, 2 = vecs, 3 = vcs2;
|
|
|
|
* vcs -> 0 = bcs, 1 = vecs, 2 = vcs2, 3 = rcs;
|
|
|
|
* bcs -> 0 = vecs, 1 = vcs2. 2 = rcs, 3 = vcs;
|
|
|
|
* vecs -> 0 = vcs2, 1 = rcs, 2 = vcs, 3 = bcs;
|
|
|
|
* vcs2 -> 0 = rcs, 1 = vcs, 2 = bcs, 3 = vecs;
|
|
|
|
*/
|
|
|
|
|
|
|
|
idx = (other - engine) - 1;
|
|
|
|
if (idx < 0)
|
|
|
|
idx += I915_NUM_ENGINES;
|
|
|
|
|
|
|
|
return idx;
|
|
|
|
}
|
2014-07-01 00:53:40 +08:00
|
|
|
|
2017-02-15 00:46:11 +08:00
|
|
|
static void gen8_record_semaphore_state(struct i915_gpu_state *error,
|
2016-03-16 19:00:37 +08:00
|
|
|
struct intel_engine_cs *engine,
|
2016-07-27 16:07:28 +08:00
|
|
|
struct drm_i915_error_engine *ee)
|
2014-07-01 00:53:41 +08:00
|
|
|
{
|
2016-07-27 16:07:28 +08:00
|
|
|
struct drm_i915_private *dev_priv = engine->i915;
|
drm/i915: Fix possible overflow when recording semaphore states.
semaphore _sync_seqno, _seqno and _mbox are smaller than number of rings.
This optimization is to remove the ring itself from the list and the logic to do that
is at intel_ring_sync_index as below:
/*
* rcs -> 0 = vcs, 1 = bcs, 2 = vecs, 3 = vcs2;
* vcs -> 0 = bcs, 1 = vecs, 2 = vcs2, 3 = rcs;
* bcs -> 0 = vecs, 1 = vcs2. 2 = rcs, 3 = vcs;
* vecs -> 0 = vcs2, 1 = rcs, 2 = vcs, 3 = bcs;
* vcs2 -> 0 = rcs, 1 = vcs, 2 = bcs, 3 = vecs;
*/
v2: Skip when from == to (Damien).
v3: avoid computing idx when from == to (Damien).
use ring == to instead of ring->id == to->id (Damien).
use continue instead of return (Rodrigo).
v4: avoid all unecessary computation (Damien).
reduce idx to loop scope (Damien).
Cc: Damien Lespiau <damien.lespiau@intel.com>
Cc: Ben Widawsky <benjamin.widawsky@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Damien Lespiau <damien.lespiau@intel.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-07-18 17:19:40 +08:00
|
|
|
struct intel_engine_cs *to;
|
2016-03-24 02:19:53 +08:00
|
|
|
enum intel_engine_id id;
|
2014-07-01 00:53:41 +08:00
|
|
|
|
2016-08-15 17:49:02 +08:00
|
|
|
if (!error->semaphore)
|
2016-07-27 16:07:28 +08:00
|
|
|
return;
|
2014-07-01 00:53:41 +08:00
|
|
|
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of addition of many more number of rings in future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
i915.o file (but for i915.ko file text size remain same as 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-14 01:14:48 +08:00
|
|
|
for_each_engine(to, dev_priv, id) {
|
drm/i915: Fix possible overflow when recording semaphore states.
semaphore _sync_seqno, _seqno and _mbox are smaller than number of rings.
This optimization is to remove the ring itself from the list and the logic to do that
is at intel_ring_sync_index as below:
/*
* rcs -> 0 = vcs, 1 = bcs, 2 = vecs, 3 = vcs2;
* vcs -> 0 = bcs, 1 = vecs, 2 = vcs2, 3 = rcs;
* bcs -> 0 = vecs, 1 = vcs2. 2 = rcs, 3 = vcs;
* vecs -> 0 = vcs2, 1 = rcs, 2 = vcs, 3 = bcs;
* vcs2 -> 0 = rcs, 1 = vcs, 2 = bcs, 3 = vecs;
*/
v2: Skip when from == to (Damien).
v3: avoid computing idx when from == to (Damien).
use ring == to instead of ring->id == to->id (Damien).
use continue instead of return (Rodrigo).
v4: avoid all unecessary computation (Damien).
reduce idx to loop scope (Damien).
Cc: Damien Lespiau <damien.lespiau@intel.com>
Cc: Ben Widawsky <benjamin.widawsky@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Damien Lespiau <damien.lespiau@intel.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-07-18 17:19:40 +08:00
|
|
|
int idx;
|
|
|
|
u16 signal_offset;
|
|
|
|
u32 *tmp;
|
2014-07-01 00:53:41 +08:00
|
|
|
|
2016-03-16 19:00:37 +08:00
|
|
|
if (engine == to)
|
drm/i915: Fix possible overflow when recording semaphore states.
semaphore _sync_seqno, _seqno and _mbox are smaller than number of rings.
This optimization is to remove the ring itself from the list and the logic to do that
is at intel_ring_sync_index as below:
/*
* rcs -> 0 = vcs, 1 = bcs, 2 = vecs, 3 = vcs2;
* vcs -> 0 = bcs, 1 = vecs, 2 = vcs2, 3 = rcs;
* bcs -> 0 = vecs, 1 = vcs2. 2 = rcs, 3 = vcs;
* vecs -> 0 = vcs2, 1 = rcs, 2 = vcs, 3 = bcs;
* vcs2 -> 0 = rcs, 1 = vcs, 2 = bcs, 3 = vecs;
*/
v2: Skip when from == to (Damien).
v3: avoid computing idx when from == to (Damien).
use ring == to instead of ring->id == to->id (Damien).
use continue instead of return (Rodrigo).
v4: avoid all unecessary computation (Damien).
reduce idx to loop scope (Damien).
Cc: Damien Lespiau <damien.lespiau@intel.com>
Cc: Ben Widawsky <benjamin.widawsky@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Damien Lespiau <damien.lespiau@intel.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-07-18 17:19:40 +08:00
|
|
|
continue;
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
signal_offset =
|
|
|
|
(GEN8_SIGNAL_OFFSET(engine, id) & (PAGE_SIZE - 1)) / 4;
|
2016-08-15 17:49:02 +08:00
|
|
|
tmp = error->semaphore->pages[0];
|
2016-10-28 20:58:53 +08:00
|
|
|
idx = gen8_engine_sync_index(engine, to);
|
drm/i915: Fix possible overflow when recording semaphore states.
semaphore _sync_seqno, _seqno and _mbox are smaller than number of rings.
This optimization is to remove the ring itself from the list and the logic to do that
is at intel_ring_sync_index as below:
/*
* rcs -> 0 = vcs, 1 = bcs, 2 = vecs, 3 = vcs2;
* vcs -> 0 = bcs, 1 = vecs, 2 = vcs2, 3 = rcs;
* bcs -> 0 = vecs, 1 = vcs2. 2 = rcs, 3 = vcs;
* vecs -> 0 = vcs2, 1 = rcs, 2 = vcs, 3 = bcs;
* vcs2 -> 0 = rcs, 1 = vcs, 2 = bcs, 3 = vecs;
*/
v2: Skip when from == to (Damien).
v3: avoid computing idx when from == to (Damien).
use ring == to instead of ring->id == to->id (Damien).
use continue instead of return (Rodrigo).
v4: avoid all unecessary computation (Damien).
reduce idx to loop scope (Damien).
Cc: Damien Lespiau <damien.lespiau@intel.com>
Cc: Ben Widawsky <benjamin.widawsky@intel.com>
Signed-off-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Reviewed-by: Damien Lespiau <damien.lespiau@intel.com>
Reviewed-by: Ben Widawsky <ben@bwidawsk.net>
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
2014-07-18 17:19:40 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->semaphore_mboxes[idx] = tmp[signal_offset];
|
2014-07-01 00:53:41 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
static void gen6_record_semaphore_state(struct intel_engine_cs *engine,
|
|
|
|
struct drm_i915_error_engine *ee)
|
2014-07-01 00:53:40 +08:00
|
|
|
{
|
2016-07-27 16:07:28 +08:00
|
|
|
struct drm_i915_private *dev_priv = engine->i915;
|
|
|
|
|
|
|
|
ee->semaphore_mboxes[0] = I915_READ(RING_SYNC_0(engine->mmio_base));
|
|
|
|
ee->semaphore_mboxes[1] = I915_READ(RING_SYNC_1(engine->mmio_base));
|
2016-10-28 20:58:53 +08:00
|
|
|
if (HAS_VEBOX(dev_priv))
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->semaphore_mboxes[2] =
|
2016-03-16 19:00:37 +08:00
|
|
|
I915_READ(RING_SYNC_2(engine->mmio_base));
|
2014-07-01 00:53:40 +08:00
|
|
|
}
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
static void error_record_engine_waiters(struct intel_engine_cs *engine,
|
|
|
|
struct drm_i915_error_engine *ee)
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoid long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
{
|
|
|
|
struct intel_breadcrumbs *b = &engine->breadcrumbs;
|
|
|
|
struct drm_i915_error_waiter *waiter;
|
|
|
|
struct rb_node *rb;
|
|
|
|
int count;
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->num_waiters = 0;
|
|
|
|
ee->waiters = NULL;
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoid long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
|
2016-09-06 15:38:44 +08:00
|
|
|
if (RB_EMPTY_ROOT(&b->waiters))
|
|
|
|
return;
|
|
|
|
|
2017-03-04 03:08:24 +08:00
|
|
|
if (!spin_trylock_irq(&b->rb_lock)) {
|
2016-09-06 15:38:44 +08:00
|
|
|
ee->waiters = ERR_PTR(-EDEADLK);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoid long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
count = 0;
|
|
|
|
for (rb = rb_first(&b->waiters); rb != NULL; rb = rb_next(rb))
|
|
|
|
count++;
|
2017-03-04 03:08:24 +08:00
|
|
|
spin_unlock_irq(&b->rb_lock);
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoid long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the wait is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
|
|
|
|
waiter = NULL;
|
|
|
|
if (count)
|
|
|
|
waiter = kmalloc_array(count,
|
|
|
|
sizeof(struct drm_i915_error_waiter),
|
|
|
|
GFP_ATOMIC);
|
|
|
|
if (!waiter)
|
|
|
|
return;
|
|
|
|
|
2017-03-04 03:08:24 +08:00
|
|
|
if (!spin_trylock_irq(&b->rb_lock)) {
|
2016-09-06 15:38:44 +08:00
|
|
|
kfree(waiter);
|
|
|
|
ee->waiters = ERR_PTR(-EDEADLK);
|
|
|
|
return;
|
|
|
|
}
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the wait is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
|
2016-09-06 15:38:44 +08:00
|
|
|
ee->waiters = waiter;
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the wait is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
|
2017-01-20 22:36:55 +08:00
|
|
|
struct intel_wait *w = rb_entry(rb, typeof(*w), node);
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the wait is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
|
|
|
|
strcpy(waiter->comm, w->tsk->comm);
|
|
|
|
waiter->pid = w->tsk->pid;
|
|
|
|
waiter->seqno = w->seqno;
|
|
|
|
waiter++;
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
if (++ee->num_waiters == count)
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the wait is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
break;
|
|
|
|
}
|
2017-03-04 03:08:24 +08:00
|
|
|
spin_unlock_irq(&b->rb_lock);
|
drm/i915: Slaughter the thundering i915_wait_request herd
One particularly stressful scenario consists of many independent tasks
all competing for GPU time and waiting upon the results (e.g. realtime
transcoding of many, many streams). One bottleneck in particular is that
each client waits on its own results, but every client is woken up after
every batchbuffer - hence the thunder of hooves as then every client must
do its heavyweight dance to read a coherent seqno to see if it is the
lucky one.
Ideally, we only want one client to wake up after the interrupt and
check its request for completion. Since the requests must retire in
order, we can select the first client on the oldest request to be woken.
Once that client has completed his wait, we can then wake up the
next client and so on. However, all clients then incur latency as every
process in the chain may be delayed for scheduling - this may also then
cause some priority inversion. To reduce the latency, when a client
is added or removed from the list, we scan the tree for completed
seqno and wake up all the completed waiters in parallel.
Using igt/benchmarks/gem_latency, we can demonstrate this effect. The
benchmark measures the number of GPU cycles between completion of a
batch and the client waking up from a call to wait-ioctl. With many
concurrent waiters, with each on a different request, we observe that
the wakeup latency before the patch scales nearly linearly with the
number of waiters (before external factors kick in making the scaling much
worse). After applying the patch, we can see that only the single waiter
for the request is being woken up, providing a constant wakeup latency
for every operation. However, the situation is not quite as rosy for
many waiters on the same request, though to the best of my knowledge this
is much less likely in practice. Here, we can observe that the
concurrent waiters incur extra latency from being woken up by the
solitary bottom-half, rather than directly by the interrupt. This
appears to be scheduler induced (having discounted adverse effects from
having a rbtree walk/erase in the wakeup path), each additional
wake_up_process() costs approximately 1us on big core. Another effect of
performing the secondary wakeups from the first bottom-half is the
incurred delay this imposes on high priority threads - rather than
immediately returning to userspace and leaving the interrupt handler to
wake the others.
To offset the delay incurred with additional waiters on a request, we
could use a hybrid scheme that did a quick read in the interrupt handler
and dequeued all the completed waiters (incurring the overhead in the
interrupt handler, not the best plan either as we then incur GPU
submission latency) but we would still have to wake up the bottom-half
every time to do the heavyweight slow read. Or we could only kick the
waiters on the seqno with the same priority as the current task (i.e. in
the realtime waiter scenario, only it is woken up immediately by the
interrupt and simply queues the next waiter before returning to userspace,
minimising its delay at the expense of the chain, and also reducing
contention on its scheduler runqueue). This is effective at avoiding long
pauses in the interrupt handler and at avoiding the extra latency in
realtime/high-priority waiters.
v2: Convert from a kworker per engine into a dedicated kthread for the
bottom-half.
v3: Rename request members and tweak comments.
v4: Use a per-engine spinlock in the breadcrumbs bottom-half.
v5: Fix race in locklessly checking waiter status and kicking the task on
adding a new waiter.
v6: Fix deciding when to force the timer to hide missing interrupts.
v7: Move the bottom-half from the kthread to the first client process.
v8: Reword a few comments
v9: Break the busy loop when the interrupt is unmasked or has fired.
v10: Comments, unnecessary churn, better debugging from Tvrtko
v11: Wake all completed waiters on removing the current bottom-half to
reduce the latency of waking up a herd of clients all waiting on the
same request.
v12: Rearrange missed-interrupt fault injection so that it works with
igt/drv_missed_irq_hang
v13: Rename intel_breadcrumb and friends to intel_wait in preparation
for signal handling.
v14: RCU commentary, assert_spin_locked
v15: Hide BUG_ON behind the compiler; report on gem_latency findings.
v16: Sort seqno-groups by priority so that first-waiter has the highest
task priority (and so avoid priority inversion).
v17: Add waiters to post-mortem GPU hang state.
v18: Return early for a completed wait after acquiring the spinlock.
Avoids adding ourselves to the tree if the wait is already complete, and
skips the awkward question of why we don't do completion wakeups for
waits earlier than or equal to ourselves.
v19: Prepare for init_breadcrumbs to fail. Later patches may want to
allocate during init, so be prepared to propagate back the error code.
Testcase: igt/gem_concurrent_blit
Testcase: igt/benchmarks/gem_latency
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: "Rogozhkin, Dmitry V" <dmitry.v.rogozhkin@intel.com>
Cc: "Gong, Zhipeng" <zhipeng.gong@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com>
Cc: Dave Gordon <david.s.gordon@intel.com>
Cc: "Goel, Akash" <akash.goel@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@linux.intel.com> #v18
Link: http://patchwork.freedesktop.org/patch/msgid/1467390209-3576-6-git-send-email-chris@chris-wilson.co.uk
2016-07-02 00:23:15 +08:00
|
|
|
}
|
|
|
|
|
2017-02-15 00:46:11 +08:00
|
|
|
static void error_record_engine_registers(struct i915_gpu_state *error,
|
2016-07-27 16:07:28 +08:00
|
|
|
struct intel_engine_cs *engine,
|
|
|
|
struct drm_i915_error_engine *ee)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2016-07-27 16:07:28 +08:00
|
|
|
struct drm_i915_private *dev_priv = engine->i915;
|
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
if (INTEL_GEN(dev_priv) >= 6) {
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->rc_psmi = I915_READ(RING_PSMI_CTL(engine->mmio_base));
|
|
|
|
ee->fault_reg = I915_READ(RING_FAULT_REG(engine));
|
2016-05-06 22:40:21 +08:00
|
|
|
if (INTEL_GEN(dev_priv) >= 8)
|
2016-07-27 16:07:28 +08:00
|
|
|
gen8_record_semaphore_state(error, engine, ee);
|
2014-07-01 00:53:41 +08:00
|
|
|
else
|
2016-07-27 16:07:28 +08:00
|
|
|
gen6_record_semaphore_state(engine, ee);
|
2013-08-13 07:53:04 +08:00
|
|
|
}
|
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
if (INTEL_GEN(dev_priv) >= 4) {
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->faddr = I915_READ(RING_DMA_FADD(engine->mmio_base));
|
|
|
|
ee->ipeir = I915_READ(RING_IPEIR(engine->mmio_base));
|
|
|
|
ee->ipehr = I915_READ(RING_IPEHR(engine->mmio_base));
|
|
|
|
ee->instps = I915_READ(RING_INSTPS(engine->mmio_base));
|
|
|
|
ee->bbaddr = I915_READ(RING_BBADDR(engine->mmio_base));
|
2016-05-06 22:40:21 +08:00
|
|
|
if (INTEL_GEN(dev_priv) >= 8) {
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->faddr |= (u64) I915_READ(RING_DMA_FADD_UDW(engine->mmio_base)) << 32;
|
|
|
|
ee->bbaddr |= (u64) I915_READ(RING_BBADDR_UDW(engine->mmio_base)) << 32;
|
2014-04-02 07:31:07 +08:00
|
|
|
}
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->bbstate = I915_READ(RING_BBSTATE(engine->mmio_base));
|
2013-07-12 21:50:57 +08:00
|
|
|
} else {
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->faddr = I915_READ(DMA_FADD_I8XX);
|
|
|
|
ee->ipeir = I915_READ(IPEIR);
|
|
|
|
ee->ipehr = I915_READ(IPEHR);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2016-10-12 17:05:17 +08:00
|
|
|
intel_engine_get_instdone(engine, &ee->instdone);
|
2016-09-20 21:54:32 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->waiting = intel_engine_has_waiter(engine);
|
|
|
|
ee->instpm = I915_READ(RING_INSTPM(engine->mmio_base));
|
2016-08-03 05:50:21 +08:00
|
|
|
ee->acthd = intel_engine_get_active_head(engine);
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->seqno = intel_engine_get_seqno(engine);
|
2016-11-01 18:03:16 +08:00
|
|
|
ee->last_seqno = intel_engine_last_submit(engine);
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->start = I915_READ_START(engine);
|
|
|
|
ee->head = I915_READ_HEAD(engine);
|
|
|
|
ee->tail = I915_READ_TAIL(engine);
|
|
|
|
ee->ctl = I915_READ_CTL(engine);
|
2016-08-15 17:49:11 +08:00
|
|
|
if (INTEL_GEN(dev_priv) > 2)
|
|
|
|
ee->mode = I915_READ_MODE(engine);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-08-18 03:30:56 +08:00
|
|
|
if (!HWS_NEEDS_PHYSICAL(dev_priv)) {
|
drm/i915: Type safe register read/write
Make I915_READ and I915_WRITE more type safe by wrapping the register
offset in a struct. This should eliminate most of the fumbles we've had
with misplaced parens.
This only takes care of normal mmio registers. We could extend the idea
to other register types and define each with its own struct. That way
you wouldn't be able to accidentally pass the wrong thing to a specific
register access function.
The gpio_reg setup is probably the ugliest thing left. But I figure I'd
just leave it for now, and wait for some divine inspiration to strike
before making it nice.
As for the generated code, it's actually a bit better sometimes. Eg.
looking at i915_irq_handler(), we can see the following change:
lea 0x70024(%rdx,%rax,1),%r9d
mov $0x1,%edx
- movslq %r9d,%r9
- mov %r9,%rsi
- mov %r9,-0x58(%rbp)
- callq *0xd8(%rbx)
+ mov %r9d,%esi
+ mov %r9d,-0x48(%rbp)
callq *0xd8(%rbx)
So previously gcc thought the register offset might be signed and
decided to sign extend it, just in case. The rest appears to be
mostly just minor shuffling of instructions.
v2: i915_mmio_reg_{offset,equal,valid}() helpers added
s/_REG/_MMIO/ in the register defines
mo more switch statements left to worry about
ring_emit stuff got sorted in a prep patch
cmd parser, lrc context and w/a batch buildup also in prep patch
vgpu stuff cleaned up and moved to a prep patch
all other unrelated changes split out
v3: Rebased due to BXT DSI/BLC, MOCS, etc.
v4: Rebased due to churn, s/i915_mmio_reg_t/i915_reg_t/
Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk>
Link: http://patchwork.freedesktop.org/patch/msgid/1447853606-2751-1-git-send-email-ville.syrjala@linux.intel.com
2015-11-18 21:33:26 +08:00
|
|
|
i915_reg_t mmio;
|
2014-01-24 06:40:36 +08:00
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
if (IS_GEN7(dev_priv)) {
|
2016-03-16 19:00:37 +08:00
|
|
|
switch (engine->id) {
|
2014-01-24 06:40:36 +08:00
|
|
|
default:
|
|
|
|
case RCS:
|
|
|
|
mmio = RENDER_HWS_PGA_GEN7;
|
|
|
|
break;
|
|
|
|
case BCS:
|
|
|
|
mmio = BLT_HWS_PGA_GEN7;
|
|
|
|
break;
|
|
|
|
case VCS:
|
|
|
|
mmio = BSD_HWS_PGA_GEN7;
|
|
|
|
break;
|
|
|
|
case VECS:
|
|
|
|
mmio = VEBOX_HWS_PGA_GEN7;
|
|
|
|
break;
|
|
|
|
}
|
2016-05-06 22:40:21 +08:00
|
|
|
} else if (IS_GEN6(engine->i915)) {
|
2016-03-16 19:00:37 +08:00
|
|
|
mmio = RING_HWS_PGA_GEN6(engine->mmio_base);
|
2014-01-24 06:40:36 +08:00
|
|
|
} else {
|
|
|
|
/* XXX: gen8 returns to sanity */
|
2016-03-16 19:00:37 +08:00
|
|
|
mmio = RING_HWS_PGA(engine->mmio_base);
|
2014-01-24 06:40:36 +08:00
|
|
|
}
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->hws = I915_READ(mmio);
|
2014-01-24 06:40:36 +08:00
|
|
|
}
|
|
|
|
|
2016-11-18 21:09:04 +08:00
|
|
|
ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->hangcheck_action = engine->hangcheck.action;
|
2016-11-18 21:09:04 +08:00
|
|
|
ee->hangcheck_stalled = engine->hangcheck.stalled;
|
2017-06-20 17:57:48 +08:00
|
|
|
ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
|
|
|
|
engine);
|
2014-01-30 16:19:40 +08:00
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
if (USES_PPGTT(dev_priv)) {
|
2014-01-30 16:19:40 +08:00
|
|
|
int i;
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->vm_info.gfx_mode = I915_READ(RING_MODE_GEN7(engine));
|
2014-01-30 16:19:40 +08:00
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
if (IS_GEN6(dev_priv))
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->vm_info.pp_dir_base =
|
2016-03-16 19:00:37 +08:00
|
|
|
I915_READ(RING_PP_DIR_BASE_READ(engine));
|
2016-05-06 22:40:21 +08:00
|
|
|
else if (IS_GEN7(dev_priv))
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->vm_info.pp_dir_base =
|
2016-03-16 19:00:37 +08:00
|
|
|
I915_READ(RING_PP_DIR_BASE(engine));
|
2016-05-06 22:40:21 +08:00
|
|
|
else if (INTEL_GEN(dev_priv) >= 8)
|
2014-01-30 16:19:40 +08:00
|
|
|
for (i = 0; i < 4; i++) {
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->vm_info.pdp[i] =
|
2016-03-16 19:00:37 +08:00
|
|
|
I915_READ(GEN8_RING_PDP_UDW(engine, i));
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->vm_info.pdp[i] <<= 32;
|
|
|
|
ee->vm_info.pdp[i] |=
|
2016-03-16 19:00:37 +08:00
|
|
|
I915_READ(GEN8_RING_PDP_LDW(engine, i));
|
2014-01-30 16:19:40 +08:00
|
|
|
}
|
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2016-10-13 18:18:14 +08:00
|
|
|
static void record_request(struct drm_i915_gem_request *request,
|
|
|
|
struct drm_i915_error_request *erq)
|
|
|
|
{
|
|
|
|
erq->context = request->ctx->hw_id;
|
2017-07-21 20:32:30 +08:00
|
|
|
erq->ban_score = atomic_read(&request->ctx->ban_score);
|
2016-10-28 20:58:49 +08:00
|
|
|
erq->seqno = request->global_seqno;
|
2016-10-13 18:18:14 +08:00
|
|
|
erq->jiffies = request->emitted_jiffies;
|
|
|
|
erq->head = request->head;
|
|
|
|
erq->tail = request->tail;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
erq->pid = request->ctx->pid ? pid_nr(request->ctx->pid) : 0;
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
2016-08-15 17:49:10 +08:00
|
|
|
static void engine_record_requests(struct intel_engine_cs *engine,
|
|
|
|
struct drm_i915_gem_request *first,
|
|
|
|
struct drm_i915_error_engine *ee)
|
|
|
|
{
|
|
|
|
struct drm_i915_gem_request *request;
|
|
|
|
int count;
|
|
|
|
|
|
|
|
count = 0;
|
|
|
|
request = first;
|
2016-10-28 20:58:46 +08:00
|
|
|
list_for_each_entry_from(request, &engine->timeline->requests, link)
|
2016-08-15 17:49:10 +08:00
|
|
|
count++;
|
|
|
|
if (!count)
|
|
|
|
return;
|
|
|
|
|
|
|
|
ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
|
|
|
|
if (!ee->requests)
|
|
|
|
return;
|
|
|
|
|
|
|
|
ee->num_requests = count;
|
|
|
|
|
|
|
|
count = 0;
|
|
|
|
request = first;
|
2016-10-28 20:58:46 +08:00
|
|
|
list_for_each_entry_from(request, &engine->timeline->requests, link) {
|
2016-08-15 17:49:10 +08:00
|
|
|
if (count >= ee->num_requests) {
|
|
|
|
/*
|
|
|
|
* If the ring request list was changed in
|
|
|
|
* between the point where the error request
|
|
|
|
* list was created and dimensioned and this
|
|
|
|
* point then just exit early to avoid crashes.
|
|
|
|
*
|
|
|
|
* We don't need to communicate that the
|
|
|
|
* request list changed state during error
|
|
|
|
* state capture and that the error state is
|
|
|
|
* slightly incorrect as a consequence since we
|
|
|
|
* are typically only interested in the request
|
|
|
|
* list state at the point of error state
|
|
|
|
* capture, not in any changes happening during
|
|
|
|
* the capture.
|
|
|
|
*/
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2016-10-13 18:18:14 +08:00
|
|
|
record_request(request, &ee->requests[count++]);
|
2016-08-15 17:49:10 +08:00
|
|
|
}
|
|
|
|
ee->num_requests = count;
|
|
|
|
}
|
|
|
|
|
2016-10-13 18:18:14 +08:00
|
|
|
static void error_record_engine_execlists(struct intel_engine_cs *engine,
|
|
|
|
struct drm_i915_error_engine *ee)
|
|
|
|
{
|
2017-09-22 20:43:07 +08:00
|
|
|
const struct intel_engine_execlists * const execlists = &engine->execlists;
|
2016-10-13 18:18:14 +08:00
|
|
|
unsigned int n;
|
|
|
|
|
2017-09-22 20:43:07 +08:00
|
|
|
for (n = 0; n < execlists_num_ports(execlists); n++) {
|
|
|
|
struct drm_i915_gem_request *rq = port_request(&execlists->port[n]);
|
2017-05-17 20:10:00 +08:00
|
|
|
|
|
|
|
if (!rq)
|
|
|
|
break;
|
|
|
|
|
|
|
|
record_request(rq, &ee->execlist[n]);
|
|
|
|
}
|
2017-09-22 20:43:07 +08:00
|
|
|
|
|
|
|
ee->num_ports = n;
|
2016-10-13 18:18:14 +08:00
|
|
|
}
|
|
|
|
|
2017-01-29 17:24:33 +08:00
|
|
|
static void record_context(struct drm_i915_error_context *e,
|
|
|
|
struct i915_gem_context *ctx)
|
|
|
|
{
|
|
|
|
if (ctx->pid) {
|
|
|
|
struct task_struct *task;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
task = pid_task(ctx->pid, PIDTYPE_PID);
|
|
|
|
if (task) {
|
|
|
|
strcpy(e->comm, task->comm);
|
|
|
|
e->pid = task->pid;
|
|
|
|
}
|
|
|
|
rcu_read_unlock();
|
|
|
|
}
|
|
|
|
|
|
|
|
e->handle = ctx->user_handle;
|
|
|
|
e->hw_id = ctx->hw_id;
|
2017-07-21 20:32:30 +08:00
|
|
|
e->ban_score = atomic_read(&ctx->ban_score);
|
|
|
|
e->guilty = atomic_read(&ctx->guilty_count);
|
|
|
|
e->active = atomic_read(&ctx->active_count);
|
2017-01-29 17:24:33 +08:00
|
|
|
}
|
|
|
|
|
2017-04-15 17:39:02 +08:00
|
|
|
static void request_record_user_bo(struct drm_i915_gem_request *request,
|
|
|
|
struct drm_i915_error_engine *ee)
|
|
|
|
{
|
|
|
|
struct i915_gem_capture_list *c;
|
|
|
|
struct drm_i915_error_object **bo;
|
|
|
|
long count;
|
|
|
|
|
|
|
|
count = 0;
|
|
|
|
for (c = request->capture_list; c; c = c->next)
|
|
|
|
count++;
|
|
|
|
|
|
|
|
bo = NULL;
|
|
|
|
if (count)
|
|
|
|
bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
|
|
|
|
if (!bo)
|
|
|
|
return;
|
|
|
|
|
|
|
|
count = 0;
|
|
|
|
for (c = request->capture_list; c; c = c->next) {
|
|
|
|
bo[count] = i915_error_object_create(request->i915, c->vma);
|
|
|
|
if (!bo[count])
|
|
|
|
break;
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
|
|
|
|
ee->user_bo = bo;
|
|
|
|
ee->user_bo_count = count;
|
|
|
|
}
|
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
static void i915_gem_record_rings(struct drm_i915_private *dev_priv,
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *error)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2016-03-30 21:57:10 +08:00
|
|
|
struct i915_ggtt *ggtt = &dev_priv->ggtt;
|
2016-08-15 17:49:10 +08:00
|
|
|
int i;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-08-15 17:49:02 +08:00
|
|
|
error->semaphore =
|
2016-08-15 17:49:06 +08:00
|
|
|
i915_error_object_create(dev_priv, dev_priv->semaphore);
|
2016-07-27 16:07:28 +08:00
|
|
|
|
2016-03-16 19:00:39 +08:00
|
|
|
for (i = 0; i < I915_NUM_ENGINES; i++) {
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of addition of many more number of rings in future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
i915.o file (but for i915.ko file text size remain same as 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-14 01:14:48 +08:00
|
|
|
struct intel_engine_cs *engine = dev_priv->engine[i];
|
2016-07-27 16:07:28 +08:00
|
|
|
struct drm_i915_error_engine *ee = &error->engine[i];
|
2016-08-15 17:49:10 +08:00
|
|
|
struct drm_i915_gem_request *request;
|
2014-01-27 21:52:34 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->engine_id = -1;
|
2014-06-10 19:09:29 +08:00
|
|
|
|
drm/i915: Allocate intel_engine_cs structure only for the enabled engines
With the possibility of addition of many more number of rings in future,
the drm_i915_private structure could bloat as an array, of type
intel_engine_cs, is embedded inside it.
struct intel_engine_cs engine[I915_NUM_ENGINES];
Though this is still fine as generally there is only a single instance of
drm_i915_private structure used, but not all of the possible rings would be
enabled or active on most of the platforms. Some memory can be saved by
allocating intel_engine_cs structure only for the enabled/active engines.
Currently the engine/ring ID is kept static and dev_priv->engine[] is simply
indexed using the enums defined in intel_engine_id.
To save memory and continue using the static engine/ring IDs, 'engine' is
defined as an array of pointers.
struct intel_engine_cs *engine[I915_NUM_ENGINES];
dev_priv->engine[engine_ID] will be NULL for disabled engine instances.
There is a text size reduction of 928 bytes, from 1028200 to 1027272, for
i915.o file (but for i915.ko file text size remain same as 1193131 bytes).
v2:
- Remove the engine iterator field added in drm_i915_private structure,
instead pass a local iterator variable to the for_each_engine**
macros. (Chris)
- Do away with intel_engine_initialized() and instead directly use the
NULL pointer check on engine pointer. (Chris)
v3:
- Remove for_each_engine_id() macro, as the updated macro for_each_engine()
can be used in place of it. (Chris)
- Protect the access to Render engine Fault register with a NULL check, as
engine specific init is done later in Driver load sequence.
v4:
- Use !!dev_priv->engine[VCS] style for the engine check in getparam. (Chris)
- Kill the superfluous init_engine_lists().
v5:
- Cleanup the intel_engines_init() & intel_engines_setup(), with respect to
allocation of intel_engine_cs structure. (Chris)
v6:
- Rebase.
v7:
- Optimize the for_each_engine_masked() macro. (Chris)
- Change the type of 'iter' local variable to enum intel_engine_id. (Chris)
- Rebase.
v8: Rebase.
v9: Rebase.
v10:
- For index calculation use engine ID instead of pointer based arithmetic in
intel_engine_sync_index() as engine pointers are not contiguous now (Chris)
- For appropriateness, rename local enum variable 'iter' to 'id'. (Joonas)
- Use for_each_engine macro for cleanup in intel_engines_init() and remove
check for NULL engine pointer in cleanup() routines. (Joonas)
v11: Rebase.
Cc: Chris Wilson <chris@chris-wilson.co.uk>
Signed-off-by: Akash Goel <akash.goel@intel.com>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: http://patchwork.freedesktop.org/patch/msgid/1476378888-7372-1-git-send-email-akash.goel@intel.com
2016-10-14 01:14:48 +08:00
|
|
|
if (!engine)
|
2014-01-27 21:52:34 +08:00
|
|
|
continue;
|
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->engine_id = i;
|
2014-01-27 21:52:34 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
error_record_engine_registers(error, engine, ee);
|
|
|
|
error_record_engine_waiters(engine, ee);
|
2016-10-13 18:18:14 +08:00
|
|
|
error_record_engine_execlists(engine, ee);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-03-16 19:00:36 +08:00
|
|
|
request = i915_gem_find_active_request(engine);
|
2014-02-25 23:11:24 +08:00
|
|
|
if (request) {
|
2016-08-03 05:50:21 +08:00
|
|
|
struct intel_ring *ring;
|
2014-08-06 21:04:53 +08:00
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
ee->vm = request->ctx->ppgtt ?
|
2016-07-04 15:08:39 +08:00
|
|
|
&request->ctx->ppgtt->base : &ggtt->base;
|
2014-08-06 21:04:53 +08:00
|
|
|
|
2017-01-29 17:24:33 +08:00
|
|
|
record_context(&ee->context, request->ctx);
|
|
|
|
|
2014-02-25 23:11:24 +08:00
|
|
|
/* We need to copy these to an anonymous buffer
|
|
|
|
* as the simplest method to avoid being overwritten
|
|
|
|
* by userspace.
|
|
|
|
*/
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->batchbuffer =
|
2014-02-25 23:11:24 +08:00
|
|
|
i915_error_object_create(dev_priv,
|
2016-08-15 17:49:06 +08:00
|
|
|
request->batch);
|
2014-02-25 23:11:24 +08:00
|
|
|
|
2016-04-07 16:08:05 +08:00
|
|
|
if (HAS_BROKEN_CS_TLB(dev_priv))
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->wa_batchbuffer =
|
2016-08-15 17:49:06 +08:00
|
|
|
i915_error_object_create(dev_priv,
|
|
|
|
engine->scratch);
|
2017-04-15 17:39:02 +08:00
|
|
|
request_record_user_bo(request, ee);
|
2014-02-25 23:11:24 +08:00
|
|
|
|
2016-08-15 17:49:06 +08:00
|
|
|
ee->ctx =
|
|
|
|
i915_error_object_create(dev_priv,
|
|
|
|
request->ctx->engine[i].state);
|
2016-08-15 17:48:42 +08:00
|
|
|
|
2016-07-04 15:08:39 +08:00
|
|
|
error->simulated |=
|
2016-12-31 19:20:11 +08:00
|
|
|
i915_gem_context_no_error_capture(request->ctx);
|
2016-07-04 15:08:39 +08:00
|
|
|
|
2016-10-05 04:11:30 +08:00
|
|
|
ee->rq_head = request->head;
|
|
|
|
ee->rq_post = request->postfix;
|
|
|
|
ee->rq_tail = request->tail;
|
|
|
|
|
2016-08-03 05:50:19 +08:00
|
|
|
ring = request->ring;
|
|
|
|
ee->cpu_ring_head = ring->head;
|
|
|
|
ee->cpu_ring_tail = ring->tail;
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->ringbuffer =
|
2016-08-15 17:49:06 +08:00
|
|
|
i915_error_object_create(dev_priv, ring->vma);
|
2016-08-15 17:49:10 +08:00
|
|
|
|
|
|
|
engine_record_requests(engine, request, ee);
|
2016-07-04 15:08:38 +08:00
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ee->hws_page =
|
2016-08-15 17:49:06 +08:00
|
|
|
i915_error_object_create(dev_priv,
|
|
|
|
engine->status_page.vma);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-08-15 17:49:06 +08:00
|
|
|
ee->wa_ctx =
|
|
|
|
i915_error_object_create(dev_priv, engine->wa_ctx.vma);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-08-01 08:00:15 +08:00
|
|
|
static void i915_gem_capture_vm(struct drm_i915_private *dev_priv,
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *error,
|
2013-08-01 08:00:15 +08:00
|
|
|
struct i915_address_space *vm,
|
2016-08-15 17:48:41 +08:00
|
|
|
int idx)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2016-08-15 17:48:41 +08:00
|
|
|
struct drm_i915_error_buffer *active_bo;
|
2013-08-01 08:00:15 +08:00
|
|
|
struct i915_vma *vma;
|
2016-08-15 17:48:41 +08:00
|
|
|
int count;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
count = 0;
|
2016-02-26 19:03:19 +08:00
|
|
|
list_for_each_entry(vma, &vm->active_list, vm_link)
|
2016-08-15 17:48:41 +08:00
|
|
|
count++;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
active_bo = NULL;
|
|
|
|
if (count)
|
|
|
|
active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
|
2013-08-01 08:00:15 +08:00
|
|
|
if (active_bo)
|
2016-08-15 17:48:41 +08:00
|
|
|
count = capture_error_bo(active_bo, count, &vm->active_list, false);
|
|
|
|
else
|
|
|
|
count = 0;
|
|
|
|
|
|
|
|
error->active_vm[idx] = vm;
|
|
|
|
error->active_bo[idx] = active_bo;
|
|
|
|
error->active_bo_count[idx] = count;
|
2013-08-01 08:00:15 +08:00
|
|
|
}
|
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
static void i915_capture_active_buffers(struct drm_i915_private *dev_priv,
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *error)
|
2013-08-01 08:00:15 +08:00
|
|
|
{
|
2016-08-15 17:48:41 +08:00
|
|
|
int cnt = 0, i, j;
|
|
|
|
|
|
|
|
BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
|
|
|
|
BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
|
|
|
|
BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));
|
|
|
|
|
|
|
|
/* Scan each engine looking for unique active contexts/vm */
|
|
|
|
for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
|
|
|
|
struct drm_i915_error_engine *ee = &error->engine[i];
|
|
|
|
bool found;
|
|
|
|
|
|
|
|
if (!ee->vm)
|
|
|
|
continue;
|
2014-08-13 03:05:47 +08:00
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
found = false;
|
|
|
|
for (j = 0; j < i && !found; j++)
|
|
|
|
found = error->engine[j].vm == ee->vm;
|
|
|
|
if (!found)
|
|
|
|
i915_gem_capture_vm(dev_priv, error, ee->vm, cnt++);
|
2014-08-13 03:05:47 +08:00
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2016-08-15 17:48:41 +08:00
|
|
|
static void i915_capture_pinned_buffers(struct drm_i915_private *dev_priv,
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *error)
|
2016-08-15 17:48:41 +08:00
|
|
|
{
|
|
|
|
struct i915_address_space *vm = &dev_priv->ggtt.base;
|
|
|
|
struct drm_i915_error_buffer *bo;
|
|
|
|
struct i915_vma *vma;
|
|
|
|
int count_inactive, count_active;
|
|
|
|
|
|
|
|
count_inactive = 0;
|
|
|
|
list_for_each_entry(vma, &vm->active_list, vm_link)
|
|
|
|
count_inactive++;
|
|
|
|
|
|
|
|
count_active = 0;
|
|
|
|
list_for_each_entry(vma, &vm->inactive_list, vm_link)
|
|
|
|
count_active++;
|
|
|
|
|
|
|
|
bo = NULL;
|
|
|
|
if (count_inactive + count_active)
|
|
|
|
bo = kcalloc(count_inactive + count_active,
|
|
|
|
sizeof(*bo), GFP_ATOMIC);
|
|
|
|
if (!bo)
|
|
|
|
return;
|
|
|
|
|
|
|
|
count_inactive = capture_error_bo(bo, count_inactive,
|
|
|
|
&vm->active_list, true);
|
|
|
|
count_active = capture_error_bo(bo + count_inactive, count_active,
|
|
|
|
&vm->inactive_list, true);
|
|
|
|
error->pinned_bo_count = count_inactive + count_active;
|
|
|
|
error->pinned_bo = bo;
|
|
|
|
}
|
|
|
|
|
2016-10-13 00:24:39 +08:00
|
|
|
static void i915_gem_capture_guc_log_buffer(struct drm_i915_private *dev_priv,
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *error)
|
2016-10-13 00:24:39 +08:00
|
|
|
{
|
|
|
|
/* Capturing log buf contents won't be useful if logging was disabled */
|
2017-09-20 03:38:44 +08:00
|
|
|
if (!dev_priv->guc.log.vma || (i915_modparams.guc_log_level < 0))
|
2016-10-13 00:24:39 +08:00
|
|
|
return;
|
|
|
|
|
|
|
|
error->guc_log = i915_error_object_create(dev_priv,
|
|
|
|
dev_priv->guc.log.vma);
|
|
|
|
}
|
|
|
|
|
2014-01-30 16:19:35 +08:00
|
|
|
/* Capture all registers which don't fit into another category. */
|
|
|
|
static void i915_capture_reg_state(struct drm_i915_private *dev_priv,
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *error)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2014-08-06 01:07:13 +08:00
|
|
|
int i;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2014-01-30 16:19:36 +08:00
|
|
|
/* General organization
|
|
|
|
* 1. Registers specific to a single generation
|
|
|
|
* 2. Registers which belong to multiple generations
|
|
|
|
* 3. Feature specific registers.
|
|
|
|
* 4. Everything else
|
|
|
|
* Please try to follow the order.
|
|
|
|
*/
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2014-01-30 16:19:36 +08:00
|
|
|
/* 1: Registers specific to a single generation */
|
2016-10-13 18:03:08 +08:00
|
|
|
if (IS_VALLEYVIEW(dev_priv)) {
|
2014-08-06 01:07:13 +08:00
|
|
|
error->gtier[0] = I915_READ(GTIER);
|
2014-08-02 00:12:27 +08:00
|
|
|
error->ier = I915_READ(VLV_IER);
|
2015-10-22 20:34:57 +08:00
|
|
|
error->forcewake = I915_READ_FW(FORCEWAKE_VLV);
|
2014-01-30 16:19:36 +08:00
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-10-13 18:03:10 +08:00
|
|
|
if (IS_GEN7(dev_priv))
|
2014-01-30 16:19:36 +08:00
|
|
|
error->err_int = I915_READ(GEN7_ERR_INT);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-11-16 16:55:37 +08:00
|
|
|
if (INTEL_GEN(dev_priv) >= 8) {
|
2015-03-24 20:54:19 +08:00
|
|
|
error->fault_data0 = I915_READ(GEN8_FAULT_TLB_DATA0);
|
|
|
|
error->fault_data1 = I915_READ(GEN8_FAULT_TLB_DATA1);
|
|
|
|
}
|
|
|
|
|
2016-10-13 18:03:10 +08:00
|
|
|
if (IS_GEN6(dev_priv)) {
|
2015-10-22 20:34:57 +08:00
|
|
|
error->forcewake = I915_READ_FW(FORCEWAKE);
|
2014-01-30 16:19:39 +08:00
|
|
|
error->gab_ctl = I915_READ(GAB_CTL);
|
|
|
|
error->gfx_mode = I915_READ(GFX_MODE);
|
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2014-01-30 16:19:36 +08:00
|
|
|
/* 2: Registers which belong to multiple generations */
|
2016-11-16 16:55:37 +08:00
|
|
|
if (INTEL_GEN(dev_priv) >= 7)
|
2015-10-22 20:34:57 +08:00
|
|
|
error->forcewake = I915_READ_FW(FORCEWAKE_MT);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2016-11-16 16:55:37 +08:00
|
|
|
if (INTEL_GEN(dev_priv) >= 6) {
|
2014-01-30 16:19:36 +08:00
|
|
|
error->derrmr = I915_READ(DERRMR);
|
2013-07-12 21:50:57 +08:00
|
|
|
error->error = I915_READ(ERROR_GEN6);
|
|
|
|
error->done_reg = I915_READ(DONE_REG);
|
|
|
|
}
|
|
|
|
|
2017-04-28 15:53:39 +08:00
|
|
|
if (INTEL_GEN(dev_priv) >= 5)
|
2017-04-28 15:53:37 +08:00
|
|
|
error->ccid = I915_READ(CCID);
|
|
|
|
|
2014-01-30 16:19:36 +08:00
|
|
|
/* 3: Feature specific registers */
|
2016-10-13 18:03:10 +08:00
|
|
|
if (IS_GEN6(dev_priv) || IS_GEN7(dev_priv)) {
|
2014-01-30 16:19:39 +08:00
|
|
|
error->gam_ecochk = I915_READ(GAM_ECOCHK);
|
|
|
|
error->gac_eco = I915_READ(GAC_ECO_BITS);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* 4: Everything else */
|
2016-11-16 16:55:37 +08:00
|
|
|
if (INTEL_GEN(dev_priv) >= 8) {
|
2014-08-06 01:07:13 +08:00
|
|
|
error->ier = I915_READ(GEN8_DE_MISC_IER);
|
|
|
|
for (i = 0; i < 4; i++)
|
|
|
|
error->gtier[i] = I915_READ(GEN8_GT_IER(i));
|
2017-02-15 00:46:11 +08:00
|
|
|
error->ngtier = 4;
|
2016-10-13 18:02:53 +08:00
|
|
|
} else if (HAS_PCH_SPLIT(dev_priv)) {
|
2014-08-02 00:12:27 +08:00
|
|
|
error->ier = I915_READ(DEIER);
|
2014-08-06 01:07:13 +08:00
|
|
|
error->gtier[0] = I915_READ(GTIER);
|
2017-02-15 00:46:11 +08:00
|
|
|
error->ngtier = 1;
|
2016-10-13 18:03:10 +08:00
|
|
|
} else if (IS_GEN2(dev_priv)) {
|
2014-08-02 00:12:27 +08:00
|
|
|
error->ier = I915_READ16(IER);
|
2016-10-13 18:03:08 +08:00
|
|
|
} else if (!IS_VALLEYVIEW(dev_priv)) {
|
2014-08-02 00:12:27 +08:00
|
|
|
error->ier = I915_READ(IER);
|
2014-01-30 16:19:36 +08:00
|
|
|
}
|
|
|
|
error->eir = I915_READ(EIR);
|
|
|
|
error->pgtbl_er = I915_READ(PGTBL_ER);
|
2014-01-30 16:19:35 +08:00
|
|
|
}
|
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *error,
|
2016-03-19 04:07:55 +08:00
|
|
|
u32 engine_mask,
|
2014-02-25 23:11:26 +08:00
|
|
|
const char *error_msg)
|
2014-02-25 23:11:25 +08:00
|
|
|
{
|
|
|
|
u32 ecode;
|
2016-07-27 16:07:28 +08:00
|
|
|
int engine_id = -1, len;
|
2014-02-25 23:11:25 +08:00
|
|
|
|
2016-07-27 16:07:28 +08:00
|
|
|
ecode = i915_error_generate_code(dev_priv, error, &engine_id);
|
2014-02-25 23:11:25 +08:00
|
|
|
|
2014-02-25 23:11:26 +08:00
|
|
|
len = scnprintf(error->error_msg, sizeof(error->error_msg),
|
2014-11-06 19:03:46 +08:00
|
|
|
"GPU HANG: ecode %d:%d:0x%08x",
|
2016-07-27 16:07:28 +08:00
|
|
|
INTEL_GEN(dev_priv), engine_id, ecode);
|
2014-02-25 23:11:26 +08:00
|
|
|
|
2017-01-29 17:24:33 +08:00
|
|
|
if (engine_id != -1 && error->engine[engine_id].context.pid)
|
2014-02-25 23:11:26 +08:00
|
|
|
len += scnprintf(error->error_msg + len,
|
|
|
|
sizeof(error->error_msg) - len,
|
|
|
|
", in %s [%d]",
|
2017-01-29 17:24:33 +08:00
|
|
|
error->engine[engine_id].context.comm,
|
|
|
|
error->engine[engine_id].context.pid);
|
2014-02-25 23:11:26 +08:00
|
|
|
|
|
|
|
scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
|
|
|
|
", reason: %s, action: %s",
|
|
|
|
error_msg,
|
2016-03-19 04:07:55 +08:00
|
|
|
engine_mask ? "reset" : "continue");
|
2014-02-25 23:11:25 +08:00
|
|
|
}
|
|
|
|
|
2014-02-25 23:11:27 +08:00
|
|
|
static void i915_capture_gen_state(struct drm_i915_private *dev_priv,
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *error)
|
2014-02-25 23:11:27 +08:00
|
|
|
{
|
2017-03-02 23:03:56 +08:00
|
|
|
error->awake = dev_priv->gt.awake;
|
2017-03-02 23:15:44 +08:00
|
|
|
error->wakelock = atomic_read(&dev_priv->pm.wakeref_count);
|
|
|
|
error->suspended = dev_priv->pm.suspended;
|
2017-03-02 23:03:56 +08:00
|
|
|
|
2015-08-08 03:24:15 +08:00
|
|
|
error->iommu = -1;
|
|
|
|
#ifdef CONFIG_INTEL_IOMMU
|
|
|
|
error->iommu = intel_iommu_gfx_mapped;
|
|
|
|
#endif
|
2014-02-25 23:11:27 +08:00
|
|
|
error->reset_count = i915_reset_count(&dev_priv->gpu_error);
|
2014-02-25 23:11:28 +08:00
|
|
|
error->suspend_count = dev_priv->suspend_count;
|
2016-08-15 17:48:45 +08:00
|
|
|
|
|
|
|
memcpy(&error->device_info,
|
|
|
|
INTEL_INFO(dev_priv),
|
|
|
|
sizeof(error->device_info));
|
2014-02-25 23:11:27 +08:00
|
|
|
}
|
|
|
|
|
2017-02-22 00:26:19 +08:00
|
|
|
static __always_inline void dup_param(const char *type, void *x)
|
|
|
|
{
|
|
|
|
if (!__builtin_strcmp(type, "char *"))
|
|
|
|
*(void **)x = kstrdup(*(void **)x, GFP_ATOMIC);
|
|
|
|
}
|
|
|
|
|
2016-10-12 17:05:19 +08:00
|
|
|
static int capture(void *data)
|
|
|
|
{
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *error = data;
|
2016-10-12 17:05:19 +08:00
|
|
|
|
2017-02-07 05:36:07 +08:00
|
|
|
do_gettimeofday(&error->time);
|
|
|
|
error->boottime = ktime_to_timeval(ktime_get_boottime());
|
|
|
|
error->uptime =
|
|
|
|
ktime_to_timeval(ktime_sub(ktime_get(),
|
|
|
|
error->i915->gt.last_init_time));
|
|
|
|
|
2017-09-20 03:38:44 +08:00
|
|
|
error->params = i915_modparams;
|
2017-02-22 00:26:19 +08:00
|
|
|
#define DUP(T, x) dup_param(#T, &error->params.x);
|
|
|
|
I915_PARAMS_FOR_EACH(DUP);
|
|
|
|
#undef DUP
|
2017-02-07 05:36:07 +08:00
|
|
|
|
2016-10-12 17:05:19 +08:00
|
|
|
i915_capture_gen_state(error->i915, error);
|
|
|
|
i915_capture_reg_state(error->i915, error);
|
|
|
|
i915_gem_record_fences(error->i915, error);
|
|
|
|
i915_gem_record_rings(error->i915, error);
|
|
|
|
i915_capture_active_buffers(error->i915, error);
|
|
|
|
i915_capture_pinned_buffers(error->i915, error);
|
2016-10-13 00:24:39 +08:00
|
|
|
i915_gem_capture_guc_log_buffer(error->i915, error);
|
2016-10-12 17:05:19 +08:00
|
|
|
|
|
|
|
error->overlay = intel_overlay_capture_error_state(error->i915);
|
|
|
|
error->display = intel_display_capture_error_state(error->i915);
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-10-14 21:44:28 +08:00
|
|
|
/* Number of seconds in @x days (24 hours * 60 minutes * 60 seconds each). */
#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
|
|
|
|
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *
|
|
|
|
i915_capture_gpu_state(struct drm_i915_private *i915)
|
|
|
|
{
|
|
|
|
struct i915_gpu_state *error;
|
|
|
|
|
|
|
|
error = kzalloc(sizeof(*error), GFP_ATOMIC);
|
|
|
|
if (!error)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
kref_init(&error->ref);
|
|
|
|
error->i915 = i915;
|
|
|
|
|
|
|
|
stop_machine(capture, error, NULL);
|
|
|
|
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2014-01-30 16:19:35 +08:00
|
|
|
/**
|
|
|
|
* i915_capture_error_state - capture an error record for later analysis
|
|
|
|
* @dev: drm device
|
|
|
|
*
|
|
|
|
* Should be called when an error is detected (either a hang or an error
|
|
|
|
* interrupt) to capture error state from the time of the error. Fills
|
|
|
|
* out a structure which becomes available in debugfs for user level tools
|
|
|
|
* to pick up.
|
|
|
|
*/
|
2016-05-06 22:40:21 +08:00
|
|
|
void i915_capture_error_state(struct drm_i915_private *dev_priv,
|
|
|
|
u32 engine_mask,
|
2014-02-25 23:11:26 +08:00
|
|
|
const char *error_msg)
|
2014-01-30 16:19:35 +08:00
|
|
|
{
|
2014-01-30 22:38:15 +08:00
|
|
|
static bool warned;
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *error;
|
2014-01-30 16:19:35 +08:00
|
|
|
unsigned long flags;
|
|
|
|
|
2017-09-20 03:38:44 +08:00
|
|
|
if (!i915_modparams.error_capture)
|
2016-10-12 17:05:18 +08:00
|
|
|
return;
|
|
|
|
|
2016-07-04 15:48:33 +08:00
|
|
|
if (READ_ONCE(dev_priv->gpu_error.first_error))
|
|
|
|
return;
|
|
|
|
|
2017-02-15 00:46:11 +08:00
|
|
|
error = i915_capture_gpu_state(dev_priv);
|
2014-01-30 16:19:35 +08:00
|
|
|
if (!error) {
|
|
|
|
DRM_DEBUG_DRIVER("out of memory, not capturing error state\n");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2016-05-06 22:40:21 +08:00
|
|
|
i915_error_capture_msg(dev_priv, error, engine_mask, error_msg);
|
2014-02-25 23:11:25 +08:00
|
|
|
DRM_INFO("%s\n", error->error_msg);
|
|
|
|
|
2016-07-04 15:08:39 +08:00
|
|
|
if (!error->simulated) {
|
|
|
|
spin_lock_irqsave(&dev_priv->gpu_error.lock, flags);
|
|
|
|
if (!dev_priv->gpu_error.first_error) {
|
|
|
|
dev_priv->gpu_error.first_error = error;
|
|
|
|
error = NULL;
|
|
|
|
}
|
|
|
|
spin_unlock_irqrestore(&dev_priv->gpu_error.lock, flags);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2014-02-25 23:11:25 +08:00
|
|
|
if (error) {
|
2017-02-15 00:46:11 +08:00
|
|
|
__i915_gpu_state_free(&error->ref);
|
2014-02-25 23:11:25 +08:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2016-10-14 21:44:28 +08:00
|
|
|
if (!warned &&
|
|
|
|
ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
|
2014-02-25 23:11:25 +08:00
|
|
|
DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
|
|
|
|
DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
|
|
|
|
DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
|
|
|
|
DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
|
2016-07-05 17:40:23 +08:00
|
|
|
DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
|
|
|
|
dev_priv->drm.primary->index);
|
2014-02-25 23:11:25 +08:00
|
|
|
warned = true;
|
|
|
|
}
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *
|
|
|
|
i915_first_error_state(struct drm_i915_private *i915)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *error;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2017-02-15 00:46:11 +08:00
|
|
|
spin_lock_irq(&i915->gpu_error.lock);
|
|
|
|
error = i915->gpu_error.first_error;
|
|
|
|
if (error)
|
|
|
|
i915_gpu_state_get(error);
|
|
|
|
spin_unlock_irq(&i915->gpu_error.lock);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2017-02-15 00:46:11 +08:00
|
|
|
return error;
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|
|
|
|
|
2017-02-15 00:46:11 +08:00
|
|
|
void i915_reset_error_state(struct drm_i915_private *i915)
|
2013-07-12 21:50:57 +08:00
|
|
|
{
|
2017-02-15 00:46:11 +08:00
|
|
|
struct i915_gpu_state *error;
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2017-02-15 00:46:11 +08:00
|
|
|
spin_lock_irq(&i915->gpu_error.lock);
|
|
|
|
error = i915->gpu_error.first_error;
|
|
|
|
i915->gpu_error.first_error = NULL;
|
|
|
|
spin_unlock_irq(&i915->gpu_error.lock);
|
2013-07-12 21:50:57 +08:00
|
|
|
|
2017-02-15 00:46:11 +08:00
|
|
|
i915_gpu_state_put(error);
|
2013-07-12 21:50:57 +08:00
|
|
|
}
|