drm/i915: Apply rps waitboosting for dma_fence_wait_timeout()

As time goes by, usage of generic ioctls such as drm_syncobj and
sync_file are on the increase bypassing i915-specific ioctls like
GEM_WAIT. Currently, we only apply waitboosting to our driver ioctls as
we track the file/client and account the waitboosting to them. However,
since commit 7b92c1bd05 ("drm/i915: Avoid keeping waitboost active for
signaling threads"), we no longer have been applying the client
ratelimiting on waitboosts and so that information has only been used
for debug tracking.

Push the application of waitboosting down to the common
i915_request_wait, and apply it to all foreign fence waits as well.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Eero Tamminen <eero.t.tamminen@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20190213092504.25709-1-chris@chris-wilson.co.uk
This commit is contained in:
Chris Wilson 2019-02-13 09:25:04 +00:00
parent e6ed078d6d
commit 62eb3c24b3
7 changed files with 44 additions and 98 deletions

View File

@ -2019,11 +2019,9 @@ static const char *rps_power_to_str(unsigned int power)
static int i915_rps_boost_info(struct seq_file *m, void *data)
{
struct drm_i915_private *dev_priv = node_to_i915(m->private);
struct drm_device *dev = &dev_priv->drm;
struct intel_rps *rps = &dev_priv->gt_pm.rps;
u32 act_freq = rps->cur_freq;
intel_wakeref_t wakeref;
struct drm_file *file;
with_intel_runtime_pm_if_in_use(dev_priv, wakeref) {
if (IS_VALLEYVIEW(dev_priv) || IS_CHERRYVIEW(dev_priv)) {
@ -2057,22 +2055,7 @@ static int i915_rps_boost_info(struct seq_file *m, void *data)
intel_gpu_freq(dev_priv, rps->efficient_freq),
intel_gpu_freq(dev_priv, rps->boost_freq));
mutex_lock(&dev->filelist_mutex);
list_for_each_entry_reverse(file, &dev->filelist, lhead) {
struct drm_i915_file_private *file_priv = file->driver_priv;
struct task_struct *task;
rcu_read_lock();
task = pid_task(file->pid, PIDTYPE_PID);
seq_printf(m, "%s [%d]: %d boosts\n",
task ? task->comm : "<unknown>",
task ? task->pid : -1,
atomic_read(&file_priv->rps_client.boosts));
rcu_read_unlock();
}
seq_printf(m, "Kernel (anonymous) boosts: %d\n",
atomic_read(&rps->boosts));
mutex_unlock(&dev->filelist_mutex);
seq_printf(m, "Wait boosts: %d\n", atomic_read(&rps->boosts));
if (INTEL_GEN(dev_priv) >= 6 &&
rps->enabled &&

View File

@ -217,10 +217,6 @@ struct drm_i915_file_private {
} mm;
struct idr context_idr;
struct intel_rps_client {
atomic_t boosts;
} rps_client;
unsigned int bsd_engine;
/*
@ -3056,8 +3052,7 @@ void i915_gem_resume(struct drm_i915_private *dev_priv);
vm_fault_t i915_gem_fault(struct vm_fault *vmf);
int i915_gem_object_wait(struct drm_i915_gem_object *obj,
unsigned int flags,
long timeout,
struct intel_rps_client *rps);
long timeout);
int i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
unsigned int flags,
const struct i915_sched_attr *attr);

View File

@ -416,8 +416,7 @@ int i915_gem_object_unbind(struct drm_i915_gem_object *obj)
static long
i915_gem_object_wait_fence(struct dma_fence *fence,
unsigned int flags,
long timeout,
struct intel_rps_client *rps_client)
long timeout)
{
struct i915_request *rq;
@ -435,27 +434,6 @@ i915_gem_object_wait_fence(struct dma_fence *fence,
if (i915_request_completed(rq))
goto out;
/*
* This client is about to stall waiting for the GPU. In many cases
* this is undesirable and limits the throughput of the system, as
* many clients cannot continue processing user input/output whilst
* blocked. RPS autotuning may take tens of milliseconds to respond
* to the GPU load and thus incurs additional latency for the client.
* We can circumvent that by promoting the GPU frequency to maximum
* before we wait. This makes the GPU throttle up much more quickly
* (good for benchmarks and user experience, e.g. window animations),
* but at a cost of spending more power processing the workload
* (bad for battery). Not all clients even want their results
* immediately and for them we should just let the GPU select its own
* frequency to maximise efficiency. To prevent a single client from
* forcing the clocks too high for the whole system, we only allow
* each client to waitboost once in a busy period.
*/
if (rps_client && !i915_request_started(rq)) {
if (INTEL_GEN(rq->i915) >= 6)
gen6_rps_boost(rq, rps_client);
}
timeout = i915_request_wait(rq, flags, timeout);
out:
@ -468,8 +446,7 @@ out:
static long
i915_gem_object_wait_reservation(struct reservation_object *resv,
unsigned int flags,
long timeout,
struct intel_rps_client *rps_client)
long timeout)
{
unsigned int seq = __read_seqcount_begin(&resv->seq);
struct dma_fence *excl;
@ -487,8 +464,7 @@ i915_gem_object_wait_reservation(struct reservation_object *resv,
for (i = 0; i < count; i++) {
timeout = i915_gem_object_wait_fence(shared[i],
flags, timeout,
rps_client);
flags, timeout);
if (timeout < 0)
break;
@ -514,8 +490,7 @@ i915_gem_object_wait_reservation(struct reservation_object *resv,
}
if (excl && timeout >= 0)
timeout = i915_gem_object_wait_fence(excl, flags, timeout,
rps_client);
timeout = i915_gem_object_wait_fence(excl, flags, timeout);
dma_fence_put(excl);
@ -609,30 +584,19 @@ i915_gem_object_wait_priority(struct drm_i915_gem_object *obj,
* @obj: i915 gem object
* @flags: how to wait (under a lock, for all rendering or just for writes etc)
* @timeout: how long to wait
* @rps_client: client (user process) to charge for any waitboosting
*/
int
i915_gem_object_wait(struct drm_i915_gem_object *obj,
unsigned int flags,
long timeout,
struct intel_rps_client *rps_client)
long timeout)
{
might_sleep();
GEM_BUG_ON(timeout < 0);
timeout = i915_gem_object_wait_reservation(obj->resv,
flags, timeout,
rps_client);
timeout = i915_gem_object_wait_reservation(obj->resv, flags, timeout);
return timeout < 0 ? timeout : 0;
}
static struct intel_rps_client *to_rps_client(struct drm_file *file)
{
struct drm_i915_file_private *fpriv = file->driver_priv;
return &fpriv->rps_client;
}
static int
i915_gem_phys_pwrite(struct drm_i915_gem_object *obj,
struct drm_i915_gem_pwrite *args,
@ -838,8 +802,7 @@ int i915_gem_obj_prepare_shmem_read(struct drm_i915_gem_object *obj,
ret = i915_gem_object_wait(obj,
I915_WAIT_INTERRUPTIBLE |
I915_WAIT_LOCKED,
MAX_SCHEDULE_TIMEOUT,
NULL);
MAX_SCHEDULE_TIMEOUT);
if (ret)
return ret;
@ -891,8 +854,7 @@ int i915_gem_obj_prepare_shmem_write(struct drm_i915_gem_object *obj,
I915_WAIT_INTERRUPTIBLE |
I915_WAIT_LOCKED |
I915_WAIT_ALL,
MAX_SCHEDULE_TIMEOUT,
NULL);
MAX_SCHEDULE_TIMEOUT);
if (ret)
return ret;
@ -1154,8 +1116,7 @@ i915_gem_pread_ioctl(struct drm_device *dev, void *data,
ret = i915_gem_object_wait(obj,
I915_WAIT_INTERRUPTIBLE,
MAX_SCHEDULE_TIMEOUT,
to_rps_client(file));
MAX_SCHEDULE_TIMEOUT);
if (ret)
goto out;
@ -1454,8 +1415,7 @@ i915_gem_pwrite_ioctl(struct drm_device *dev, void *data,
ret = i915_gem_object_wait(obj,
I915_WAIT_INTERRUPTIBLE |
I915_WAIT_ALL,
MAX_SCHEDULE_TIMEOUT,
to_rps_client(file));
MAX_SCHEDULE_TIMEOUT);
if (ret)
goto err;
@ -1553,8 +1513,7 @@ i915_gem_set_domain_ioctl(struct drm_device *dev, void *data,
I915_WAIT_INTERRUPTIBLE |
I915_WAIT_PRIORITY |
(write_domain ? I915_WAIT_ALL : 0),
MAX_SCHEDULE_TIMEOUT,
to_rps_client(file));
MAX_SCHEDULE_TIMEOUT);
if (err)
goto out;
@ -1863,8 +1822,7 @@ vm_fault_t i915_gem_fault(struct vm_fault *vmf)
*/
ret = i915_gem_object_wait(obj,
I915_WAIT_INTERRUPTIBLE,
MAX_SCHEDULE_TIMEOUT,
NULL);
MAX_SCHEDULE_TIMEOUT);
if (ret)
goto err;
@ -3195,8 +3153,7 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
I915_WAIT_INTERRUPTIBLE |
I915_WAIT_PRIORITY |
I915_WAIT_ALL,
to_wait_timeout(args->timeout_ns),
to_rps_client(file));
to_wait_timeout(args->timeout_ns));
if (args->timeout_ns > 0) {
args->timeout_ns -= ktime_to_ns(ktime_sub(ktime_get(), start));
@ -3265,7 +3222,7 @@ wait_for_timelines(struct drm_i915_private *i915,
* stalls, so allow the gpu to boost to maximum clocks.
*/
if (flags & I915_WAIT_FOR_IDLE_BOOST)
gen6_rps_boost(rq, NULL);
gen6_rps_boost(rq);
timeout = i915_request_wait(rq, flags, timeout);
i915_request_put(rq);
@ -3360,8 +3317,7 @@ i915_gem_object_set_to_wc_domain(struct drm_i915_gem_object *obj, bool write)
I915_WAIT_INTERRUPTIBLE |
I915_WAIT_LOCKED |
(write ? I915_WAIT_ALL : 0),
MAX_SCHEDULE_TIMEOUT,
NULL);
MAX_SCHEDULE_TIMEOUT);
if (ret)
return ret;
@ -3423,8 +3379,7 @@ i915_gem_object_set_to_gtt_domain(struct drm_i915_gem_object *obj, bool write)
I915_WAIT_INTERRUPTIBLE |
I915_WAIT_LOCKED |
(write ? I915_WAIT_ALL : 0),
MAX_SCHEDULE_TIMEOUT,
NULL);
MAX_SCHEDULE_TIMEOUT);
if (ret)
return ret;
@ -3539,8 +3494,7 @@ restart:
I915_WAIT_INTERRUPTIBLE |
I915_WAIT_LOCKED |
I915_WAIT_ALL,
MAX_SCHEDULE_TIMEOUT,
NULL);
MAX_SCHEDULE_TIMEOUT);
if (ret)
return ret;
@ -3678,8 +3632,7 @@ int i915_gem_set_caching_ioctl(struct drm_device *dev, void *data,
ret = i915_gem_object_wait(obj,
I915_WAIT_INTERRUPTIBLE,
MAX_SCHEDULE_TIMEOUT,
to_rps_client(file));
MAX_SCHEDULE_TIMEOUT);
if (ret)
goto out;
@ -3805,8 +3758,7 @@ i915_gem_object_set_to_cpu_domain(struct drm_i915_gem_object *obj, bool write)
I915_WAIT_INTERRUPTIBLE |
I915_WAIT_LOCKED |
(write ? I915_WAIT_ALL : 0),
MAX_SCHEDULE_TIMEOUT,
NULL);
MAX_SCHEDULE_TIMEOUT);
if (ret)
return ret;

View File

@ -68,7 +68,9 @@ static signed long i915_fence_wait(struct dma_fence *fence,
bool interruptible,
signed long timeout)
{
return i915_request_wait(to_request(fence), interruptible, timeout);
return i915_request_wait(to_request(fence),
interruptible | I915_WAIT_PRIORITY,
timeout);
}
static void i915_fence_release(struct dma_fence *fence)
@ -1136,8 +1138,23 @@ long i915_request_wait(struct i915_request *rq,
if (__i915_spin_request(rq, state, 5))
goto out;
if (flags & I915_WAIT_PRIORITY)
/*
* This client is about to stall waiting for the GPU. In many cases
* this is undesirable and limits the throughput of the system, as
* many clients cannot continue processing user input/output whilst
* blocked. RPS autotuning may take tens of milliseconds to respond
* to the GPU load and thus incurs additional latency for the client.
* We can circumvent that by promoting the GPU frequency to maximum
* before we sleep. This makes the GPU throttle up much more quickly
* (good for benchmarks and user experience, e.g. window animations),
* but at a cost of spending more power processing the workload
* (bad for battery).
*/
if (flags & I915_WAIT_PRIORITY) {
if (!i915_request_started(rq) && INTEL_GEN(rq->i915) >= 6)
gen6_rps_boost(rq);
i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
}
wait.tsk = current;
if (dma_fence_add_callback(&rq->fence, &wait.cb, request_wait_wake))

View File

@ -13559,7 +13559,7 @@ static int do_rps_boost(struct wait_queue_entry *_wait,
* vblank without our intervention, so leave RPS alone.
*/
if (!i915_request_started(rq))
gen6_rps_boost(rq, NULL);
gen6_rps_boost(rq);
i915_request_put(rq);
drm_crtc_vblank_put(wait->crtc);

View File

@ -2266,7 +2266,7 @@ void intel_suspend_gt_powersave(struct drm_i915_private *dev_priv);
void gen6_rps_busy(struct drm_i915_private *dev_priv);
void gen6_rps_reset_ei(struct drm_i915_private *dev_priv);
void gen6_rps_idle(struct drm_i915_private *dev_priv);
void gen6_rps_boost(struct i915_request *rq, struct intel_rps_client *rps);
void gen6_rps_boost(struct i915_request *rq);
void g4x_wm_get_hw_state(struct drm_i915_private *dev_priv);
void vlv_wm_get_hw_state(struct drm_i915_private *dev_priv);
void ilk_wm_get_hw_state(struct drm_i915_private *dev_priv);

View File

@ -6768,8 +6768,7 @@ void gen6_rps_idle(struct drm_i915_private *dev_priv)
mutex_unlock(&dev_priv->pcu_lock);
}
void gen6_rps_boost(struct i915_request *rq,
struct intel_rps_client *rps_client)
void gen6_rps_boost(struct i915_request *rq)
{
struct intel_rps *rps = &rq->i915->gt_pm.rps;
unsigned long flags;
@ -6798,7 +6797,7 @@ void gen6_rps_boost(struct i915_request *rq,
if (READ_ONCE(rps->cur_freq) < rps->boost_freq)
schedule_work(&rps->work);
atomic_inc(rps_client ? &rps_client->boosts : &rps->boosts);
atomic_inc(&rps->boosts);
}
int intel_set_rps(struct drm_i915_private *dev_priv, u8 val)