workqueue: Automatically mark CPU-hogging work items CPU_INTENSIVE
If a per-cpu work item hogs the CPU, it can prevent other work items from starting through concurrency management. A per-cpu workqueue which intends to host such CPU-hogging work items can choose to not participate in concurrency management by setting %WQ_CPU_INTENSIVE; however, this can be error-prone and difficult to debug when missed. This patch adds an automatic CPU usage based detection. If a concurrency-managed work item consumes more CPU time than the threshold (10ms by default) continuously without intervening sleeps, wq_worker_tick() which is called from scheduler_tick() will detect the condition and automatically mark it CPU_INTENSIVE. The mechanism isn't foolproof: * Detection depends on tick hitting the work item. Getting preempted at the right timings may allow a violating work item to evade detection at least temporarily. * nohz_full CPUs may not be running ticks and thus can fail detection. * Even when detection is working, the 10ms detection delays can add up if many CPU-hogging work items are queued at the same time. However, in vast majority of cases, this should be able to detect violations reliably and provide reasonable protection with a small increase in code complexity. If some work items trigger this condition repeatedly, the bigger problem likely is the CPU being saturated with such per-cpu work items and the solution would be making them UNBOUND. The next patch will add a debug mechanism to help spot such cases. v4: Documentation for workqueue.cpu_intensive_thresh_us added to kernel-parameters.txt. v3: Switch to use wq_worker_tick() instead of hooking into preemptions as suggested by Peter. v2: Lai pointed out that wq_worker_stopping() also needs to be called from preemption and rtlock paths and an earlier patch was updated accordingly. This patch adds a comment describing the risk of infinte recursions and how they're avoided. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Lai Jiangshan <jiangshanlai@gmail.com>
This commit is contained in:
parent
bdf8b9bfc1
commit
616db8779b
|
@ -6931,6 +6931,13 @@
|
||||||
it can be updated at runtime by writing to the
|
it can be updated at runtime by writing to the
|
||||||
corresponding sysfs file.
|
corresponding sysfs file.
|
||||||
|
|
||||||
|
workqueue.cpu_intensive_thresh_us=
|
||||||
|
Per-cpu work items which run for longer than this
|
||||||
|
threshold are automatically considered CPU intensive
|
||||||
|
and excluded from concurrency management to prevent
|
||||||
|
them from noticeably delaying other per-cpu work
|
||||||
|
items. Default is 10000 (10ms).
|
||||||
|
|
||||||
workqueue.disable_numa
|
workqueue.disable_numa
|
||||||
By default, all work items queued to unbound
|
By default, all work items queued to unbound
|
||||||
workqueues are affine to the NUMA nodes they're
|
workqueues are affine to the NUMA nodes they're
|
||||||
|
|
|
@ -354,25 +354,25 @@ Monitoring
|
||||||
Use tools/workqueue/wq_monitor.py to monitor workqueue operations: ::
|
Use tools/workqueue/wq_monitor.py to monitor workqueue operations: ::
|
||||||
|
|
||||||
$ tools/workqueue/wq_monitor.py events
|
$ tools/workqueue/wq_monitor.py events
|
||||||
total infl CMwake mayday rescued
|
total infl CPUitsv CMwake mayday rescued
|
||||||
events 18545 0 5 - -
|
events 18545 0 0 5 - -
|
||||||
events_highpri 8 0 0 - -
|
events_highpri 8 0 0 0 - -
|
||||||
events_long 3 0 0 - -
|
events_long 3 0 0 0 - -
|
||||||
events_unbound 38306 0 - - -
|
events_unbound 38306 0 - - - -
|
||||||
events_freezable 0 0 0 - -
|
events_freezable 0 0 0 0 - -
|
||||||
events_power_efficient 29598 0 0 - -
|
events_power_efficient 29598 0 0 0 - -
|
||||||
events_freezable_power_ 10 0 0 - -
|
events_freezable_power_ 10 0 0 0 - -
|
||||||
sock_diag_events 0 0 0 - -
|
sock_diag_events 0 0 0 0 - -
|
||||||
|
|
||||||
total infl CMwake mayday rescued
|
total infl CPUitsv CMwake mayday rescued
|
||||||
events 18548 0 5 - -
|
events 18548 0 0 5 - -
|
||||||
events_highpri 8 0 0 - -
|
events_highpri 8 0 0 0 - -
|
||||||
events_long 3 0 0 - -
|
events_long 3 0 0 0 - -
|
||||||
events_unbound 38322 0 - - -
|
events_unbound 38322 0 - - - -
|
||||||
events_freezable 0 0 0 - -
|
events_freezable 0 0 0 0 - -
|
||||||
events_power_efficient 29603 0 0 - -
|
events_power_efficient 29603 0 0 0 - -
|
||||||
events_freezable_power_ 10 0 0 - -
|
events_freezable_power_ 10 0 0 0 - -
|
||||||
sock_diag_events 0 0 0 - -
|
sock_diag_events 0 0 0 0 - -
|
||||||
|
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
|
@ -5632,6 +5632,9 @@ void scheduler_tick(void)
|
||||||
|
|
||||||
perf_event_task_tick();
|
perf_event_task_tick();
|
||||||
|
|
||||||
|
if (curr->flags & PF_WQ_WORKER)
|
||||||
|
wq_worker_tick(curr);
|
||||||
|
|
||||||
#ifdef CONFIG_SMP
|
#ifdef CONFIG_SMP
|
||||||
rq->idle_balance = idle_cpu(cpu);
|
rq->idle_balance = idle_cpu(cpu);
|
||||||
trigger_load_balance(rq);
|
trigger_load_balance(rq);
|
||||||
|
|
|
@ -212,6 +212,7 @@ struct worker_pool {
|
||||||
enum pool_workqueue_stats {
|
enum pool_workqueue_stats {
|
||||||
PWQ_STAT_STARTED, /* work items started execution */
|
PWQ_STAT_STARTED, /* work items started execution */
|
||||||
PWQ_STAT_COMPLETED, /* work items completed execution */
|
PWQ_STAT_COMPLETED, /* work items completed execution */
|
||||||
|
PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */
|
||||||
PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */
|
PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */
|
||||||
PWQ_STAT_MAYDAY, /* maydays to rescuer */
|
PWQ_STAT_MAYDAY, /* maydays to rescuer */
|
||||||
PWQ_STAT_RESCUED, /* linked work items executed by rescuer */
|
PWQ_STAT_RESCUED, /* linked work items executed by rescuer */
|
||||||
|
@ -332,6 +333,14 @@ static struct kmem_cache *pwq_cache;
|
||||||
static cpumask_var_t *wq_numa_possible_cpumask;
|
static cpumask_var_t *wq_numa_possible_cpumask;
|
||||||
/* possible CPUs of each node */
|
/* possible CPUs of each node */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Per-cpu work items which run for longer than the following threshold are
|
||||||
|
* automatically considered CPU intensive and excluded from concurrency
|
||||||
|
* management to prevent them from noticeably delaying other per-cpu work items.
|
||||||
|
*/
|
||||||
|
static unsigned long wq_cpu_intensive_thresh_us = 10000;
|
||||||
|
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
|
||||||
|
|
||||||
static bool wq_disable_numa;
|
static bool wq_disable_numa;
|
||||||
module_param_named(disable_numa, wq_disable_numa, bool, 0444);
|
module_param_named(disable_numa, wq_disable_numa, bool, 0444);
|
||||||
|
|
||||||
|
@ -962,6 +971,13 @@ void wq_worker_running(struct task_struct *task)
|
||||||
if (!(worker->flags & WORKER_NOT_RUNNING))
|
if (!(worker->flags & WORKER_NOT_RUNNING))
|
||||||
worker->pool->nr_running++;
|
worker->pool->nr_running++;
|
||||||
preempt_enable();
|
preempt_enable();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* CPU intensive auto-detection cares about how long a work item hogged
|
||||||
|
* CPU without sleeping. Reset the starting timestamp on wakeup.
|
||||||
|
*/
|
||||||
|
worker->current_at = worker->task->se.sum_exec_runtime;
|
||||||
|
|
||||||
worker->sleeping = 0;
|
worker->sleeping = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1012,6 +1028,45 @@ void wq_worker_sleeping(struct task_struct *task)
|
||||||
raw_spin_unlock_irq(&pool->lock);
|
raw_spin_unlock_irq(&pool->lock);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* wq_worker_tick - a scheduler tick occurred while a kworker is running
|
||||||
|
* @task: task currently running
|
||||||
|
*
|
||||||
|
* Called from scheduler_tick(). We're in the IRQ context and the current
|
||||||
|
* worker's fields which follow the 'K' locking rule can be accessed safely.
|
||||||
|
*/
|
||||||
|
void wq_worker_tick(struct task_struct *task)
|
||||||
|
{
|
||||||
|
struct worker *worker = kthread_data(task);
|
||||||
|
struct pool_workqueue *pwq = worker->current_pwq;
|
||||||
|
struct worker_pool *pool = worker->pool;
|
||||||
|
|
||||||
|
if (!pwq)
|
||||||
|
return;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the current worker is concurrency managed and hogged the CPU for
|
||||||
|
* longer than wq_cpu_intensive_thresh_us, it's automatically marked
|
||||||
|
* CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
|
||||||
|
*/
|
||||||
|
if ((worker->flags & WORKER_NOT_RUNNING) ||
|
||||||
|
worker->task->se.sum_exec_runtime - worker->current_at <
|
||||||
|
wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
|
||||||
|
return;
|
||||||
|
|
||||||
|
raw_spin_lock(&pool->lock);
|
||||||
|
|
||||||
|
worker_set_flags(worker, WORKER_CPU_INTENSIVE);
|
||||||
|
pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;
|
||||||
|
|
||||||
|
if (need_more_worker(pool)) {
|
||||||
|
pwq->stats[PWQ_STAT_CM_WAKEUP]++;
|
||||||
|
wake_up_worker(pool);
|
||||||
|
}
|
||||||
|
|
||||||
|
raw_spin_unlock(&pool->lock);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* wq_worker_last_func - retrieve worker's last work function
|
* wq_worker_last_func - retrieve worker's last work function
|
||||||
* @task: Task to retrieve last work function of.
|
* @task: Task to retrieve last work function of.
|
||||||
|
@ -2327,7 +2382,6 @@ __acquires(&pool->lock)
|
||||||
{
|
{
|
||||||
struct pool_workqueue *pwq = get_work_pwq(work);
|
struct pool_workqueue *pwq = get_work_pwq(work);
|
||||||
struct worker_pool *pool = worker->pool;
|
struct worker_pool *pool = worker->pool;
|
||||||
bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
|
|
||||||
unsigned long work_data;
|
unsigned long work_data;
|
||||||
struct worker *collision;
|
struct worker *collision;
|
||||||
#ifdef CONFIG_LOCKDEP
|
#ifdef CONFIG_LOCKDEP
|
||||||
|
@ -2364,6 +2418,7 @@ __acquires(&pool->lock)
|
||||||
worker->current_work = work;
|
worker->current_work = work;
|
||||||
worker->current_func = work->func;
|
worker->current_func = work->func;
|
||||||
worker->current_pwq = pwq;
|
worker->current_pwq = pwq;
|
||||||
|
worker->current_at = worker->task->se.sum_exec_runtime;
|
||||||
work_data = *work_data_bits(work);
|
work_data = *work_data_bits(work);
|
||||||
worker->current_color = get_work_color(work_data);
|
worker->current_color = get_work_color(work_data);
|
||||||
|
|
||||||
|
@ -2381,7 +2436,7 @@ __acquires(&pool->lock)
|
||||||
* of concurrency management and the next code block will chain
|
* of concurrency management and the next code block will chain
|
||||||
* execution of the pending work items.
|
* execution of the pending work items.
|
||||||
*/
|
*/
|
||||||
if (unlikely(cpu_intensive))
|
if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
|
||||||
worker_set_flags(worker, WORKER_CPU_INTENSIVE);
|
worker_set_flags(worker, WORKER_CPU_INTENSIVE);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -2461,9 +2516,12 @@ __acquires(&pool->lock)
|
||||||
|
|
||||||
raw_spin_lock_irq(&pool->lock);
|
raw_spin_lock_irq(&pool->lock);
|
||||||
|
|
||||||
/* clear cpu intensive status */
|
/*
|
||||||
if (unlikely(cpu_intensive))
|
* In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
|
||||||
worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
|
* CPU intensive by wq_worker_tick() if @work hogged CPU longer than
|
||||||
|
* wq_cpu_intensive_thresh_us. Clear it.
|
||||||
|
*/
|
||||||
|
worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
|
||||||
|
|
||||||
/* tag the worker for identification in schedule() */
|
/* tag the worker for identification in schedule() */
|
||||||
worker->last_func = worker->current_func;
|
worker->last_func = worker->current_func;
|
||||||
|
|
|
@ -31,6 +31,7 @@ struct worker {
|
||||||
struct work_struct *current_work; /* K: work being processed and its */
|
struct work_struct *current_work; /* K: work being processed and its */
|
||||||
work_func_t current_func; /* K: function */
|
work_func_t current_func; /* K: function */
|
||||||
struct pool_workqueue *current_pwq; /* K: pwq */
|
struct pool_workqueue *current_pwq; /* K: pwq */
|
||||||
|
u64 current_at; /* K: runtime at start or last wakeup */
|
||||||
unsigned int current_color; /* K: color */
|
unsigned int current_color; /* K: color */
|
||||||
|
|
||||||
int sleeping; /* S: is worker sleeping? */
|
int sleeping; /* S: is worker sleeping? */
|
||||||
|
@ -76,6 +77,7 @@ static inline struct worker *current_wq_worker(void)
|
||||||
*/
|
*/
|
||||||
void wq_worker_running(struct task_struct *task);
|
void wq_worker_running(struct task_struct *task);
|
||||||
void wq_worker_sleeping(struct task_struct *task);
|
void wq_worker_sleeping(struct task_struct *task);
|
||||||
|
void wq_worker_tick(struct task_struct *task);
|
||||||
work_func_t wq_worker_last_func(struct task_struct *task);
|
work_func_t wq_worker_last_func(struct task_struct *task);
|
||||||
|
|
||||||
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
|
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */
|
||||||
|
|
|
@ -11,6 +11,11 @@ https://github.com/osandov/drgn.
|
||||||
|
|
||||||
infl The number of currently in-flight work items.
|
infl The number of currently in-flight work items.
|
||||||
|
|
||||||
|
CPUitsv The number of times a concurrency-managed work item hogged CPU
|
||||||
|
longer than the threshold (workqueue.cpu_intensive_thresh_us)
|
||||||
|
and got excluded from concurrency management to avoid stalling
|
||||||
|
other work items.
|
||||||
|
|
||||||
CMwake The number of concurrency-management wake-ups while executing a
|
CMwake The number of concurrency-management wake-ups while executing a
|
||||||
work item of the workqueue.
|
work item of the workqueue.
|
||||||
|
|
||||||
|
@ -53,6 +58,7 @@ WQ_MEM_RECLAIM = prog['WQ_MEM_RECLAIM']
|
||||||
|
|
||||||
PWQ_STAT_STARTED = prog['PWQ_STAT_STARTED'] # work items started execution
|
PWQ_STAT_STARTED = prog['PWQ_STAT_STARTED'] # work items started execution
|
||||||
PWQ_STAT_COMPLETED = prog['PWQ_STAT_COMPLETED'] # work items completed execution
|
PWQ_STAT_COMPLETED = prog['PWQ_STAT_COMPLETED'] # work items completed execution
|
||||||
|
PWQ_STAT_CPU_INTENSIVE = prog['PWQ_STAT_CPU_INTENSIVE'] # wq_cpu_intensive_thresh_us violations
|
||||||
PWQ_STAT_CM_WAKEUP = prog['PWQ_STAT_CM_WAKEUP'] # concurrency-management worker wakeups
|
PWQ_STAT_CM_WAKEUP = prog['PWQ_STAT_CM_WAKEUP'] # concurrency-management worker wakeups
|
||||||
PWQ_STAT_MAYDAY = prog['PWQ_STAT_MAYDAY'] # maydays to rescuer
|
PWQ_STAT_MAYDAY = prog['PWQ_STAT_MAYDAY'] # maydays to rescuer
|
||||||
PWQ_STAT_RESCUED = prog['PWQ_STAT_RESCUED'] # linked work items executed by rescuer
|
PWQ_STAT_RESCUED = prog['PWQ_STAT_RESCUED'] # linked work items executed by rescuer
|
||||||
|
@ -75,19 +81,23 @@ class WqStats:
|
||||||
'mem_reclaim' : self.mem_reclaim,
|
'mem_reclaim' : self.mem_reclaim,
|
||||||
'started' : self.stats[PWQ_STAT_STARTED],
|
'started' : self.stats[PWQ_STAT_STARTED],
|
||||||
'completed' : self.stats[PWQ_STAT_COMPLETED],
|
'completed' : self.stats[PWQ_STAT_COMPLETED],
|
||||||
|
'cpu_intensive' : self.stats[PWQ_STAT_CPU_INTENSIVE],
|
||||||
'cm_wakeup' : self.stats[PWQ_STAT_CM_WAKEUP],
|
'cm_wakeup' : self.stats[PWQ_STAT_CM_WAKEUP],
|
||||||
'mayday' : self.stats[PWQ_STAT_MAYDAY],
|
'mayday' : self.stats[PWQ_STAT_MAYDAY],
|
||||||
'rescued' : self.stats[PWQ_STAT_RESCUED], }
|
'rescued' : self.stats[PWQ_STAT_RESCUED], }
|
||||||
|
|
||||||
def table_header_str():
|
def table_header_str():
|
||||||
return f'{"":>24} {"total":>8} {"infl":>5} {"CMwake":>7} {"mayday":>7} {"rescued":>7}'
|
return f'{"":>24} {"total":>8} {"infl":>5} '\
|
||||||
|
f'{"CPUitsv":>7} {"CMwake":>7} {"mayday":>7} {"rescued":>7}'
|
||||||
|
|
||||||
def table_row_str(self):
|
def table_row_str(self):
|
||||||
|
cpu_intensive = '-'
|
||||||
cm_wakeup = '-'
|
cm_wakeup = '-'
|
||||||
mayday = '-'
|
mayday = '-'
|
||||||
rescued = '-'
|
rescued = '-'
|
||||||
|
|
||||||
if not self.unbound:
|
if not self.unbound:
|
||||||
|
cpu_intensive = str(self.stats[PWQ_STAT_CPU_INTENSIVE])
|
||||||
cm_wakeup = str(self.stats[PWQ_STAT_CM_WAKEUP])
|
cm_wakeup = str(self.stats[PWQ_STAT_CM_WAKEUP])
|
||||||
|
|
||||||
if self.mem_reclaim:
|
if self.mem_reclaim:
|
||||||
|
@ -97,6 +107,7 @@ class WqStats:
|
||||||
out = f'{self.name[-24:]:24} ' \
|
out = f'{self.name[-24:]:24} ' \
|
||||||
f'{self.stats[PWQ_STAT_STARTED]:8} ' \
|
f'{self.stats[PWQ_STAT_STARTED]:8} ' \
|
||||||
f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \
|
f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \
|
||||||
|
f'{cpu_intensive:>7} ' \
|
||||||
f'{cm_wakeup:>7} ' \
|
f'{cm_wakeup:>7} ' \
|
||||||
f'{mayday:>7} ' \
|
f'{mayday:>7} ' \
|
||||||
f'{rescued:>7} '
|
f'{rescued:>7} '
|
||||||
|
|
Loading…
Reference in New Issue