workqueue: Automatically mark CPU-hogging work items CPU_INTENSIVE

If a per-cpu work item hogs the CPU, it can prevent other work items from
starting through concurrency management. A per-cpu workqueue which intends
to host such CPU-hogging work items can choose to not participate in
concurrency management by setting %WQ_CPU_INTENSIVE; however, this can be
error-prone and difficult to debug when missed.

This patch adds an automatic CPU usage based detection. If a
concurrency-managed work item consumes more CPU time than the threshold
(10ms by default) continuously without intervening sleeps, wq_worker_tick()
which is called from scheduler_tick() will detect the condition and
automatically mark it CPU_INTENSIVE.

The mechanism isn't foolproof:

* Detection depends on tick hitting the work item. Getting preempted at the
  right timings may allow a violating work item to evade detection at least
  temporarily.

* nohz_full CPUs may not be running ticks and thus can fail detection.

* Even when detection is working, the 10ms detection delays can add up if
  many CPU-hogging work items are queued at the same time.

However, in vast majority of cases, this should be able to detect violations
reliably and provide reasonable protection with a small increase in code
complexity.

If some work items trigger this condition repeatedly, the bigger problem
likely is the CPU being saturated with such per-cpu work items and the
solution would be making them UNBOUND. The next patch will add a debug
mechanism to help spot such cases.

v4: Documentation for workqueue.cpu_intensive_thresh_us added to
    kernel-parameters.txt.

v3: Switch to use wq_worker_tick() instead of hooking into preemptions as
    suggested by Peter.

v2: Lai pointed out that wq_worker_stopping() also needs to be called from
    preemption and rtlock paths and an earlier patch was updated
    accordingly. This patch adds a comment describing the risk of infinte
    recursions and how they're avoided.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Lai Jiangshan <jiangshanlai@gmail.com>
This commit is contained in:
Tejun Heo 2023-05-17 17:02:08 -10:00
parent bdf8b9bfc1
commit 616db8779b
6 changed files with 105 additions and 24 deletions

View File

@ -6931,6 +6931,13 @@
it can be updated at runtime by writing to the it can be updated at runtime by writing to the
corresponding sysfs file. corresponding sysfs file.
workqueue.cpu_intensive_thresh_us=
Per-cpu work items which run for longer than this
threshold are automatically considered CPU intensive
and excluded from concurrency management to prevent
them from noticeably delaying other per-cpu work
items. Default is 10000 (10ms).
workqueue.disable_numa workqueue.disable_numa
By default, all work items queued to unbound By default, all work items queued to unbound
workqueues are affine to the NUMA nodes they're workqueues are affine to the NUMA nodes they're

View File

@ -354,25 +354,25 @@ Monitoring
Use tools/workqueue/wq_monitor.py to monitor workqueue operations: :: Use tools/workqueue/wq_monitor.py to monitor workqueue operations: ::
$ tools/workqueue/wq_monitor.py events $ tools/workqueue/wq_monitor.py events
total infl CMwake mayday rescued total infl CPUitsv CMwake mayday rescued
events 18545 0 5 - - events 18545 0 0 5 - -
events_highpri 8 0 0 - - events_highpri 8 0 0 0 - -
events_long 3 0 0 - - events_long 3 0 0 0 - -
events_unbound 38306 0 - - - events_unbound 38306 0 - - - -
events_freezable 0 0 0 - - events_freezable 0 0 0 0 - -
events_power_efficient 29598 0 0 - - events_power_efficient 29598 0 0 0 - -
events_freezable_power_ 10 0 0 - - events_freezable_power_ 10 0 0 0 - -
sock_diag_events 0 0 0 - - sock_diag_events 0 0 0 0 - -
total infl CMwake mayday rescued total infl CPUitsv CMwake mayday rescued
events 18548 0 5 - - events 18548 0 0 5 - -
events_highpri 8 0 0 - - events_highpri 8 0 0 0 - -
events_long 3 0 0 - - events_long 3 0 0 0 - -
events_unbound 38322 0 - - - events_unbound 38322 0 - - - -
events_freezable 0 0 0 - - events_freezable 0 0 0 0 - -
events_power_efficient 29603 0 0 - - events_power_efficient 29603 0 0 0 - -
events_freezable_power_ 10 0 0 - - events_freezable_power_ 10 0 0 0 - -
sock_diag_events 0 0 0 - - sock_diag_events 0 0 0 0 - -
... ...

View File

@ -5632,6 +5632,9 @@ void scheduler_tick(void)
perf_event_task_tick(); perf_event_task_tick();
if (curr->flags & PF_WQ_WORKER)
wq_worker_tick(curr);
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu); rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq); trigger_load_balance(rq);

View File

@ -212,6 +212,7 @@ struct worker_pool {
enum pool_workqueue_stats { enum pool_workqueue_stats {
PWQ_STAT_STARTED, /* work items started execution */ PWQ_STAT_STARTED, /* work items started execution */
PWQ_STAT_COMPLETED, /* work items completed execution */ PWQ_STAT_COMPLETED, /* work items completed execution */
PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */
PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */ PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */
PWQ_STAT_MAYDAY, /* maydays to rescuer */ PWQ_STAT_MAYDAY, /* maydays to rescuer */
PWQ_STAT_RESCUED, /* linked work items executed by rescuer */ PWQ_STAT_RESCUED, /* linked work items executed by rescuer */
@ -332,6 +333,14 @@ static struct kmem_cache *pwq_cache;
static cpumask_var_t *wq_numa_possible_cpumask; static cpumask_var_t *wq_numa_possible_cpumask;
/* possible CPUs of each node */ /* possible CPUs of each node */
/*
* Per-cpu work items which run for longer than the following threshold are
* automatically considered CPU intensive and excluded from concurrency
* management to prevent them from noticeably delaying other per-cpu work items.
*/
static unsigned long wq_cpu_intensive_thresh_us = 10000;
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
static bool wq_disable_numa; static bool wq_disable_numa;
module_param_named(disable_numa, wq_disable_numa, bool, 0444); module_param_named(disable_numa, wq_disable_numa, bool, 0444);
@ -962,6 +971,13 @@ void wq_worker_running(struct task_struct *task)
if (!(worker->flags & WORKER_NOT_RUNNING)) if (!(worker->flags & WORKER_NOT_RUNNING))
worker->pool->nr_running++; worker->pool->nr_running++;
preempt_enable(); preempt_enable();
/*
* CPU intensive auto-detection cares about how long a work item hogged
* CPU without sleeping. Reset the starting timestamp on wakeup.
*/
worker->current_at = worker->task->se.sum_exec_runtime;
worker->sleeping = 0; worker->sleeping = 0;
} }
@ -1012,6 +1028,45 @@ void wq_worker_sleeping(struct task_struct *task)
raw_spin_unlock_irq(&pool->lock); raw_spin_unlock_irq(&pool->lock);
} }
/**
* wq_worker_tick - a scheduler tick occurred while a kworker is running
* @task: task currently running
*
* Called from scheduler_tick(). We're in the IRQ context and the current
* worker's fields which follow the 'K' locking rule can be accessed safely.
*/
void wq_worker_tick(struct task_struct *task)
{
struct worker *worker = kthread_data(task);
struct pool_workqueue *pwq = worker->current_pwq;
struct worker_pool *pool = worker->pool;
if (!pwq)
return;
/*
* If the current worker is concurrency managed and hogged the CPU for
* longer than wq_cpu_intensive_thresh_us, it's automatically marked
* CPU_INTENSIVE to avoid stalling other concurrency-managed work items.
*/
if ((worker->flags & WORKER_NOT_RUNNING) ||
worker->task->se.sum_exec_runtime - worker->current_at <
wq_cpu_intensive_thresh_us * NSEC_PER_USEC)
return;
raw_spin_lock(&pool->lock);
worker_set_flags(worker, WORKER_CPU_INTENSIVE);
pwq->stats[PWQ_STAT_CPU_INTENSIVE]++;
if (need_more_worker(pool)) {
pwq->stats[PWQ_STAT_CM_WAKEUP]++;
wake_up_worker(pool);
}
raw_spin_unlock(&pool->lock);
}
/** /**
* wq_worker_last_func - retrieve worker's last work function * wq_worker_last_func - retrieve worker's last work function
* @task: Task to retrieve last work function of. * @task: Task to retrieve last work function of.
@ -2327,7 +2382,6 @@ __acquires(&pool->lock)
{ {
struct pool_workqueue *pwq = get_work_pwq(work); struct pool_workqueue *pwq = get_work_pwq(work);
struct worker_pool *pool = worker->pool; struct worker_pool *pool = worker->pool;
bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
unsigned long work_data; unsigned long work_data;
struct worker *collision; struct worker *collision;
#ifdef CONFIG_LOCKDEP #ifdef CONFIG_LOCKDEP
@ -2364,6 +2418,7 @@ __acquires(&pool->lock)
worker->current_work = work; worker->current_work = work;
worker->current_func = work->func; worker->current_func = work->func;
worker->current_pwq = pwq; worker->current_pwq = pwq;
worker->current_at = worker->task->se.sum_exec_runtime;
work_data = *work_data_bits(work); work_data = *work_data_bits(work);
worker->current_color = get_work_color(work_data); worker->current_color = get_work_color(work_data);
@ -2381,7 +2436,7 @@ __acquires(&pool->lock)
* of concurrency management and the next code block will chain * of concurrency management and the next code block will chain
* execution of the pending work items. * execution of the pending work items.
*/ */
if (unlikely(cpu_intensive)) if (unlikely(pwq->wq->flags & WQ_CPU_INTENSIVE))
worker_set_flags(worker, WORKER_CPU_INTENSIVE); worker_set_flags(worker, WORKER_CPU_INTENSIVE);
/* /*
@ -2461,9 +2516,12 @@ __acquires(&pool->lock)
raw_spin_lock_irq(&pool->lock); raw_spin_lock_irq(&pool->lock);
/* clear cpu intensive status */ /*
if (unlikely(cpu_intensive)) * In addition to %WQ_CPU_INTENSIVE, @worker may also have been marked
worker_clr_flags(worker, WORKER_CPU_INTENSIVE); * CPU intensive by wq_worker_tick() if @work hogged CPU longer than
* wq_cpu_intensive_thresh_us. Clear it.
*/
worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
/* tag the worker for identification in schedule() */ /* tag the worker for identification in schedule() */
worker->last_func = worker->current_func; worker->last_func = worker->current_func;

View File

@ -31,6 +31,7 @@ struct worker {
struct work_struct *current_work; /* K: work being processed and its */ struct work_struct *current_work; /* K: work being processed and its */
work_func_t current_func; /* K: function */ work_func_t current_func; /* K: function */
struct pool_workqueue *current_pwq; /* K: pwq */ struct pool_workqueue *current_pwq; /* K: pwq */
u64 current_at; /* K: runtime at start or last wakeup */
unsigned int current_color; /* K: color */ unsigned int current_color; /* K: color */
int sleeping; /* S: is worker sleeping? */ int sleeping; /* S: is worker sleeping? */
@ -76,6 +77,7 @@ static inline struct worker *current_wq_worker(void)
*/ */
void wq_worker_running(struct task_struct *task); void wq_worker_running(struct task_struct *task);
void wq_worker_sleeping(struct task_struct *task); void wq_worker_sleeping(struct task_struct *task);
void wq_worker_tick(struct task_struct *task);
work_func_t wq_worker_last_func(struct task_struct *task); work_func_t wq_worker_last_func(struct task_struct *task);
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */

View File

@ -11,6 +11,11 @@ https://github.com/osandov/drgn.
infl The number of currently in-flight work items. infl The number of currently in-flight work items.
CPUitsv The number of times a concurrency-managed work item hogged CPU
longer than the threshold (workqueue.cpu_intensive_thresh_us)
and got excluded from concurrency management to avoid stalling
other work items.
CMwake The number of concurrency-management wake-ups while executing a CMwake The number of concurrency-management wake-ups while executing a
work item of the workqueue. work item of the workqueue.
@ -53,6 +58,7 @@ WQ_MEM_RECLAIM = prog['WQ_MEM_RECLAIM']
PWQ_STAT_STARTED = prog['PWQ_STAT_STARTED'] # work items started execution PWQ_STAT_STARTED = prog['PWQ_STAT_STARTED'] # work items started execution
PWQ_STAT_COMPLETED = prog['PWQ_STAT_COMPLETED'] # work items completed execution PWQ_STAT_COMPLETED = prog['PWQ_STAT_COMPLETED'] # work items completed execution
PWQ_STAT_CPU_INTENSIVE = prog['PWQ_STAT_CPU_INTENSIVE'] # wq_cpu_intensive_thresh_us violations
PWQ_STAT_CM_WAKEUP = prog['PWQ_STAT_CM_WAKEUP'] # concurrency-management worker wakeups PWQ_STAT_CM_WAKEUP = prog['PWQ_STAT_CM_WAKEUP'] # concurrency-management worker wakeups
PWQ_STAT_MAYDAY = prog['PWQ_STAT_MAYDAY'] # maydays to rescuer PWQ_STAT_MAYDAY = prog['PWQ_STAT_MAYDAY'] # maydays to rescuer
PWQ_STAT_RESCUED = prog['PWQ_STAT_RESCUED'] # linked work items executed by rescuer PWQ_STAT_RESCUED = prog['PWQ_STAT_RESCUED'] # linked work items executed by rescuer
@ -75,19 +81,23 @@ class WqStats:
'mem_reclaim' : self.mem_reclaim, 'mem_reclaim' : self.mem_reclaim,
'started' : self.stats[PWQ_STAT_STARTED], 'started' : self.stats[PWQ_STAT_STARTED],
'completed' : self.stats[PWQ_STAT_COMPLETED], 'completed' : self.stats[PWQ_STAT_COMPLETED],
'cpu_intensive' : self.stats[PWQ_STAT_CPU_INTENSIVE],
'cm_wakeup' : self.stats[PWQ_STAT_CM_WAKEUP], 'cm_wakeup' : self.stats[PWQ_STAT_CM_WAKEUP],
'mayday' : self.stats[PWQ_STAT_MAYDAY], 'mayday' : self.stats[PWQ_STAT_MAYDAY],
'rescued' : self.stats[PWQ_STAT_RESCUED], } 'rescued' : self.stats[PWQ_STAT_RESCUED], }
def table_header_str(): def table_header_str():
return f'{"":>24} {"total":>8} {"infl":>5} {"CMwake":>7} {"mayday":>7} {"rescued":>7}' return f'{"":>24} {"total":>8} {"infl":>5} '\
f'{"CPUitsv":>7} {"CMwake":>7} {"mayday":>7} {"rescued":>7}'
def table_row_str(self): def table_row_str(self):
cpu_intensive = '-'
cm_wakeup = '-' cm_wakeup = '-'
mayday = '-' mayday = '-'
rescued = '-' rescued = '-'
if not self.unbound: if not self.unbound:
cpu_intensive = str(self.stats[PWQ_STAT_CPU_INTENSIVE])
cm_wakeup = str(self.stats[PWQ_STAT_CM_WAKEUP]) cm_wakeup = str(self.stats[PWQ_STAT_CM_WAKEUP])
if self.mem_reclaim: if self.mem_reclaim:
@ -97,6 +107,7 @@ class WqStats:
out = f'{self.name[-24:]:24} ' \ out = f'{self.name[-24:]:24} ' \
f'{self.stats[PWQ_STAT_STARTED]:8} ' \ f'{self.stats[PWQ_STAT_STARTED]:8} ' \
f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \ f'{max(self.stats[PWQ_STAT_STARTED] - self.stats[PWQ_STAT_COMPLETED], 0):5} ' \
f'{cpu_intensive:>7} ' \
f'{cm_wakeup:>7} ' \ f'{cm_wakeup:>7} ' \
f'{mayday:>7} ' \ f'{mayday:>7} ' \
f'{rescued:>7} ' f'{rescued:>7} '