Merge tag 'sched_urgent_for_v5.11_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Borislav Petkov:

 - Correct the marking of kthreads which are supposed to run on a
   specific, single CPU vs such which are affine to only one CPU, mark
   per-cpu workqueue threads as such and make sure that marking
   "survives" CPU hotplug. Fix CPU hotplug issues with such kthreads.

 - A fix to not push away tasks on CPUs coming online.

 - Have workqueue CPU hotplug code use cpu_possible_mask when breaking
   affinity on CPU offlining so that pending workers can finish on
   newly arrived onlined CPUs too.

 - Dump tasks which haven't vacated a CPU which is currently being
   unplugged.

 - Register a special scale invariance callback which gets called on
   resume from RAM to read out APERF/MPERF after resume and thus make
   the schedutil scaling governor more precise.

* tag 'sched_urgent_for_v5.11_rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched: Relax the set_cpus_allowed_ptr() semantics
  sched: Fix CPU hotplug / tighten is_per_cpu_kthread()
  sched: Prepare to use balance_push in ttwu()
  workqueue: Restrict affinity change to rescuer
  workqueue: Tag bound workers with KTHREAD_IS_PER_CPU
  kthread: Extract KTHREAD_IS_PER_CPU
  sched: Don't run cpu-online with balance_push() enabled
  workqueue: Use cpu_possible_mask instead of cpu_active_mask to break affinity
  sched/core: Print out straggler tasks in sched_cpu_dying()
  x86: PM: Register syscore_ops for scale invariance
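As a rough illustration (not part of the merge itself), the sketch below shows how a subsystem that owns a strictly per-CPU kthread would be expected to use the helpers this series introduces, kthread_set_per_cpu() and kthread_is_per_cpu(), mirroring what kernel/smpboot.c and kernel/workqueue.c do in the diff further down. The my_worker_fn / start_pinned_worker names and the "my_worker/%u" thread name are placeholders, not anything added by these patches:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int my_worker_fn(void *data)
{
        set_current_state(TASK_INTERRUPTIBLE);
        while (!kthread_should_stop()) {
                /* strictly per-CPU work would run here */
                schedule();
                set_current_state(TASK_INTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);
        return 0;
}

static struct task_struct *start_pinned_worker(unsigned int cpu)
{
        struct task_struct *tsk;

        tsk = kthread_create_on_cpu(my_worker_fn, NULL, cpu, "my_worker/%u");
        if (IS_ERR(tsk))
                return tsk;

        /*
         * Mark the thread as genuinely per-CPU, not merely affine to
         * one CPU, so hotplug re-binds it on unpark instead of pushing
         * it away (kthread_create_on_cpu() already sets
         * PF_NO_SETAFFINITY via kthread_bind()).
         */
        kthread_set_per_cpu(tsk, cpu);
        wake_up_process(tsk);
        return tsk;
}

The scheduler side below then treats kthread_is_per_cpu() tasks specially: they may stay on an online-but-inactive CPU during hotplug, while ordinary kernel threads get pushed away by balance_push().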
commit 24c56ee06c
arch/x86/kernel/smpboot.c

@@ -56,6 +56,7 @@
 #include <linux/numa.h>
 #include <linux/pgtable.h>
 #include <linux/overflow.h>
+#include <linux/syscore_ops.h>
 
 #include <asm/acpi.h>
 #include <asm/desc.h>

@@ -2083,6 +2084,23 @@ static void init_counter_refs(void)
         this_cpu_write(arch_prev_mperf, mperf);
 }
 
+#ifdef CONFIG_PM_SLEEP
+static struct syscore_ops freq_invariance_syscore_ops = {
+        .resume = init_counter_refs,
+};
+
+static void register_freq_invariance_syscore_ops(void)
+{
+        /* Bail out if registered already. */
+        if (freq_invariance_syscore_ops.node.prev)
+                return;
+
+        register_syscore_ops(&freq_invariance_syscore_ops);
+}
+#else
+static inline void register_freq_invariance_syscore_ops(void) {}
+#endif
+
 static void init_freq_invariance(bool secondary, bool cppc_ready)
 {
         bool ret = false;

@@ -2109,6 +2127,7 @@ static void init_freq_invariance(bool secondary, bool cppc_ready)
         if (ret) {
                 init_counter_refs();
                 static_branch_enable(&arch_scale_freq_key);
+                register_freq_invariance_syscore_ops();
                 pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
         } else {
                 pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");
include/linux/kthread.h

@@ -33,6 +33,9 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
                                           unsigned int cpu,
                                           const char *namefmt);
 
+void kthread_set_per_cpu(struct task_struct *k, int cpu);
+bool kthread_is_per_cpu(struct task_struct *k);
+
 /**
  * kthread_run - create and wake a thread.
  * @threadfn: the function to run until signal_pending(current).
kernel/kthread.c

@@ -493,11 +493,36 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
                 return p;
         kthread_bind(p, cpu);
         /* CPU hotplug need to bind once again when unparking the thread. */
-        set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags);
         to_kthread(p)->cpu = cpu;
         return p;
 }
 
+void kthread_set_per_cpu(struct task_struct *k, int cpu)
+{
+        struct kthread *kthread = to_kthread(k);
+        if (!kthread)
+                return;
+
+        WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));
+
+        if (cpu < 0) {
+                clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
+                return;
+        }
+
+        kthread->cpu = cpu;
+        set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
+}
+
+bool kthread_is_per_cpu(struct task_struct *k)
+{
+        struct kthread *kthread = to_kthread(k);
+        if (!kthread)
+                return false;
+
+        return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
+}
+
 /**
  * kthread_unpark - unpark a thread created by kthread_create().
  * @k: thread created by kthread_create().
kernel/sched/core.c

@@ -1796,13 +1796,28 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
  */
 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
 {
         /* When not in the task's cpumask, no point in looking further. */
         if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                 return false;
 
-        if (is_per_cpu_kthread(p) || is_migration_disabled(p))
+        /* migrate_disabled() must be allowed to finish. */
+        if (is_migration_disabled(p))
                 return cpu_online(cpu);
 
-        return cpu_active(cpu);
+        /* Non kernel threads are not allowed during either online or offline. */
+        if (!(p->flags & PF_KTHREAD))
+                return cpu_active(cpu);
+
+        /* KTHREAD_IS_PER_CPU is always allowed. */
+        if (kthread_is_per_cpu(p))
+                return cpu_online(cpu);
+
+        /* Regular kernel threads don't get to stay during offline. */
+        if (cpu_rq(cpu)->balance_push)
+                return false;
+
+        /* But are allowed during online. */
+        return cpu_online(cpu);
 }
 
 /*
@@ -2327,7 +2342,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 
         if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
                 /*
-                 * Kernel threads are allowed on online && !active CPUs.
+                 * Kernel threads are allowed on online && !active CPUs,
+                 * however, during cpu-hot-unplug, even these might get pushed
+                 * away if not KTHREAD_IS_PER_CPU.
                  *
                  * Specifically, migration_disabled() tasks must not fail the
                  * cpumask_any_and_distribute() pick below, esp. so on

@@ -2371,16 +2388,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 
         __do_set_cpus_allowed(p, new_mask, flags);
 
-        if (p->flags & PF_KTHREAD) {
-                /*
-                 * For kernel threads that do indeed end up on online &&
-                 * !active we want to ensure they are strict per-CPU threads.
-                 */
-                WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
-                        !cpumask_intersects(new_mask, cpu_active_mask) &&
-                        p->nr_cpus_allowed != 1);
-        }
-
         return affine_move_task(rq, p, &rf, dest_cpu, flags);
 
 out:
@@ -3121,6 +3128,13 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 
 static inline bool ttwu_queue_cond(int cpu, int wake_flags)
 {
+        /*
+         * Do not complicate things with the async wake_list while the CPU is
+         * in hotplug state.
+         */
+        if (!cpu_active(cpu))
+                return false;
+
         /*
          * If the CPU does not share cache, then queue the task on the
          * remote rqs wakelist to avoid accessing remote data.
@@ -7276,8 +7290,14 @@ static void balance_push(struct rq *rq)
         /*
          * Both the cpu-hotplug and stop task are in this case and are
          * required to complete the hotplug process.
+         *
+         * XXX: the idle task does not match kthread_is_per_cpu() due to
+         * histerical raisins.
          */
-        if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) {
+        if (rq->idle == push_task ||
+            ((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) ||
+            is_migration_disabled(push_task)) {
+
                 /*
                  * If this is the idle task on the outgoing CPU try to wake
                  * up the hotplug control thread which might wait for the

@@ -7309,7 +7329,7 @@ static void balance_push(struct rq *rq)
         /*
          * At this point need_resched() is true and we'll take the loop in
          * schedule(). The next pick is obviously going to be the stop task
-         * which is_per_cpu_kthread() and will push this task away.
+         * which kthread_is_per_cpu() and will push this task away.
          */
         raw_spin_lock(&rq->lock);
 }
@@ -7320,10 +7340,13 @@ static void balance_push_set(int cpu, bool on)
         struct rq_flags rf;
 
         rq_lock_irqsave(rq, &rf);
-        if (on)
+        rq->balance_push = on;
+        if (on) {
+                WARN_ON_ONCE(rq->balance_callback);
                 rq->balance_callback = &balance_push_callback;
-        else
+        } else if (rq->balance_callback == &balance_push_callback) {
                 rq->balance_callback = NULL;
+        }
         rq_unlock_irqrestore(rq, &rf);
 }
@@ -7441,6 +7464,10 @@ int sched_cpu_activate(unsigned int cpu)
         struct rq *rq = cpu_rq(cpu);
         struct rq_flags rf;
 
+        /*
+         * Make sure that when the hotplug state machine does a roll-back
+         * we clear balance_push. Ideally that would happen earlier...
+         */
+        balance_push_set(cpu, false);
+
 #ifdef CONFIG_SCHED_SMT
@@ -7483,17 +7510,27 @@ int sched_cpu_deactivate(unsigned int cpu)
         int ret;
 
         set_cpu_active(cpu, false);
+
+        /*
+         * From this point forward, this CPU will refuse to run any task that
+         * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
+         * push those tasks away until this gets cleared, see
+         * sched_cpu_dying().
+         */
+        balance_push_set(cpu, true);
+
         /*
-         * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
-         * users of this state to go away such that all new such users will
-         * observe it.
+         * We've cleared cpu_active_mask / set balance_push, wait for all
+         * preempt-disabled and RCU users of this state to go away such that
+         * all new such users will observe it.
+         *
+         * Specifically, we rely on ttwu to no longer target this CPU, see
+         * ttwu_queue_cond() and is_cpu_allowed().
          *
          * Do sync before park smpboot threads to take care the rcu boost case.
          */
         synchronize_rcu();
 
-        balance_push_set(cpu, true);
-
         rq_lock_irqsave(rq, &rf);
         if (rq->rd) {
                 update_rq_clock(rq);
@@ -7574,6 +7611,25 @@ static void calc_load_migrate(struct rq *rq)
         atomic_long_add(delta, &calc_load_tasks);
 }
 
+static void dump_rq_tasks(struct rq *rq, const char *loglvl)
+{
+        struct task_struct *g, *p;
+        int cpu = cpu_of(rq);
+
+        lockdep_assert_held(&rq->lock);
+
+        printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running);
+        for_each_process_thread(g, p) {
+                if (task_cpu(p) != cpu)
+                        continue;
+
+                if (!task_on_rq_queued(p))
+                        continue;
+
+                printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm);
+        }
+}
+
 int sched_cpu_dying(unsigned int cpu)
 {
         struct rq *rq = cpu_rq(cpu);

@@ -7583,9 +7639,18 @@ int sched_cpu_dying(unsigned int cpu)
         sched_tick_stop(cpu);
 
         rq_lock_irqsave(rq, &rf);
-        BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq));
+        if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) {
+                WARN(true, "Dying CPU not properly vacated!");
+                dump_rq_tasks(rq, KERN_WARNING);
+        }
         rq_unlock_irqrestore(rq, &rf);
 
+        /*
+         * Now that the CPU is offline, make sure we're welcome
+         * to new tasks once we come back up.
+         */
+        balance_push_set(cpu, false);
+
         calc_load_migrate(rq);
         update_max_interval();
         nohz_balance_exit_idle(rq);
kernel/sched/sched.h

@@ -975,6 +975,7 @@ struct rq {
         unsigned long           cpu_capacity_orig;
 
         struct callback_head    *balance_callback;
+        unsigned char           balance_push;
 
         unsigned char           nohz_idle_balance;
         unsigned char           idle_balance;
kernel/smpboot.c

@@ -188,6 +188,7 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
                 kfree(td);
                 return PTR_ERR(tsk);
         }
+        kthread_set_per_cpu(tsk, cpu);
         /*
          * Park the thread so that it could start right on the CPU
          * when it is available.
kernel/workqueue.c

@@ -1848,12 +1848,6 @@ static void worker_attach_to_pool(struct worker *worker,
 {
         mutex_lock(&wq_pool_attach_mutex);
 
-        /*
-         * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
-         * online CPUs. It'll be re-applied when any of the CPUs come up.
-         */
-        set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
-
         /*
          * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
          * stable across this function. See the comments above the flag

@@ -1861,6 +1855,11 @@ static void worker_attach_to_pool(struct worker *worker,
          */
         if (pool->flags & POOL_DISASSOCIATED)
                 worker->flags |= WORKER_UNBOUND;
+        else
+                kthread_set_per_cpu(worker->task, pool->cpu);
+
+        if (worker->rescue_wq)
+                set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
 
         list_add_tail(&worker->node, &pool->workers);
         worker->pool = pool;

@@ -1883,6 +1882,7 @@ static void worker_detach_from_pool(struct worker *worker)
 
         mutex_lock(&wq_pool_attach_mutex);
 
+        kthread_set_per_cpu(worker->task, -1);
         list_del(&worker->node);
         worker->pool = NULL;
 
@@ -4919,8 +4919,10 @@ static void unbind_workers(int cpu)
                 raw_spin_unlock_irq(&pool->lock);
 
-                for_each_pool_worker(worker, pool)
-                        WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);
+                for_each_pool_worker(worker, pool) {
+                        kthread_set_per_cpu(worker->task, -1);
+                        WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+                }
 
                 mutex_unlock(&wq_pool_attach_mutex);

@@ -4972,9 +4974,11 @@ static void rebind_workers(struct worker_pool *pool)
          * of all workers first and then clear UNBOUND. As we're called
          * from CPU_ONLINE, the following shouldn't fail.
          */
-        for_each_pool_worker(worker, pool)
+        for_each_pool_worker(worker, pool) {
+                kthread_set_per_cpu(worker->task, pool->cpu);
                 WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
                                                   pool->attrs->cpumask) < 0);
+        }
 
         raw_spin_lock_irq(&pool->lock);