Merge branch 'sched/core' into core/mm
Pull the migrate disable mechanics, which is a prerequisite for preemptible kmap_local().
commit 13c8da5db4
@@ -65,21 +65,17 @@ of the SMP domain will span the entire machine, with each group having the
cpumask of a node. Or, you could do multi-level NUMA or Opteron, for example,
might have just one domain covering its one NUMA level.

The implementor should read comments in include/linux/sched.h:
struct sched_domain fields, SD_FLAG_*, SD_*_INIT to get an idea of
the specifics and what to tune.
The implementor should read comments in include/linux/sched/sd_flags.h:
SD_* to get an idea of the specifics and what to tune for the SD flags
of a sched_domain.

Architectures may retain the regular override the default SD_*_INIT flags
while using the generic domain builder in kernel/sched/core.c if they wish to
retain the traditional SMT->SMP->NUMA topology (or some subset of that). This
can be done by #define'ing ARCH_HASH_SCHED_TUNE.

Alternatively, the architecture may completely override the generic domain
builder by #define'ing ARCH_HASH_SCHED_DOMAIN, and exporting your
arch_init_sched_domains function. This function will attach domains to all
CPUs using cpu_attach_domain.
Architectures may override the generic domain builder and the default SD flags
for a given topology level by creating a sched_domain_topology_level array and
calling set_sched_topology() with this array as the parameter.

The sched-domains debugging infrastructure can be enabled by enabling
CONFIG_SCHED_DEBUG. This enables an error checking parse of the sched domains
which should catch most possible errors (described above). It also prints out
the domain structure in a visual format.
CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to
tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug
knob. This enables an error checking parse of the sched domains which should
catch most possible errors (described above). It also prints out the domain
structure in a visual format.
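To make the new interface concrete: an architecture that wants its own topology registers a sched_domain_topology_level table. A minimal sketch, modeled on the default table in kernel/sched/topology.c (the my_arch_* names are illustrative, not taken from any in-tree architecture):

static struct sched_domain_topology_level my_arch_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

static void __init my_arch_register_topology(void)
{
	/* Replace the default SMT->MC->DIE table for all CPUs. */
	set_sched_topology(my_arch_topology);
}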
@ -213,6 +213,7 @@ static DEFINE_STATIC_KEY_FALSE(amu_fie_key);
|
|||
|
||||
static int __init init_amu_fie(void)
|
||||
{
|
||||
bool invariance_status = topology_scale_freq_invariant();
|
||||
cpumask_var_t valid_cpus;
|
||||
bool have_policy = false;
|
||||
int ret = 0;
|
||||
|
@ -255,6 +256,15 @@ static int __init init_amu_fie(void)
|
|||
if (!topology_scale_freq_invariant())
|
||||
static_branch_disable(&amu_fie_key);
|
||||
|
||||
/*
|
||||
* Task scheduler behavior depends on frequency invariance support,
|
||||
* either cpufreq or counter driven. If the support status changes as
|
||||
* a result of counter initialisation and use, retrigger the build of
|
||||
* scheduling domains to ensure the information is propagated properly.
|
||||
*/
|
||||
if (invariance_status != topology_scale_freq_invariant())
|
||||
rebuild_sched_domains_energy();
|
||||
|
||||
free_valid_mask:
|
||||
free_cpumask_var(valid_cpus);
|
||||
|
||||
|
|
|
@ -382,9 +382,9 @@ static inline void task_context_switch_counts(struct seq_file *m,
|
|||
static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
|
||||
{
|
||||
seq_printf(m, "Cpus_allowed:\t%*pb\n",
|
||||
cpumask_pr_args(task->cpus_ptr));
|
||||
cpumask_pr_args(&task->cpus_mask));
|
||||
seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
|
||||
cpumask_pr_args(task->cpus_ptr));
|
||||
cpumask_pr_args(&task->cpus_mask));
|
||||
}
|
||||
|
||||
static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
|
||||
|
|
|
@ -152,6 +152,7 @@ enum cpuhp_state {
|
|||
CPUHP_AP_ONLINE,
|
||||
CPUHP_TEARDOWN_CPU,
|
||||
CPUHP_AP_ONLINE_IDLE,
|
||||
CPUHP_AP_SCHED_WAIT_EMPTY,
|
||||
CPUHP_AP_SMPBOOT_THREADS,
|
||||
CPUHP_AP_X86_VDSO_VMA_ONLINE,
|
||||
CPUHP_AP_IRQ_AFFINITY_ONLINE,
|
||||
|
|
|
@ -199,6 +199,11 @@ static inline int cpumask_any_and_distribute(const struct cpumask *src1p,
|
|||
return cpumask_next_and(-1, src1p, src2p);
|
||||
}
|
||||
|
||||
static inline int cpumask_any_distribute(const struct cpumask *srcp)
|
||||
{
|
||||
return cpumask_first(srcp);
|
||||
}
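For context, the UP stubs above are trivial; on SMP a "distribute" variant is meant to rotate its answer between calls, so that repeated picks (e.g. repeated pushes) do not always land on the same CPU. A minimal sketch of that idea, assuming a per-CPU cursor (the variable name is illustrative and this is not necessarily the in-tree lib/cpumask.c code):

static DEFINE_PER_CPU(int, distribute_prev_cpu);

int cpumask_any_distribute(const struct cpumask *srcp)
{
	int next, prev;

	/* Resume the scan after the CPU handed out by the previous call. */
	prev = __this_cpu_read(distribute_prev_cpu);
	next = cpumask_next(prev, srcp);
	if (next >= nr_cpu_ids)
		next = cpumask_first(srcp);

	if (next < nr_cpu_ids)
		__this_cpu_write(distribute_prev_cpu, next);

	return next;
}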
|
||||
|
||||
#define for_each_cpu(cpu, mask) \
|
||||
for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask)
|
||||
#define for_each_cpu_not(cpu, mask) \
|
||||
|
@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
|
|||
unsigned int cpumask_local_spread(unsigned int i, int node);
|
||||
int cpumask_any_and_distribute(const struct cpumask *src1p,
|
||||
const struct cpumask *src2p);
|
||||
int cpumask_any_distribute(const struct cpumask *srcp);
|
||||
|
||||
/**
|
||||
* for_each_cpu - iterate over every cpu in a mask
|
||||
|
|
|
@ -204,6 +204,7 @@ extern int _cond_resched(void);
|
|||
extern void ___might_sleep(const char *file, int line, int preempt_offset);
|
||||
extern void __might_sleep(const char *file, int line, int preempt_offset);
|
||||
extern void __cant_sleep(const char *file, int line, int preempt_offset);
|
||||
extern void __cant_migrate(const char *file, int line);
|
||||
|
||||
/**
|
||||
* might_sleep - annotation for functions that can sleep
|
||||
|
@ -227,6 +228,18 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
|
|||
# define cant_sleep() \
|
||||
do { __cant_sleep(__FILE__, __LINE__, 0); } while (0)
|
||||
# define sched_annotate_sleep() (current->task_state_change = 0)
|
||||
|
||||
/**
|
||||
* cant_migrate - annotation for functions that cannot migrate
|
||||
*
|
||||
* Will print a stack trace if executed in code which is migratable
|
||||
*/
|
||||
# define cant_migrate() \
|
||||
do { \
|
||||
if (IS_ENABLED(CONFIG_SMP)) \
|
||||
__cant_migrate(__FILE__, __LINE__); \
|
||||
} while (0)
|
||||
|
||||
/**
|
||||
* non_block_start - annotate the start of section where sleeping is prohibited
|
||||
*
|
||||
|
@ -251,6 +264,7 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
|
|||
int preempt_offset) { }
|
||||
# define might_sleep() do { might_resched(); } while (0)
|
||||
# define cant_sleep() do { } while (0)
|
||||
# define cant_migrate() do { } while (0)
|
||||
# define sched_annotate_sleep() do { } while (0)
|
||||
# define non_block_start() do { } while (0)
|
||||
# define non_block_end() do { } while (0)
|
||||
|
@ -258,13 +272,6 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset);
|
|||
|
||||
#define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0)
|
||||
|
||||
#ifndef CONFIG_PREEMPT_RT
|
||||
# define cant_migrate() cant_sleep()
|
||||
#else
|
||||
/* Placeholder for now */
|
||||
# define cant_migrate() do { } while (0)
|
||||
#endif
|
||||
|
||||
/**
|
||||
* abs - return absolute value of an argument
|
||||
* @x: the value. If it is unsigned type, it is converted to signed type first.
|
||||
|
|
|
@@ -322,34 +322,71 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier,

#endif

/**
 * migrate_disable - Prevent migration of the current task
 *
 * Maps to preempt_disable() which also disables preemption. Use
 * migrate_disable() to annotate that the intent is to prevent migration,
 * but not necessarily preemption.
 *
 * Can be invoked nested like preempt_disable() and needs the corresponding
 * number of migrate_enable() invocations.
 */
static __always_inline void migrate_disable(void)
{
	preempt_disable();
}
#ifdef CONFIG_SMP

/**
 * migrate_enable - Allow migration of the current task
/*
 * Migrate-Disable and why it is undesired.
 *
 * Counterpart to migrate_disable().
 * When a preempted task becomes eligible to run under the ideal model (IOW it
 * becomes one of the M highest priority tasks), it might still have to wait
 * for the preemptee's migrate_disable() section to complete, thereby suffering
 * a reduction in bandwidth in the exact duration of the migrate_disable()
 * section.
 *
 * As migrate_disable() can be invoked nested, only the outermost invocation
 * reenables migration.
 * Per this argument, the change from preempt_disable() to migrate_disable()
 * gets us:
 *
 * - a higher priority task gains reduced wake-up latency; with preempt_disable()
 *   it would have had to wait for the lower priority task.
 *
 * - a lower priority task, which under preempt_disable() could've instantly
 *   migrated away when another CPU becomes available, is now constrained
 *   by the ability to push the higher priority task away, which might itself be
 *   in a migrate_disable() section, reducing its available bandwidth.
 *
 * IOW it trades latency / moves the interference term, but it stays in the
 * system, and as long as it remains unbounded, the system is not fully
 * deterministic.
 *
 *
 * The reason we have it anyway.
 *
 * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a
 * number of primitives into becoming preemptible, they would also allow
 * migration. This turns out to break a bunch of per-cpu usage. To this end,
 * all these primitives employ migrate_disable() to restore this implicit
 * assumption.
 *
 * This is a 'temporary' work-around at best. The correct solution is getting
 * rid of the above assumptions and reworking the code to employ explicit
 * per-cpu locking or short preempt-disable regions.
 *
 * The end goal must be to get rid of migrate_disable(), alternatively we need
 * a schedulability theory that does not depend on arbitrary migration.
 *
 *
 * Notes on the implementation.
 *
 * The implementation is particularly tricky since existing code patterns
 * dictate neither migrate_disable() nor migrate_enable() is allowed to block.
 * This means that it cannot use cpus_read_lock() to serialize against hotplug,
 * nor can it easily migrate itself into a pending affinity mask change on
 * migrate_enable().
 *
 *
 * Note: even non-work-conserving schedulers like semi-partitioned depend on
 *       migration, so migrate_disable() is not only a problem for
 *       work-conserving schedulers.
 *
 * Currently mapped to preempt_enable().
 */
static __always_inline void migrate_enable(void)
{
	preempt_enable();
}
extern void migrate_disable(void);
extern void migrate_enable(void);

#else

static inline void migrate_disable(void) { }
static inline void migrate_enable(void) { }

#endif /* CONFIG_SMP */

#endif /* __LINUX_PREEMPT_H */
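A usage sketch of the new SMP interface above (illustrative only; my_stats and my_stats_add are made-up names): migrate_disable() keeps this_cpu_ptr() stable across a preemptible section, while exclusion against other tasks on the same CPU still has to be provided separately (e.g. by a local lock):

struct my_stats {
	u64	packets;
	u64	bytes;
};
static DEFINE_PER_CPU(struct my_stats, my_stats);

static void my_stats_add(u64 bytes)
{
	struct my_stats *s;

	migrate_disable();		/* pinned to this CPU, but still preemptible */
	s = this_cpu_ptr(&my_stats);
	cant_migrate();			/* debug assertion introduced earlier in this series */
	s->packets++;
	s->bytes += bytes;
	migrate_enable();
}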
@ -714,6 +714,11 @@ struct task_struct {
|
|||
int nr_cpus_allowed;
|
||||
const cpumask_t *cpus_ptr;
|
||||
cpumask_t cpus_mask;
|
||||
void *migration_pending;
|
||||
#ifdef CONFIG_SMP
|
||||
unsigned short migration_disabled;
|
||||
#endif
|
||||
unsigned short migration_flags;
|
||||
|
||||
#ifdef CONFIG_PREEMPT_RCU
|
||||
int rcu_read_lock_nesting;
|
||||
|
|
|
@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu);
|
|||
extern int sched_cpu_deactivate(unsigned int cpu);
|
||||
|
||||
#ifdef CONFIG_HOTPLUG_CPU
|
||||
extern int sched_cpu_wait_empty(unsigned int cpu);
|
||||
extern int sched_cpu_dying(unsigned int cpu);
|
||||
#else
|
||||
# define sched_cpu_wait_empty NULL
|
||||
# define sched_cpu_dying NULL
|
||||
#endif
|
||||
|
||||
|
|
|
@ -347,6 +347,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
|
|||
|
||||
extern void membarrier_exec_mmap(struct mm_struct *mm);
|
||||
|
||||
extern void membarrier_update_current_mm(struct mm_struct *next_mm);
|
||||
|
||||
#else
|
||||
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
|
||||
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
|
||||
|
@ -361,6 +363,9 @@ static inline void membarrier_exec_mmap(struct mm_struct *mm)
|
|||
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
|
||||
{
|
||||
}
|
||||
static inline void membarrier_update_current_mm(struct mm_struct *next_mm)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _LINUX_SCHED_MM_H */
|
||||
|
|
|
@ -225,6 +225,14 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
|
|||
|
||||
#endif /* !CONFIG_SMP */
|
||||
|
||||
#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
|
||||
extern void rebuild_sched_domains_energy(void);
|
||||
#else
|
||||
static inline void rebuild_sched_domains_energy(void)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef arch_scale_cpu_capacity
|
||||
/**
|
||||
* arch_scale_cpu_capacity - get the capacity scale factor of a given CPU.
|
||||
|
|
|
@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg);
|
|||
struct cpu_stop_work {
|
||||
struct list_head list; /* cpu_stopper->works */
|
||||
cpu_stop_fn_t fn;
|
||||
unsigned long caller;
|
||||
void *arg;
|
||||
struct cpu_stop_done *done;
|
||||
};
|
||||
|
@ -36,6 +37,8 @@ void stop_machine_park(int cpu);
|
|||
void stop_machine_unpark(int cpu);
|
||||
void stop_machine_yield(const struct cpumask *cpumask);
|
||||
|
||||
extern void print_stop_info(const char *log_lvl, struct task_struct *task);
|
||||
|
||||
#else /* CONFIG_SMP */
|
||||
|
||||
#include <linux/workqueue.h>
|
||||
|
@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu,
|
|||
return false;
|
||||
}
|
||||
|
||||
static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { }
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
/*
|
||||
|
|
|
@ -96,6 +96,8 @@ struct sched_param {
|
|||
* on a CPU with a capacity big enough to fit the specified value.
|
||||
* A task with a max utilization value smaller than 1024 is more likely
|
||||
* scheduled on a CPU with no more capacity than the specified value.
|
||||
*
|
||||
* A task utilization boundary can be reset by setting the attribute to -1.
|
||||
*/
|
||||
struct sched_attr {
|
||||
__u32 size;
|
||||
|
|
|
@ -983,25 +983,48 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
|
|||
*/
|
||||
static void rebuild_sched_domains_locked(void)
|
||||
{
|
||||
struct cgroup_subsys_state *pos_css;
|
||||
struct sched_domain_attr *attr;
|
||||
cpumask_var_t *doms;
|
||||
struct cpuset *cs;
|
||||
int ndoms;
|
||||
|
||||
lockdep_assert_cpus_held();
|
||||
percpu_rwsem_assert_held(&cpuset_rwsem);
|
||||
|
||||
/*
|
||||
* We have raced with CPU hotplug. Don't do anything to avoid
|
||||
* If we have raced with CPU hotplug, return early to avoid
|
||||
* passing doms with offlined cpu to partition_sched_domains().
|
||||
* Anyways, hotplug work item will rebuild sched domains.
|
||||
* Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
|
||||
*
|
||||
* With no CPUs in any subpartitions, top_cpuset's effective CPUs
|
||||
* should be the same as the active CPUs, so checking only top_cpuset
|
||||
* is enough to detect racing CPU offlines.
|
||||
*/
|
||||
if (!top_cpuset.nr_subparts_cpus &&
|
||||
!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
|
||||
return;
|
||||
|
||||
if (top_cpuset.nr_subparts_cpus &&
|
||||
!cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
|
||||
return;
|
||||
/*
|
||||
* With subpartition CPUs, however, the effective CPUs of a partition
|
||||
* root should be only a subset of the active CPUs. Since a CPU in any
|
||||
* partition root could be offlined, all must be checked.
|
||||
*/
|
||||
if (top_cpuset.nr_subparts_cpus) {
|
||||
rcu_read_lock();
|
||||
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
|
||||
if (!is_partition_root(cs)) {
|
||||
pos_css = css_rightmost_descendant(pos_css);
|
||||
continue;
|
||||
}
|
||||
if (!cpumask_subset(cs->effective_cpus,
|
||||
cpu_active_mask)) {
|
||||
rcu_read_unlock();
|
||||
return;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/* Generate domain masks and attrs */
|
||||
ndoms = generate_sched_domains(&doms, &attr);
|
||||
|
|
|
@ -1602,7 +1602,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
|
|||
.name = "ap:online",
|
||||
},
|
||||
/*
|
||||
* Handled on controll processor until the plugged processor manages
|
||||
* Handled on control processor until the plugged processor manages
|
||||
* this itself.
|
||||
*/
|
||||
[CPUHP_TEARDOWN_CPU] = {
|
||||
|
@ -1611,6 +1611,13 @@ static struct cpuhp_step cpuhp_hp_states[] = {
|
|||
.teardown.single = takedown_cpu,
|
||||
.cant_stop = true,
|
||||
},
|
||||
|
||||
[CPUHP_AP_SCHED_WAIT_EMPTY] = {
|
||||
.name = "sched:waitempty",
|
||||
.startup.single = NULL,
|
||||
.teardown.single = sched_cpu_wait_empty,
|
||||
},
|
||||
|
||||
/* Handle smpboot threads park/unpark */
|
||||
[CPUHP_AP_SMPBOOT_THREADS] = {
|
||||
.name = "smpboot/threads:online",
|
||||
|
|
|
@ -475,10 +475,24 @@ static void exit_mm(void)
|
|||
BUG_ON(mm != current->active_mm);
|
||||
/* more a memory barrier than a real lock */
|
||||
task_lock(current);
|
||||
/*
|
||||
* When a thread stops operating on an address space, the loop
|
||||
* in membarrier_private_expedited() may not observe that
|
||||
* tsk->mm, and the loop in membarrier_global_expedited() may
|
||||
* not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
|
||||
* rq->membarrier_state, so those would not issue an IPI.
|
||||
* Membarrier requires a memory barrier after accessing
|
||||
* user-space memory, before clearing tsk->mm or the
|
||||
* rq->membarrier_state.
|
||||
*/
|
||||
smp_mb__after_spinlock();
|
||||
local_irq_disable();
|
||||
current->mm = NULL;
|
||||
mmap_read_unlock(mm);
|
||||
membarrier_update_current_mm(NULL);
|
||||
enter_lazy_tlb(mm, current);
|
||||
local_irq_enable();
|
||||
task_unlock(current);
|
||||
mmap_read_unlock(mm);
|
||||
mm_update_next_owner(mm);
|
||||
mmput(mm);
|
||||
if (test_thread_flag(TIF_MEMDIE))
|
||||
|
|
|
@ -1248,6 +1248,7 @@ void kthread_use_mm(struct mm_struct *mm)
|
|||
tsk->active_mm = mm;
|
||||
}
|
||||
tsk->mm = mm;
|
||||
membarrier_update_current_mm(mm);
|
||||
switch_mm_irqs_off(active_mm, mm, tsk);
|
||||
local_irq_enable();
|
||||
task_unlock(tsk);
|
||||
|
@ -1255,8 +1256,19 @@ void kthread_use_mm(struct mm_struct *mm)
|
|||
finish_arch_post_lock_switch();
|
||||
#endif
|
||||
|
||||
/*
|
||||
* When a kthread starts operating on an address space, the loop
|
||||
* in membarrier_{private,global}_expedited() may not observe
|
||||
* that tsk->mm, and not issue an IPI. Membarrier requires a
|
||||
* memory barrier after storing to tsk->mm, before accessing
|
||||
* user-space memory. A full memory barrier for membarrier
|
||||
* {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
|
||||
* mmdrop(), or explicitly with smp_mb().
|
||||
*/
|
||||
if (active_mm != mm)
|
||||
mmdrop(active_mm);
|
||||
else
|
||||
smp_mb();
|
||||
|
||||
to_kthread(tsk)->oldfs = force_uaccess_begin();
|
||||
}
|
||||
|
@ -1276,9 +1288,18 @@ void kthread_unuse_mm(struct mm_struct *mm)
|
|||
force_uaccess_end(to_kthread(tsk)->oldfs);
|
||||
|
||||
task_lock(tsk);
|
||||
/*
|
||||
* When a kthread stops operating on an address space, the loop
|
||||
* in membarrier_{private,global}_expedited() may not observe
|
||||
* that tsk->mm, and not issue an IPI. Membarrier requires a
|
||||
* memory barrier after accessing user-space memory, before
|
||||
* clearing tsk->mm.
|
||||
*/
|
||||
smp_mb__after_spinlock();
|
||||
sync_mm_rss(mm);
|
||||
local_irq_disable();
|
||||
tsk->mm = NULL;
|
||||
membarrier_update_current_mm(NULL);
|
||||
/* active_mm is still 'mm' */
|
||||
enter_lazy_tlb(mm, tsk);
|
||||
local_irq_enable();
|
||||
|
|
kernel/sched/core.c: 1112 changes (file diff suppressed because it is too large)
|
@ -120,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
|
|||
const struct sched_dl_entity *dl_se = &p->dl;
|
||||
|
||||
if (later_mask &&
|
||||
cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) {
|
||||
cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) {
|
||||
unsigned long cap, max_cap = 0;
|
||||
int cpu, max_cpu = -1;
|
||||
|
||||
|
@ -151,7 +151,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
|
|||
|
||||
WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
|
||||
|
||||
if (cpumask_test_cpu(best_cpu, p->cpus_ptr) &&
|
||||
if (cpumask_test_cpu(best_cpu, &p->cpus_mask) &&
|
||||
dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
|
||||
if (later_mask)
|
||||
cpumask_set_cpu(best_cpu, later_mask);
|
||||
|
|
|
@ -901,16 +901,9 @@ struct cpufreq_governor *cpufreq_default_governor(void)
|
|||
cpufreq_governor_init(schedutil_gov);
|
||||
|
||||
#ifdef CONFIG_ENERGY_MODEL
|
||||
extern bool sched_energy_update;
|
||||
extern struct mutex sched_energy_mutex;
|
||||
|
||||
static void rebuild_sd_workfn(struct work_struct *work)
|
||||
{
|
||||
mutex_lock(&sched_energy_mutex);
|
||||
sched_energy_update = true;
|
||||
rebuild_sched_domains();
|
||||
sched_energy_update = false;
|
||||
mutex_unlock(&sched_energy_mutex);
|
||||
rebuild_sched_domains_energy();
|
||||
}
|
||||
static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@
|
|||
* This code tracks the priority of each CPU so that global migration
|
||||
* decisions are easy to calculate. Each CPU can be in a state as follows:
|
||||
*
|
||||
* (INVALID), IDLE, NORMAL, RT1, ... RT99
|
||||
* (INVALID), NORMAL, RT1, ... RT99, HIGHER
|
||||
*
|
||||
* going from the lowest priority to the highest. CPUs in the INVALID state
|
||||
* are not eligible for routing. The system maintains this state with
|
||||
|
@ -19,24 +19,48 @@
|
|||
* in that class). Therefore a typical application without affinity
|
||||
* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
|
||||
* searches). For tasks with affinity restrictions, the algorithm has a
|
||||
* worst case complexity of O(min(102, nr_domcpus)), though the scenario that
|
||||
* worst case complexity of O(min(101, nr_domcpus)), though the scenario that
|
||||
* yields the worst case search is fairly contrived.
|
||||
*/
|
||||
#include "sched.h"
|
||||
|
||||
/* Convert between a 140 based task->prio, and our 102 based cpupri */
|
||||
/*
|
||||
* p->rt_priority p->prio newpri cpupri
|
||||
*
|
||||
* -1 -1 (CPUPRI_INVALID)
|
||||
*
|
||||
* 99 0 (CPUPRI_NORMAL)
|
||||
*
|
||||
* 1 98 98 1
|
||||
* ...
|
||||
* 49 50 50 49
|
||||
* 50 49 49 50
|
||||
* ...
|
||||
* 99 0 0 99
|
||||
*
|
||||
* 100 100 (CPUPRI_HIGHER)
|
||||
*/
|
||||
static int convert_prio(int prio)
|
||||
{
|
||||
int cpupri;
|
||||
|
||||
if (prio == CPUPRI_INVALID)
|
||||
cpupri = CPUPRI_INVALID;
|
||||
else if (prio == MAX_PRIO)
|
||||
cpupri = CPUPRI_IDLE;
|
||||
else if (prio >= MAX_RT_PRIO)
|
||||
cpupri = CPUPRI_NORMAL;
|
||||
else
|
||||
cpupri = MAX_RT_PRIO - prio + 1;
|
||||
switch (prio) {
|
||||
case CPUPRI_INVALID:
|
||||
cpupri = CPUPRI_INVALID; /* -1 */
|
||||
break;
|
||||
|
||||
case 0 ... 98:
|
||||
cpupri = MAX_RT_PRIO-1 - prio; /* 1 ... 99 */
|
||||
break;
|
||||
|
||||
case MAX_RT_PRIO-1:
|
||||
cpupri = CPUPRI_NORMAL; /* 0 */
|
||||
break;
|
||||
|
||||
case MAX_RT_PRIO:
|
||||
cpupri = CPUPRI_HIGHER; /* 100 */
|
||||
break;
|
||||
}
|
||||
|
||||
return cpupri;
|
||||
}
|
||||
|
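A small self-check of the mapping documented above (illustrative only, not part of the patch); it spot-checks a few newpri values against the cpupri they are expected to map to:

static void __init cpupri_check_mapping(void)
{
	BUILD_BUG_ON(CPUPRI_NORMAL != 0);
	WARN_ON(convert_prio(CPUPRI_INVALID) != CPUPRI_INVALID);	/*   -1 ->  -1 */
	WARN_ON(convert_prio(MAX_RT_PRIO - 1) != CPUPRI_NORMAL);	/*   99 ->   0 */
	WARN_ON(convert_prio(98) != 1);					/*  RT1 ->   1 */
	WARN_ON(convert_prio(0) != MAX_RT_PRIO - 1);			/* RT99 ->  99 */
	WARN_ON(convert_prio(MAX_RT_PRIO) != CPUPRI_HIGHER);		/*  100 -> 100 */
}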
@ -73,11 +97,11 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
|
|||
if (skip)
|
||||
return 0;
|
||||
|
||||
if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids)
|
||||
if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
|
||||
return 0;
|
||||
|
||||
if (lowest_mask) {
|
||||
cpumask_and(lowest_mask, p->cpus_ptr, vec->mask);
|
||||
cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
|
||||
|
||||
/*
|
||||
* We have to ensure that we have at least one bit
|
||||
|
@ -177,7 +201,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
|
|||
* cpupri_set - update the CPU priority setting
|
||||
* @cp: The cpupri context
|
||||
* @cpu: The target CPU
|
||||
* @newpri: The priority (INVALID-RT99) to assign to this CPU
|
||||
* @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU
|
||||
*
|
||||
* Note: Assumes cpu_rq(cpu)->lock is locked
|
||||
*
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
|
||||
#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
|
||||
#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1)
|
||||
|
||||
#define CPUPRI_INVALID -1
|
||||
#define CPUPRI_IDLE 0
|
||||
#define CPUPRI_NORMAL 1
|
||||
/* values 2-101 are RT priorities 0-99 */
|
||||
#define CPUPRI_NORMAL 0
|
||||
/* values 1-99 are for RT1-RT99 priorities */
|
||||
#define CPUPRI_HIGHER 100
|
||||
|
||||
struct cpupri_vec {
|
||||
atomic_t count;
|
||||
|
|
|
@ -97,6 +97,17 @@ static inline unsigned long dl_bw_capacity(int i)
|
|||
return __dl_bw_capacity(i);
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool dl_bw_visited(int cpu, u64 gen)
|
||||
{
|
||||
struct root_domain *rd = cpu_rq(cpu)->rd;
|
||||
|
||||
if (rd->visit_gen == gen)
|
||||
return true;
|
||||
|
||||
rd->visit_gen = gen;
|
||||
return false;
|
||||
}
|
||||
#else
|
||||
static inline struct dl_bw *dl_bw_of(int i)
|
||||
{
|
||||
|
@ -112,6 +123,11 @@ static inline unsigned long dl_bw_capacity(int i)
|
|||
{
|
||||
return SCHED_CAPACITY_SCALE;
|
||||
}
|
||||
|
||||
static inline bool dl_bw_visited(int cpu, u64 gen)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline
|
||||
|
@ -543,7 +559,7 @@ static int push_dl_task(struct rq *rq);
|
|||
|
||||
static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
|
||||
{
|
||||
return dl_task(prev);
|
||||
return rq->online && dl_task(prev);
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(struct callback_head, dl_push_head);
|
||||
|
@ -1378,6 +1394,8 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
|
|||
|
||||
if (dl_rq->earliest_dl.curr == 0 ||
|
||||
dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
|
||||
if (dl_rq->earliest_dl.curr == 0)
|
||||
cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER);
|
||||
dl_rq->earliest_dl.curr = deadline;
|
||||
cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
|
||||
}
|
||||
|
@ -1395,6 +1413,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
|
|||
dl_rq->earliest_dl.curr = 0;
|
||||
dl_rq->earliest_dl.next = 0;
|
||||
cpudl_clear(&rq->rd->cpudl, rq->cpu);
|
||||
cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
|
||||
} else {
|
||||
struct rb_node *leftmost = dl_rq->root.rb_leftmost;
|
||||
struct sched_dl_entity *entry;
|
||||
|
@ -1664,13 +1683,13 @@ static void yield_task_dl(struct rq *rq)
|
|||
static int find_later_rq(struct task_struct *task);
|
||||
|
||||
static int
|
||||
select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
|
||||
select_task_rq_dl(struct task_struct *p, int cpu, int flags)
|
||||
{
|
||||
struct task_struct *curr;
|
||||
bool select_rq;
|
||||
struct rq *rq;
|
||||
|
||||
if (sd_flag != SD_BALANCE_WAKE)
|
||||
if (!(flags & WF_TTWU))
|
||||
goto out;
|
||||
|
||||
rq = cpu_rq(cpu);
|
||||
|
@ -1912,7 +1931,7 @@ static void task_fork_dl(struct task_struct *p)
|
|||
static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
|
||||
{
|
||||
if (!task_running(rq, p) &&
|
||||
cpumask_test_cpu(cpu, p->cpus_ptr))
|
||||
cpumask_test_cpu(cpu, &p->cpus_mask))
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
@ -2002,8 +2021,8 @@ static int find_later_rq(struct task_struct *task)
|
|||
return this_cpu;
|
||||
}
|
||||
|
||||
best_cpu = cpumask_first_and(later_mask,
|
||||
sched_domain_span(sd));
|
||||
best_cpu = cpumask_any_and_distribute(later_mask,
|
||||
sched_domain_span(sd));
|
||||
/*
|
||||
* Last chance: if a CPU being in both later_mask
|
||||
* and current sd span is valid, that becomes our
|
||||
|
@ -2025,7 +2044,7 @@ static int find_later_rq(struct task_struct *task)
|
|||
if (this_cpu != -1)
|
||||
return this_cpu;
|
||||
|
||||
cpu = cpumask_any(later_mask);
|
||||
cpu = cpumask_any_distribute(later_mask);
|
||||
if (cpu < nr_cpu_ids)
|
||||
return cpu;
|
||||
|
||||
|
@ -2062,7 +2081,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
|
|||
/* Retry if something changed. */
|
||||
if (double_lock_balance(rq, later_rq)) {
|
||||
if (unlikely(task_rq(task) != rq ||
|
||||
!cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) ||
|
||||
!cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) ||
|
||||
task_running(rq, task) ||
|
||||
!dl_task(task) ||
|
||||
!task_on_rq_queued(task))) {
|
||||
|
@ -2129,6 +2148,9 @@ static int push_dl_task(struct rq *rq)
|
|||
return 0;
|
||||
|
||||
retry:
|
||||
if (is_migration_disabled(next_task))
|
||||
return 0;
|
||||
|
||||
if (WARN_ON(next_task == rq->curr))
|
||||
return 0;
|
||||
|
||||
|
@ -2206,7 +2228,7 @@ static void push_dl_tasks(struct rq *rq)
|
|||
static void pull_dl_task(struct rq *this_rq)
|
||||
{
|
||||
int this_cpu = this_rq->cpu, cpu;
|
||||
struct task_struct *p;
|
||||
struct task_struct *p, *push_task;
|
||||
bool resched = false;
|
||||
struct rq *src_rq;
|
||||
u64 dmin = LONG_MAX;
|
||||
|
@ -2236,6 +2258,7 @@ static void pull_dl_task(struct rq *this_rq)
|
|||
continue;
|
||||
|
||||
/* Might drop this_rq->lock */
|
||||
push_task = NULL;
|
||||
double_lock_balance(this_rq, src_rq);
|
||||
|
||||
/*
|
||||
|
@ -2267,17 +2290,27 @@ static void pull_dl_task(struct rq *this_rq)
|
|||
src_rq->curr->dl.deadline))
|
||||
goto skip;
|
||||
|
||||
resched = true;
|
||||
|
||||
deactivate_task(src_rq, p, 0);
|
||||
set_task_cpu(p, this_cpu);
|
||||
activate_task(this_rq, p, 0);
|
||||
dmin = p->dl.deadline;
|
||||
if (is_migration_disabled(p)) {
|
||||
push_task = get_push_task(src_rq);
|
||||
} else {
|
||||
deactivate_task(src_rq, p, 0);
|
||||
set_task_cpu(p, this_cpu);
|
||||
activate_task(this_rq, p, 0);
|
||||
dmin = p->dl.deadline;
|
||||
resched = true;
|
||||
}
|
||||
|
||||
/* Is there any other task even earlier? */
|
||||
}
|
||||
skip:
|
||||
double_unlock_balance(this_rq, src_rq);
|
||||
|
||||
if (push_task) {
|
||||
raw_spin_unlock(&this_rq->lock);
|
||||
stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
|
||||
push_task, &src_rq->push_work);
|
||||
raw_spin_lock(&this_rq->lock);
|
||||
}
|
||||
}
|
||||
|
||||
if (resched)
|
||||
|
@ -2301,7 +2334,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
|
|||
}
|
||||
|
||||
static void set_cpus_allowed_dl(struct task_struct *p,
|
||||
const struct cpumask *new_mask)
|
||||
const struct cpumask *new_mask,
|
||||
u32 flags)
|
||||
{
|
||||
struct root_domain *src_rd;
|
||||
struct rq *rq;
|
||||
|
@ -2330,7 +2364,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
|
|||
raw_spin_unlock(&src_dl_b->lock);
|
||||
}
|
||||
|
||||
set_cpus_allowed_common(p, new_mask);
|
||||
set_cpus_allowed_common(p, new_mask, flags);
|
||||
}
|
||||
|
||||
/* Assumes rq->lock is held */
|
||||
|
@ -2503,8 +2537,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
|
|||
}
|
||||
}
|
||||
|
||||
const struct sched_class dl_sched_class
|
||||
__section("__dl_sched_class") = {
|
||||
DEFINE_SCHED_CLASS(dl) = {
|
||||
|
||||
.enqueue_task = enqueue_task_dl,
|
||||
.dequeue_task = dequeue_task_dl,
|
||||
.yield_task = yield_task_dl,
|
||||
|
@ -2523,6 +2557,7 @@ const struct sched_class dl_sched_class
|
|||
.rq_online = rq_online_dl,
|
||||
.rq_offline = rq_offline_dl,
|
||||
.task_woken = task_woken_dl,
|
||||
.find_lock_rq = find_lock_later_rq,
|
||||
#endif
|
||||
|
||||
.task_tick = task_tick_dl,
|
||||
|
@ -2535,33 +2570,39 @@ const struct sched_class dl_sched_class
|
|||
.update_curr = update_curr_dl,
|
||||
};
|
||||
|
||||
/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
|
||||
static u64 dl_generation;
|
||||
|
||||
int sched_dl_global_validate(void)
|
||||
{
|
||||
u64 runtime = global_rt_runtime();
|
||||
u64 period = global_rt_period();
|
||||
u64 new_bw = to_ratio(period, runtime);
|
||||
u64 gen = ++dl_generation;
|
||||
struct dl_bw *dl_b;
|
||||
int cpu, ret = 0;
|
||||
int cpu, cpus, ret = 0;
|
||||
unsigned long flags;
|
||||
|
||||
/*
|
||||
* Here we want to check the bandwidth not being set to some
|
||||
* value smaller than the currently allocated bandwidth in
|
||||
* any of the root_domains.
|
||||
*
|
||||
* FIXME: Cycling on all the CPUs is overdoing, but simpler than
|
||||
* cycling on root_domains... Discussion on different/better
|
||||
* solutions is welcome!
|
||||
*/
|
||||
for_each_possible_cpu(cpu) {
|
||||
rcu_read_lock_sched();
|
||||
|
||||
if (dl_bw_visited(cpu, gen))
|
||||
goto next;
|
||||
|
||||
dl_b = dl_bw_of(cpu);
|
||||
cpus = dl_bw_cpus(cpu);
|
||||
|
||||
raw_spin_lock_irqsave(&dl_b->lock, flags);
|
||||
if (new_bw < dl_b->total_bw)
|
||||
if (new_bw * cpus < dl_b->total_bw)
|
||||
ret = -EBUSY;
|
||||
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
|
||||
|
||||
next:
|
||||
rcu_read_unlock_sched();
|
||||
|
||||
if (ret)
|
||||
|
@ -2587,6 +2628,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
|
|||
void sched_dl_do_global(void)
|
||||
{
|
||||
u64 new_bw = -1;
|
||||
u64 gen = ++dl_generation;
|
||||
struct dl_bw *dl_b;
|
||||
int cpu;
|
||||
unsigned long flags;
|
||||
|
@ -2597,11 +2639,14 @@ void sched_dl_do_global(void)
|
|||
if (global_rt_runtime() != RUNTIME_INF)
|
||||
new_bw = to_ratio(global_rt_period(), global_rt_runtime());
|
||||
|
||||
/*
|
||||
* FIXME: As above...
|
||||
*/
|
||||
for_each_possible_cpu(cpu) {
|
||||
rcu_read_lock_sched();
|
||||
|
||||
if (dl_bw_visited(cpu, gen)) {
|
||||
rcu_read_unlock_sched();
|
||||
continue;
|
||||
}
|
||||
|
||||
dl_b = dl_bw_of(cpu);
|
||||
|
||||
raw_spin_lock_irqsave(&dl_b->lock, flags);
|
||||
|
|
|
@ -906,6 +906,15 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|||
if (!schedstat_enabled())
|
||||
return;
|
||||
|
||||
/*
|
||||
* When the sched_schedstat changes from 0 to 1, some sched se
|
||||
* maybe already in the runqueue, the se->statistics.wait_start
|
||||
* will be 0. That would make the computed delta wrong. We need to avoid this
|
||||
* scenario.
|
||||
*/
|
||||
if (unlikely(!schedstat_val(se->statistics.wait_start)))
|
||||
return;
|
||||
|
||||
delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
|
||||
|
||||
if (entity_is_task(se)) {
|
||||
|
@ -4779,25 +4788,37 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
|
|||
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
|
||||
/* throttled entity or throttle-on-deactivate */
|
||||
if (!se->on_rq)
|
||||
break;
|
||||
goto done;
|
||||
|
||||
if (dequeue) {
|
||||
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
|
||||
} else {
|
||||
update_load_avg(qcfs_rq, se, 0);
|
||||
se_update_runnable(se);
|
||||
}
|
||||
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
|
||||
|
||||
qcfs_rq->h_nr_running -= task_delta;
|
||||
qcfs_rq->idle_h_nr_running -= idle_task_delta;
|
||||
|
||||
if (qcfs_rq->load.weight)
|
||||
dequeue = 0;
|
||||
if (qcfs_rq->load.weight) {
|
||||
/* Avoid re-evaluating load for this entity: */
|
||||
se = parent_entity(se);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!se)
|
||||
sub_nr_running(rq, task_delta);
|
||||
for_each_sched_entity(se) {
|
||||
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
|
||||
/* throttled entity or throttle-on-deactivate */
|
||||
if (!se->on_rq)
|
||||
goto done;
|
||||
|
||||
update_load_avg(qcfs_rq, se, 0);
|
||||
se_update_runnable(se);
|
||||
|
||||
qcfs_rq->h_nr_running -= task_delta;
|
||||
qcfs_rq->idle_h_nr_running -= idle_task_delta;
|
||||
}
|
||||
|
||||
/* At this point se is NULL and we are at root level*/
|
||||
sub_nr_running(rq, task_delta);
|
||||
|
||||
done:
|
||||
/*
|
||||
* Note: distribution will already see us throttled via the
|
||||
* throttled-list. rq->lock protects completion.
|
||||
|
@ -5105,9 +5126,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
|
|||
return;
|
||||
|
||||
distribute_cfs_runtime(cfs_b);
|
||||
|
||||
raw_spin_lock_irqsave(&cfs_b->lock, flags);
|
||||
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -5804,6 +5822,9 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
|
|||
if (sync && cpu_rq(this_cpu)->nr_running == 1)
|
||||
return this_cpu;
|
||||
|
||||
if (available_idle_cpu(prev_cpu))
|
||||
return prev_cpu;
|
||||
|
||||
return nr_cpumask_bits;
|
||||
}
|
||||
|
||||
|
@ -6663,7 +6684,7 @@ fail:
|
|||
|
||||
/*
|
||||
* select_task_rq_fair: Select target runqueue for the waking task in domains
|
||||
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
|
||||
* that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
|
||||
* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
|
||||
*
|
||||
* Balances load by selecting the idlest CPU in the idlest group, or under
|
||||
|
@ -6674,15 +6695,17 @@ fail:
|
|||
* preempt must be disabled.
|
||||
*/
|
||||
static int
|
||||
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
|
||||
select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
|
||||
{
|
||||
int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
|
||||
struct sched_domain *tmp, *sd = NULL;
|
||||
int cpu = smp_processor_id();
|
||||
int new_cpu = prev_cpu;
|
||||
int want_affine = 0;
|
||||
int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
|
||||
/* SD_flags and WF_flags share the first nibble */
|
||||
int sd_flag = wake_flags & 0xF;
|
||||
|
||||
if (sd_flag & SD_BALANCE_WAKE) {
|
||||
if (wake_flags & WF_TTWU) {
|
||||
record_wakee(p);
|
||||
|
||||
if (sched_energy_enabled()) {
|
||||
|
@ -6719,9 +6742,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
|
|||
if (unlikely(sd)) {
|
||||
/* Slow path */
|
||||
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
|
||||
} else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
|
||||
} else if (wake_flags & WF_TTWU) { /* XXX always ? */
|
||||
/* Fast path */
|
||||
|
||||
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
|
||||
|
||||
if (want_affine)
|
||||
|
@ -10047,6 +10069,10 @@ static inline int find_new_ilb(void)
|
|||
|
||||
for_each_cpu_and(ilb, nohz.idle_cpus_mask,
|
||||
housekeeping_cpumask(HK_FLAG_MISC)) {
|
||||
|
||||
if (ilb == smp_processor_id())
|
||||
continue;
|
||||
|
||||
if (idle_cpu(ilb))
|
||||
return ilb;
|
||||
}
|
||||
|
@ -11158,8 +11184,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
|
|||
/*
|
||||
* All the scheduling class methods:
|
||||
*/
|
||||
const struct sched_class fair_sched_class
|
||||
__section("__fair_sched_class") = {
|
||||
DEFINE_SCHED_CLASS(fair) = {
|
||||
|
||||
.enqueue_task = enqueue_task_fair,
|
||||
.dequeue_task = dequeue_task_fair,
|
||||
.yield_task = yield_task_fair,
|
||||
|
|
|
@ -338,6 +338,7 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
|
|||
WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
|
||||
WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
|
||||
WARN_ON_ONCE(!duration_ns);
|
||||
WARN_ON_ONCE(current->mm);
|
||||
|
||||
rcu_sleep_check();
|
||||
preempt_disable();
|
||||
|
@ -375,7 +376,7 @@ void cpu_startup_entry(enum cpuhp_state state)
|
|||
|
||||
#ifdef CONFIG_SMP
|
||||
static int
|
||||
select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
|
||||
select_task_rq_idle(struct task_struct *p, int cpu, int flags)
|
||||
{
|
||||
return task_cpu(p); /* IDLE tasks are never migrated */
|
||||
}
|
||||
|
@ -457,8 +458,8 @@ static void update_curr_idle(struct rq *rq)
|
|||
/*
|
||||
* Simple, special scheduling class for the per-CPU idle tasks:
|
||||
*/
|
||||
const struct sched_class idle_sched_class
|
||||
__section("__idle_sched_class") = {
|
||||
DEFINE_SCHED_CLASS(idle) = {
|
||||
|
||||
/* no enqueue/yield_task for idle tasks */
|
||||
|
||||
/* dequeue is not valid, we print a debug message there: */
|
||||
|
|
|
@ -6,6 +6,134 @@
|
|||
*/
|
||||
#include "sched.h"
|
||||
|
||||
/*
|
||||
* For documentation purposes, here are some membarrier ordering
|
||||
* scenarios to keep in mind:
|
||||
*
|
||||
* A) Userspace thread execution after IPI vs membarrier's memory
|
||||
* barrier before sending the IPI
|
||||
*
|
||||
* Userspace variables:
|
||||
*
|
||||
* int x = 0, y = 0;
|
||||
*
|
||||
* The memory barrier at the start of membarrier() on CPU0 is necessary in
|
||||
* order to enforce the guarantee that any writes occurring on CPU0 before
|
||||
* the membarrier() is executed will be visible to any code executing on
|
||||
* CPU1 after the IPI-induced memory barrier:
|
||||
*
|
||||
* CPU0 CPU1
|
||||
*
|
||||
* x = 1
|
||||
* membarrier():
|
||||
* a: smp_mb()
|
||||
* b: send IPI IPI-induced mb
|
||||
* c: smp_mb()
|
||||
* r2 = y
|
||||
* y = 1
|
||||
* barrier()
|
||||
* r1 = x
|
||||
*
|
||||
* BUG_ON(r1 == 0 && r2 == 0)
|
||||
*
|
||||
* The write to y and load from x by CPU1 are unordered by the hardware,
|
||||
* so it's possible to have "r1 = x" reordered before "y = 1" at any
|
||||
* point after (b). If the memory barrier at (a) is omitted, then "x = 1"
|
||||
* can be reordered after (a) (although not after (c)), so we get r1 == 0
|
||||
* and r2 == 0. This violates the guarantee that membarrier() is
|
||||
* supposed to provide.
|
||||
*
|
||||
* The timing of the memory barrier at (a) has to ensure that it executes
|
||||
* before the IPI-induced memory barrier on CPU1.
|
||||
*
|
||||
* B) Userspace thread execution before IPI vs membarrier's memory
|
||||
* barrier after completing the IPI
|
||||
*
|
||||
* Userspace variables:
|
||||
*
|
||||
* int x = 0, y = 0;
|
||||
*
|
||||
* The memory barrier at the end of membarrier() on CPU0 is necessary in
|
||||
* order to enforce the guarantee that any writes occurring on CPU1 before
|
||||
* the membarrier() is executed will be visible to any code executing on
|
||||
* CPU0 after the membarrier():
|
||||
*
|
||||
* CPU0 CPU1
|
||||
*
|
||||
* x = 1
|
||||
* barrier()
|
||||
* y = 1
|
||||
* r2 = y
|
||||
* membarrier():
|
||||
* a: smp_mb()
|
||||
* b: send IPI IPI-induced mb
|
||||
* c: smp_mb()
|
||||
* r1 = x
|
||||
* BUG_ON(r1 == 0 && r2 == 1)
|
||||
*
|
||||
* The writes to x and y are unordered by the hardware, so it's possible to
|
||||
* have "r2 = 1" even though the write to x doesn't execute until (b). If
|
||||
* the memory barrier at (c) is omitted then "r1 = x" can be reordered
|
||||
* before (b) (although not before (a)), so we get "r1 = 0". This violates
|
||||
* the guarantee that membarrier() is supposed to provide.
|
||||
*
|
||||
* The timing of the memory barrier at (c) has to ensure that it executes
|
||||
* after the IPI-induced memory barrier on CPU1.
|
||||
*
|
||||
* C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier
|
||||
*
|
||||
* CPU0 CPU1
|
||||
*
|
||||
* membarrier():
|
||||
* a: smp_mb()
|
||||
* d: switch to kthread (includes mb)
|
||||
* b: read rq->curr->mm == NULL
|
||||
* e: switch to user (includes mb)
|
||||
* c: smp_mb()
|
||||
*
|
||||
* Using the scenario from (A), we can show that (a) needs to be paired
|
||||
* with (e). Using the scenario from (B), we can show that (c) needs to
|
||||
* be paired with (d).
|
||||
*
|
||||
* D) exit_mm vs membarrier
|
||||
*
|
||||
* Two thread groups are created, A and B. Thread group B is created by
|
||||
* issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD.
|
||||
* Let's assume we have a single thread within each thread group (Thread A
|
||||
* and Thread B). Thread A runs on CPU0, Thread B runs on CPU1.
|
||||
*
|
||||
* CPU0 CPU1
|
||||
*
|
||||
* membarrier():
|
||||
* a: smp_mb()
|
||||
* exit_mm():
|
||||
* d: smp_mb()
|
||||
* e: current->mm = NULL
|
||||
* b: read rq->curr->mm == NULL
|
||||
* c: smp_mb()
|
||||
*
|
||||
* Using scenario (B), we can show that (c) needs to be paired with (d).
|
||||
*
|
||||
* E) kthread_{use,unuse}_mm vs membarrier
|
||||
*
|
||||
* CPU0 CPU1
|
||||
*
|
||||
* membarrier():
|
||||
* a: smp_mb()
|
||||
* kthread_unuse_mm()
|
||||
* d: smp_mb()
|
||||
* e: current->mm = NULL
|
||||
* b: read rq->curr->mm == NULL
|
||||
* kthread_use_mm()
|
||||
* f: current->mm = mm
|
||||
* g: smp_mb()
|
||||
* c: smp_mb()
|
||||
*
|
||||
* Using the scenario from (A), we can show that (a) needs to be paired
|
||||
* with (g). Using the scenario from (B), we can show that (c) needs to
|
||||
* be paired with (d).
|
||||
*/
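For reference, scenario (A) above corresponds to the following user-space pattern (an illustrative sketch, not part of the patch; assume the two functions run as threads on different CPUs, and note that MEMBARRIER_CMD_GLOBAL needs no prior registration):

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

static volatile int x, y;
static int r1, r2;

static int membarrier(int cmd, unsigned int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

static void *cpu0_thread(void *arg)
{
	x = 1;
	membarrier(MEMBARRIER_CMD_GLOBAL, 0);	/* a: smp_mb(); b: IPIs; c: smp_mb() */
	r2 = y;
	return NULL;
}

static void *cpu1_thread(void *arg)
{
	y = 1;
	__asm__ __volatile__("" ::: "memory");	/* barrier() */
	r1 = x;
	return NULL;
}

/* After both threads complete, r1 == 0 && r2 == 0 must be impossible. */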
|
||||
|
||||
/*
|
||||
* Bitmask made from a "or" of all commands within enum membarrier_cmd,
|
||||
* except MEMBARRIER_CMD_QUERY.
|
||||
|
@ -76,6 +204,18 @@ void membarrier_exec_mmap(struct mm_struct *mm)
|
|||
this_cpu_write(runqueues.membarrier_state, 0);
|
||||
}
|
||||
|
||||
void membarrier_update_current_mm(struct mm_struct *next_mm)
|
||||
{
|
||||
struct rq *rq = this_rq();
|
||||
int membarrier_state = 0;
|
||||
|
||||
if (next_mm)
|
||||
membarrier_state = atomic_read(&next_mm->membarrier_state);
|
||||
if (READ_ONCE(rq->membarrier_state) == membarrier_state)
|
||||
return;
|
||||
WRITE_ONCE(rq->membarrier_state, membarrier_state);
|
||||
}
|
||||
|
||||
static int membarrier_global_expedited(void)
|
||||
{
|
||||
int cpu;
|
||||
|
@ -114,12 +254,11 @@ static int membarrier_global_expedited(void)
|
|||
continue;
|
||||
|
||||
/*
|
||||
* Skip the CPU if it runs a kernel thread. The scheduler
|
||||
* leaves the prior task mm in place as an optimization when
|
||||
* scheduling a kthread.
|
||||
* Skip the CPU if it runs a kernel thread which is not using
|
||||
* a task mm.
|
||||
*/
|
||||
p = rcu_dereference(cpu_rq(cpu)->curr);
|
||||
if (p->flags & PF_KTHREAD)
|
||||
if (!p->mm)
|
||||
continue;
|
||||
|
||||
__cpumask_set_cpu(cpu, tmpmask);
|
||||
|
|
|
@ -89,8 +89,8 @@ void init_rt_rq(struct rt_rq *rt_rq)
|
|||
__set_bit(MAX_RT_PRIO, array->bitmap);
|
||||
|
||||
#if defined CONFIG_SMP
|
||||
rt_rq->highest_prio.curr = MAX_RT_PRIO;
|
||||
rt_rq->highest_prio.next = MAX_RT_PRIO;
|
||||
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
|
||||
rt_rq->highest_prio.next = MAX_RT_PRIO-1;
|
||||
rt_rq->rt_nr_migratory = 0;
|
||||
rt_rq->overloaded = 0;
|
||||
plist_head_init(&rt_rq->pushable_tasks);
|
||||
|
@ -161,7 +161,7 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
|
|||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
|
||||
rt_rq->highest_prio.curr = MAX_RT_PRIO;
|
||||
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
|
||||
rt_rq->rt_nr_boosted = 0;
|
||||
rt_rq->rq = rq;
|
||||
rt_rq->tg = tg;
|
||||
|
@ -265,7 +265,7 @@ static void pull_rt_task(struct rq *this_rq);
|
|||
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
|
||||
{
|
||||
/* Try to pull RT tasks here if we lower this rq's prio */
|
||||
return rq->rt.highest_prio.curr > prev->prio;
|
||||
return rq->online && rq->rt.highest_prio.curr > prev->prio;
|
||||
}
|
||||
|
||||
static inline int rt_overloaded(struct rq *rq)
|
||||
|
@ -393,8 +393,9 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
|
|||
p = plist_first_entry(&rq->rt.pushable_tasks,
|
||||
struct task_struct, pushable_tasks);
|
||||
rq->rt.highest_prio.next = p->prio;
|
||||
} else
|
||||
rq->rt.highest_prio.next = MAX_RT_PRIO;
|
||||
} else {
|
||||
rq->rt.highest_prio.next = MAX_RT_PRIO-1;
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
@ -1147,8 +1148,9 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
|
|||
sched_find_first_bit(array->bitmap);
|
||||
}
|
||||
|
||||
} else
|
||||
rt_rq->highest_prio.curr = MAX_RT_PRIO;
|
||||
} else {
|
||||
rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
|
||||
}
|
||||
|
||||
dec_rt_prio_smp(rt_rq, prio, prev_prio);
|
||||
}
|
||||
|
@ -1428,14 +1430,14 @@ static void yield_task_rt(struct rq *rq)
|
|||
static int find_lowest_rq(struct task_struct *task);
|
||||
|
||||
static int
|
||||
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
|
||||
select_task_rq_rt(struct task_struct *p, int cpu, int flags)
|
||||
{
|
||||
struct task_struct *curr;
|
||||
struct rq *rq;
|
||||
bool test;
|
||||
|
||||
/* For anything but wake ups, just return the task_cpu */
|
||||
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
|
||||
if (!(flags & (WF_TTWU | WF_FORK)))
|
||||
goto out;
|
||||
|
||||
rq = cpu_rq(cpu);
|
||||
|
@ -1658,7 +1660,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
|
|||
static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
|
||||
{
|
||||
if (!task_running(rq, p) &&
|
||||
cpumask_test_cpu(cpu, p->cpus_ptr))
|
||||
cpumask_test_cpu(cpu, &p->cpus_mask))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
|
@ -1752,8 +1754,8 @@ static int find_lowest_rq(struct task_struct *task)
|
|||
return this_cpu;
|
||||
}
|
||||
|
||||
best_cpu = cpumask_first_and(lowest_mask,
|
||||
sched_domain_span(sd));
|
||||
best_cpu = cpumask_any_and_distribute(lowest_mask,
|
||||
sched_domain_span(sd));
|
||||
if (best_cpu < nr_cpu_ids) {
|
||||
rcu_read_unlock();
|
||||
return best_cpu;
|
||||
|
@ -1770,7 +1772,7 @@ static int find_lowest_rq(struct task_struct *task)
|
|||
if (this_cpu != -1)
|
||||
return this_cpu;
|
||||
|
||||
cpu = cpumask_any(lowest_mask);
|
||||
cpu = cpumask_any_distribute(lowest_mask);
|
||||
if (cpu < nr_cpu_ids)
|
||||
return cpu;
|
||||
|
||||
|
@ -1811,7 +1813,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
|
|||
* Also make sure that it wasn't scheduled on its rq.
|
||||
*/
|
||||
if (unlikely(task_rq(task) != rq ||
|
||||
!cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) ||
|
||||
!cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) ||
|
||||
task_running(rq, task) ||
|
||||
!rt_task(task) ||
|
||||
!task_on_rq_queued(task))) {
|
||||
|
@ -1859,7 +1861,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
|
|||
* running task can migrate over to a CPU that is running a task
|
||||
* of lesser priority.
|
||||
*/
|
||||
static int push_rt_task(struct rq *rq)
|
||||
static int push_rt_task(struct rq *rq, bool pull)
|
||||
{
|
||||
struct task_struct *next_task;
|
||||
struct rq *lowest_rq;
|
||||
|
@ -1873,6 +1875,34 @@ static int push_rt_task(struct rq *rq)
|
|||
return 0;
|
||||
|
||||
retry:
|
||||
if (is_migration_disabled(next_task)) {
|
||||
struct task_struct *push_task = NULL;
|
||||
int cpu;
|
||||
|
||||
if (!pull || rq->push_busy)
|
||||
return 0;
|
||||
|
||||
cpu = find_lowest_rq(rq->curr);
|
||||
if (cpu == -1 || cpu == rq->cpu)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Given we found a CPU with lower priority than @next_task,
|
||||
* therefore it should be running. However we cannot migrate it
|
||||
* to this other CPU, instead attempt to push the current
|
||||
* running task on this CPU away.
|
||||
*/
|
||||
push_task = get_push_task(rq);
|
||||
if (push_task) {
|
||||
raw_spin_unlock(&rq->lock);
|
||||
stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
|
||||
push_task, &rq->push_work);
|
||||
raw_spin_lock(&rq->lock);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (WARN_ON(next_task == rq->curr))
|
||||
return 0;
|
||||
|
||||
|
@ -1927,12 +1957,10 @@ retry:
|
|||
deactivate_task(rq, next_task, 0);
|
||||
set_task_cpu(next_task, lowest_rq->cpu);
|
||||
activate_task(lowest_rq, next_task, 0);
|
||||
resched_curr(lowest_rq);
|
||||
ret = 1;
|
||||
|
||||
resched_curr(lowest_rq);
|
||||
|
||||
double_unlock_balance(rq, lowest_rq);
|
||||
|
||||
out:
|
||||
put_task_struct(next_task);
|
||||
|
||||
|
@ -1942,7 +1970,7 @@ out:
|
|||
static void push_rt_tasks(struct rq *rq)
|
||||
{
|
||||
/* push_rt_task will return true if it moved an RT */
|
||||
while (push_rt_task(rq))
|
||||
while (push_rt_task(rq, false))
|
||||
;
|
||||
}
|
||||
|
||||
|
@ -2095,7 +2123,8 @@ void rto_push_irq_work_func(struct irq_work *work)
|
|||
*/
|
||||
if (has_pushable_tasks(rq)) {
|
||||
raw_spin_lock(&rq->lock);
|
||||
push_rt_tasks(rq);
|
||||
while (push_rt_task(rq, true))
|
||||
;
|
||||
raw_spin_unlock(&rq->lock);
|
||||
}
|
||||
|
||||
|
@ -2120,7 +2149,7 @@ static void pull_rt_task(struct rq *this_rq)
|
|||
{
|
||||
int this_cpu = this_rq->cpu, cpu;
|
||||
bool resched = false;
|
||||
struct task_struct *p;
|
||||
struct task_struct *p, *push_task;
|
||||
struct rq *src_rq;
|
||||
int rt_overload_count = rt_overloaded(this_rq);
|
||||
|
||||
|
@ -2167,6 +2196,7 @@ static void pull_rt_task(struct rq *this_rq)
|
|||
* double_lock_balance, and another CPU could
|
||||
* alter this_rq
|
||||
*/
|
||||
push_task = NULL;
|
||||
double_lock_balance(this_rq, src_rq);
|
||||
|
||||
/*
|
||||
|
@ -2194,11 +2224,14 @@ static void pull_rt_task(struct rq *this_rq)
|
|||
if (p->prio < src_rq->curr->prio)
|
||||
goto skip;
|
||||
|
||||
resched = true;
|
||||
|
||||
deactivate_task(src_rq, p, 0);
|
||||
set_task_cpu(p, this_cpu);
|
||||
			activate_task(this_rq, p, 0);
			if (is_migration_disabled(p)) {
				push_task = get_push_task(src_rq);
			} else {
				deactivate_task(src_rq, p, 0);
				set_task_cpu(p, this_cpu);
				activate_task(this_rq, p, 0);
				resched = true;
			}
			/*
			 * We continue with the search, just in
			 * case there's an even higher prio task

@@ -2208,6 +2241,13 @@ static void pull_rt_task(struct rq *this_rq)
		}
skip:
		double_unlock_balance(this_rq, src_rq);

		if (push_task) {
			raw_spin_unlock(&this_rq->lock);
			stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
					    push_task, &src_rq->push_work);
			raw_spin_lock(&this_rq->lock);
		}
	}

	if (resched)

@@ -2429,8 +2469,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
	return 0;
}

const struct sched_class rt_sched_class
	__section("__rt_sched_class") = {
DEFINE_SCHED_CLASS(rt) = {

	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,

@@ -2449,6 +2489,7 @@ const struct sched_class rt_sched_class
	.rq_offline		= rq_offline_rt,
	.task_woken		= task_woken_rt,
	.switched_from		= switched_from_rt,
	.find_lock_rq		= find_lock_lowest_rq,
#endif

	.task_tick		= task_tick_rt,


@@ -67,7 +67,6 @@
#include <linux/tsacct_kern.h>

#include <asm/tlb.h>
#include <asm-generic/vmlinux.lds.h>

#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>

@@ -257,30 +256,6 @@ struct rt_bandwidth {

void __dl_clear_params(struct task_struct *p);

/*
 * To keep the bandwidth of -deadline tasks and groups under control
 * we need some place where:
 *  - store the maximum -deadline bandwidth of the system (the group);
 *  - cache the fraction of that bandwidth that is currently allocated.
 *
 * This is all done in the data structure below. It is similar to the
 * one used for RT-throttling (rt_bandwidth), with the main difference
 * that, since here we are only interested in admission control, we
 * do not decrease any runtime while the group "executes", neither we
 * need a timer to replenish it.
 *
 * With respect to SMP, the bandwidth is given on a per-CPU basis,
 * meaning that:
 *  - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
 *  - dl_total_bw array contains, in the i-eth element, the currently
 *    allocated bandwidth on the i-eth CPU.
 * Moreover, groups consume bandwidth on each CPU, while tasks only
 * consume bandwidth on the CPU they're running on.
 * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
 * that will be shown the next time the proc or cgroup controls will
 * be red. It on its turn can be changed by writing on its own
 * control.
 */
struct dl_bandwidth {
	raw_spinlock_t		dl_runtime_lock;
	u64			dl_runtime;

@@ -292,6 +267,24 @@ static inline int dl_bandwidth_enabled(void)
	return sysctl_sched_rt_runtime >= 0;
}

/*
 * To keep the bandwidth of -deadline tasks under control
 * we need some place where:
 *  - store the maximum -deadline bandwidth of each cpu;
 *  - cache the fraction of bandwidth that is currently allocated in
 *    each root domain;
 *
 * This is all done in the data structure below. It is similar to the
 * one used for RT-throttling (rt_bandwidth), with the main difference
 * that, since here we are only interested in admission control, we
 * do not decrease any runtime while the group "executes", neither we
 * need a timer to replenish it.
 *
 * With respect to SMP, bandwidth is given on a per root domain basis,
 * meaning that:
 *  - bw (< 100%) is the deadline bandwidth of each CPU;
 *  - total_bw is the currently allocated bandwidth in each root domain;
 */
struct dl_bw {
	raw_spinlock_t		lock;
	u64			bw;
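
Illustrative aside, not part of this commit: the comment above is only about admission control, and a minimal userspace model of the per-root-domain check that dl_bw enables could look like the sketch below. The dl_bw_model/dl_bw_admit names and the <<10 fixed-point scale are assumptions for the example, not the kernel's actual overflow check.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy model: bandwidth as a <<10 fixed-point fraction of one CPU. */
struct dl_bw_model {
        uint64_t bw;            /* per-CPU cap, e.g. 95% -> (95 << 10) / 100 */
        uint64_t total_bw;      /* bandwidth already admitted in this root domain */
};

/* Admit a task of bandwidth new_bw on a root domain spanning 'cpus' CPUs. */
static bool dl_bw_admit(struct dl_bw_model *dl, int cpus, uint64_t new_bw)
{
        if (dl->total_bw + new_bw > dl->bw * cpus)
                return false;   /* would exceed the root domain's capacity */
        dl->total_bw += new_bw; /* not given back while the task "executes" */
        return true;
}

int main(void)
{
        struct dl_bw_model rd = { .bw = (95u << 10) / 100 };
        uint64_t half = (50u << 10) / 100;

        /* Under a 95% per-CPU cap, only one 50% task fits on a 1-CPU domain. */
        for (int i = 0; i < 3; i++)
                printf("task %d admitted: %d\n", i, dl_bw_admit(&rd, 1, half));
        return 0;
}
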
@@ -801,6 +794,15 @@ struct root_domain {
	struct dl_bw		dl_bw;
	struct cpudl		cpudl;

	/*
	 * Indicate whether a root_domain's dl_bw has been checked or
	 * updated. It's monotonously increasing value.
	 *
	 * Also, some corner cases, like 'wrap around' is dangerous, but given
	 * that u64 is 'big enough'. So that shouldn't be a concern.
	 */
	u64			visit_gen;

#ifdef HAVE_RT_PUSH_IPI
	/*
	 * For IPI pull requests, loop across the rto_mask.

@@ -973,6 +975,7 @@ struct rq {
	unsigned long		cpu_capacity_orig;

	struct callback_head	*balance_callback;
	unsigned char		balance_flags;

	unsigned char		nohz_idle_balance;
	unsigned char		idle_balance;

@@ -1003,6 +1006,10 @@ struct rq {

	/* This is used to determine avg_idle's max value */
	u64			max_idle_balance_cost;

#ifdef CONFIG_HOTPLUG_CPU
	struct rcuwait		hotplug_wait;
#endif
#endif /* CONFIG_SMP */

#ifdef CONFIG_IRQ_TIME_ACCOUNTING

@@ -1048,6 +1055,12 @@ struct rq {
	/* Must be inspected within a rcu lock section */
	struct cpuidle_state	*idle_state;
#endif

#ifdef CONFIG_SMP
	unsigned int		nr_pinned;
#endif
	unsigned int		push_busy;
	struct cpu_stop_work	push_work;
};

#ifdef CONFIG_FAIR_GROUP_SCHED

@@ -1075,6 +1088,16 @@ static inline int cpu_of(struct rq *rq)
#endif
}

#define MDF_PUSH	0x01

static inline bool is_migration_disabled(struct task_struct *p)
{
#ifdef CONFIG_SMP
	return p->migration_disabled;
#else
	return false;
#endif
}

#ifdef CONFIG_SCHED_SMT
extern void __update_idle_core(struct rq *rq);
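
As an illustration only: is_migration_disabled() above just reads a per-task nesting counter. A rough userspace-only sketch of the semantics it relies on follows; the real migrate_disable()/migrate_enable() also pin the task to its current CPU and cooperate with the push machinery, all of which is omitted here, and the task_model name is invented.

#include <stdbool.h>
#include <stdio.h>

/* Toy task: only the nesting counter from the real task_struct. */
struct task_model {
        unsigned short migration_disabled;
};

static void migrate_disable(struct task_model *p) { p->migration_disabled++; }
static void migrate_enable(struct task_model *p)  { p->migration_disabled--; }

static bool is_migration_disabled(const struct task_model *p)
{
        return p->migration_disabled;   /* non-zero while any section is open */
}

int main(void)
{
        struct task_model t = { 0 };

        migrate_disable(&t);
        migrate_disable(&t);                            /* sections nest */
        printf("disabled: %d\n", is_migration_disabled(&t));   /* 1 */
        migrate_enable(&t);
        printf("disabled: %d\n", is_migration_disabled(&t));   /* still 1 */
        migrate_enable(&t);
        printf("disabled: %d\n", is_migration_disabled(&t));   /* 0 */
        return 0;
}
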
@@ -1221,6 +1244,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf)
	rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
	rf->clock_update_flags = 0;
#endif
#ifdef CONFIG_SMP
	SCHED_WARN_ON(rq->balance_callback);
#endif
}

static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf)

@@ -1382,6 +1408,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p)

#ifdef CONFIG_SMP

#define BALANCE_WORK	0x01
#define BALANCE_PUSH	0x02

static inline void
queue_balance_callback(struct rq *rq,
		       struct callback_head *head,

@@ -1389,12 +1418,13 @@ queue_balance_callback(struct rq *rq,
{
	lockdep_assert_held(&rq->lock);

	if (unlikely(head->next))
	if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH)))
		return;

	head->func = (void (*)(struct callback_head *))func;
	head->next = rq->balance_callback;
	rq->balance_callback = head;
	rq->balance_flags |= BALANCE_WORK;
}

#define rcu_dereference_check_sched_domain(p) \
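
Illustrative aside: the new BALANCE_PUSH test keeps ad-hoc balance callbacks from being queued while a runqueue is in push-only (CPU hot-unplug) mode. A toy userspace model of just that guard follows; rq_model, callback_head_model and queue_balance_callback_model are invented names, not the kernel code.

#include <stdio.h>

#define BALANCE_WORK 0x01
#define BALANCE_PUSH 0x02

struct callback_head_model {
        struct callback_head_model *next;
        void (*func)(struct callback_head_model *);
};

/* Toy rq: only the fields the queueing helper touches. */
struct rq_model {
        struct callback_head_model *balance_callback;
        unsigned char balance_flags;
};

static void queue_balance_callback_model(struct rq_model *rq,
                                         struct callback_head_model *head,
                                         void (*func)(struct callback_head_model *))
{
        /* Refuse when head->next is already set, or when the rq is in
         * push-only (hotplug) mode; the latter is the BALANCE_PUSH test
         * added by the hunk above. */
        if (head->next || (rq->balance_flags & BALANCE_PUSH))
                return;

        head->func = func;
        head->next = rq->balance_callback;
        rq->balance_callback = head;
        rq->balance_flags |= BALANCE_WORK;
}

static void dummy_cb(struct callback_head_model *h) { (void)h; }

int main(void)
{
        struct rq_model rq = { 0 };
        struct callback_head_model cb = { 0 };

        rq.balance_flags |= BALANCE_PUSH;               /* e.g. CPU going down */
        queue_balance_callback_model(&rq, &cb, dummy_cb);
        printf("queued while BALANCE_PUSH set: %d\n", rq.balance_callback != NULL);

        rq.balance_flags &= ~BALANCE_PUSH;
        queue_balance_callback_model(&rq, &cb, dummy_cb);
        printf("queued afterwards: %d\n", rq.balance_callback == &cb);
        return 0;
}
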
@@ -1714,13 +1744,20 @@ static inline int task_on_rq_migrating(struct task_struct *p)
	return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
}

/*
 * wake flags
 */
#define WF_SYNC		0x01	/* Waker goes to sleep after wakeup */
#define WF_FORK		0x02	/* Child wakeup after fork */
#define WF_MIGRATED	0x04	/* Internal use, task got migrated */
#define WF_ON_CPU	0x08	/* Wakee is on_cpu */
/* Wake flags. The first three directly map to some SD flag value */
#define WF_EXEC		0x02	/* Wakeup after exec; maps to SD_BALANCE_EXEC */
#define WF_FORK		0x04	/* Wakeup after fork; maps to SD_BALANCE_FORK */
#define WF_TTWU		0x08	/* Wakeup; maps to SD_BALANCE_WAKE */

#define WF_SYNC		0x10	/* Waker goes to sleep after wakeup */
#define WF_MIGRATED	0x20	/* Internal use, task got migrated */
#define WF_ON_CPU	0x40	/* Wakee is on_cpu */

#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
static_assert(WF_FORK == SD_BALANCE_FORK);
static_assert(WF_TTWU == SD_BALANCE_WAKE);
#endif

/*
 * To aid in avoiding the subversion of "niceness" due to uneven distribution
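
The static_asserts above are what allow a wake flag to be used where SD_BALANCE_* bits are expected, with no translation on the wakeup path. A self-contained sketch of that idea follows: the WF_* values are copied from the hunk, the SD_* values are chosen here so that the asserts hold, and sched_domain_model/domain_balances_for are invented for illustration.

#include <assert.h>
#include <stdio.h>

#define SD_BALANCE_EXEC 0x02
#define SD_BALANCE_FORK 0x04
#define SD_BALANCE_WAKE 0x08

#define WF_EXEC 0x02
#define WF_FORK 0x04
#define WF_TTWU 0x08

static_assert(WF_EXEC == SD_BALANCE_EXEC, "WF_EXEC must alias SD_BALANCE_EXEC");
static_assert(WF_FORK == SD_BALANCE_FORK, "WF_FORK must alias SD_BALANCE_FORK");
static_assert(WF_TTWU == SD_BALANCE_WAKE, "WF_TTWU must alias SD_BALANCE_WAKE");

struct sched_domain_model { int flags; };

/* Because the values line up, the wake flag can be ANDed against sd->flags
 * directly; no translation table is needed. */
static int domain_balances_for(const struct sched_domain_model *sd, int wake_flags)
{
        return sd->flags & wake_flags & (WF_EXEC | WF_FORK | WF_TTWU);
}

int main(void)
{
        struct sched_domain_model sd = { .flags = SD_BALANCE_WAKE | SD_BALANCE_FORK };

        printf("balances wakeups: %d\n", !!domain_balances_for(&sd, WF_TTWU));
        printf("balances execs:   %d\n", !!domain_balances_for(&sd, WF_EXEC));
        return 0;
}
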
@@ -1796,16 +1833,19 @@ struct sched_class {

#ifdef CONFIG_SMP
	int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
	int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
	int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags);
	void (*migrate_task_rq)(struct task_struct *p, int new_cpu);

	void (*task_woken)(struct rq *this_rq, struct task_struct *task);

	void (*set_cpus_allowed)(struct task_struct *p,
				 const struct cpumask *newmask);
				 const struct cpumask *newmask,
				 u32 flags);

	void (*rq_online)(struct rq *rq);
	void (*rq_offline)(struct rq *rq);

	struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq);
#endif

	void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);

@@ -1833,7 +1873,7 @@ struct sched_class {
#ifdef CONFIG_FAIR_GROUP_SCHED
	void (*task_change_group)(struct task_struct *p, int type);
#endif
} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */
};

static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{

@@ -1847,6 +1887,20 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
	next->sched_class->set_next_task(rq, next, false);
}


/*
 * Helper to define a sched_class instance; each one is placed in a separate
 * section which is ordered by the linker script:
 *
 *   include/asm-generic/vmlinux.lds.h
 *
 * Also enforce alignment on the instance, not the type, to guarantee layout.
 */
#define DEFINE_SCHED_CLASS(name) \
const struct sched_class name##_sched_class \
	__aligned(__alignof__(struct sched_class)) \
	__section("__" #name "_sched_class")

/* Defined in include/asm-generic/vmlinux.lds.h */
extern struct sched_class __begin_sched_classes[];
extern struct sched_class __end_sched_classes[];
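
As an illustration only: DEFINE_SCHED_CLASS() works because the linker script places the per-class sections back to back, so the instances can be walked as one array between the begin/end symbols. A rough userspace approximation is sketched below; it drops everything into a single named section and leans on the __start_/__stop_ symbols GNU ld generates for such sections, so the ordering here is link order rather than the kernel's script-enforced priority order, and the struct layout is invented.

#include <stdio.h>

struct sched_class {
        const char *name;
        int (*pick)(void);
};

/* Userspace stand-in for the kernel macro: one shared section instead of a
 * per-class section ordered by the linker script. */
#define DEFINE_SCHED_CLASS(cls)                                         \
        __attribute__((used, section("sched_classes"),                  \
                       aligned(__alignof__(struct sched_class))))       \
        const struct sched_class cls##_sched_class

static int pick_stop(void) { return 1; }
static int pick_fair(void) { return 0; }

DEFINE_SCHED_CLASS(stop) = { .name = "stop", .pick = pick_stop };
DEFINE_SCHED_CLASS(fair) = { .name = "fair", .pick = pick_fair };

/* GNU ld emits these for any section whose name is a valid C identifier. */
extern const struct sched_class __start_sched_classes[];
extern const struct sched_class __stop_sched_classes[];

int main(void)
{
        /* The instances land contiguously, so they can be walked as an
         * array, which is what the kernel's begin/end symbols enable. */
        for (const struct sched_class *c = __start_sched_classes;
             c < __stop_sched_classes; c++)
                printf("class %s, pick() = %d\n", c->name, c->pick());
        return 0;
}
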
@@ -1889,13 +1943,35 @@ static inline bool sched_fair_runnable(struct rq *rq)
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
extern struct task_struct *pick_next_task_idle(struct rq *rq);

#define SCA_CHECK		0x01
#define SCA_MIGRATE_DISABLE	0x02
#define SCA_MIGRATE_ENABLE	0x04

#ifdef CONFIG_SMP

extern void update_group_capacity(struct sched_domain *sd, int cpu);

extern void trigger_load_balance(struct rq *rq);

extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags);

static inline struct task_struct *get_push_task(struct rq *rq)
{
	struct task_struct *p = rq->curr;

	lockdep_assert_held(&rq->lock);

	if (rq->push_busy)
		return NULL;

	if (p->nr_cpus_allowed == 1)
		return NULL;

	rq->push_busy = true;
	return get_task_struct(p);
}

extern int push_cpu_stop(void *arg);

#endif


@@ -11,7 +11,7 @@

#ifdef CONFIG_SMP
static int
select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
select_task_rq_stop(struct task_struct *p, int cpu, int flags)
{
	return task_cpu(p); /* stop tasks as never migrate */
}

@@ -109,8 +109,7 @@ static void update_curr_stop(struct rq *rq)
/*
 * Simple, special scheduling class for the per-CPU stop tasks:
 */
const struct sched_class stop_sched_class
	__section("__stop_sched_class") = {
DEFINE_SCHED_CLASS(stop) = {

	.enqueue_task		= enqueue_task_stop,
	.dequeue_task		= dequeue_task_stop,


@@ -211,6 +211,15 @@ unsigned int sysctl_sched_energy_aware = 1;
DEFINE_MUTEX(sched_energy_mutex);
bool sched_energy_update;

void rebuild_sched_domains_energy(void)
{
	mutex_lock(&sched_energy_mutex);
	sched_energy_update = true;
	rebuild_sched_domains();
	sched_energy_update = false;
	mutex_unlock(&sched_energy_mutex);
}

#ifdef CONFIG_PROC_SYSCTL
int sched_energy_aware_handler(struct ctl_table *table, int write,
		void *buffer, size_t *lenp, loff_t *ppos)

@@ -223,13 +232,8 @@ int sched_energy_aware_handler(struct ctl_table *table, int write,
	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	if (!ret && write) {
		state = static_branch_unlikely(&sched_energy_present);
		if (state != sysctl_sched_energy_aware) {
			mutex_lock(&sched_energy_mutex);
			sched_energy_update = 1;
			rebuild_sched_domains();
			sched_energy_update = 0;
			mutex_unlock(&sched_energy_mutex);
		}
		if (state != sysctl_sched_energy_aware)
			rebuild_sched_domains_energy();
	}

	return ret;

@@ -324,6 +328,7 @@ static void sched_energy_set(bool has_eas)
 *    3. no SMT is detected.
 *    4. the EM complexity is low enough to keep scheduling overheads low;
 *    5. schedutil is driving the frequency of all CPUs of the rd;
 *    6. frequency invariance support is present;
 *
 * The complexity of the Energy Model is defined as:
 *

@@ -372,6 +377,14 @@ static bool build_perf_domains(const struct cpumask *cpu_map)
		goto free;
	}

	if (!arch_scale_freq_invariant()) {
		if (sched_debug()) {
			pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported",
				cpumask_pr_args(cpu_map));
		}
		goto free;
	}

	for_each_cpu(i, cpu_map) {
		/* Skip already covered CPUs. */
		if (find_pd(pd, i))

@@ -516,6 +529,7 @@ static int init_rootdomain(struct root_domain *rd)
	init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
#endif

	rd->visit_gen = 0;
	init_dl_bw(&rd->dl_bw);
	if (cpudl_init(&rd->cpudl) != 0)
		goto free_rto_mask;

@@ -674,6 +688,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct sched_domain *tmp;
	int numa_distance = 0;

	/* Remove the sched domains which do not contribute to scheduling. */
	for (tmp = sd; tmp; ) {

@@ -705,6 +720,38 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
		sd->child = NULL;
	}

	for (tmp = sd; tmp; tmp = tmp->parent)
		numa_distance += !!(tmp->flags & SD_NUMA);

	/*
	 * FIXME: Diameter >=3 is misrepresented.
	 *
	 * Smallest diameter=3 topology is:
	 *
	 *   node   0   1   2   3
	 *     0:  10  20  30  40
	 *     1:  20  10  20  30
	 *     2:  30  20  10  20
	 *     3:  40  30  20  10
	 *
	 *   0 --- 1 --- 2 --- 3
	 *
	 * NUMA-3	0-3		N/A		N/A		0-3
	 *  groups:	{0-2},{1-3}					{1-3},{0-2}
	 *
	 * NUMA-2	0-2		0-3		0-3		1-3
	 *  groups:	{0-1},{1-3}	{0-2},{2-3}	{1-3},{0-1}	{2-3},{0-2}
	 *
	 * NUMA-1	0-1		0-2		1-3		2-3
	 *  groups:	{0},{1}		{1},{2},{0}	{2},{3},{1}	{3},{2}
	 *
	 * NUMA-0	0		1		2		3
	 *
	 * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
	 * group span isn't a subset of the domain span.
	 */
	WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n");

	sched_domain_debug(sd, cpu);

	rq_attach_root(rq, rd);
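
Illustrative aside: numa_distance above counts the SD_NUMA levels on the attached CPU's domain stack, so the warning threshold of 2 corresponds to a NUMA diameter of at most two hops. A standalone computation over the example distance table from the comment (hop counts derived from the 10-per-hop convention used there; this is not kernel code) shows why the diameter-3 line topology trips it.

#include <stdio.h>

/* Distance table from the comment: the 4-node line topology 0-1-2-3. */
static const int dist[4][4] = {
        { 10, 20, 30, 40 },
        { 20, 10, 20, 30 },
        { 30, 20, 10, 20 },
        { 40, 30, 20, 10 },
};

int main(void)
{
        int diameter = 0;

        for (int i = 0; i < 4; i++) {
                for (int j = 0; j < 4; j++) {
                        /* Local distance is 10 and each hop adds 10 in this
                         * table, so hops = (dist - 10) / 10. */
                        int hops = (dist[i][j] - 10) / 10;

                        if (hops > diameter)
                                diameter = hops;
                }
        }

        /* Prints 3: one NUMA level more than the group construction handles
         * correctly, hence the WARN_ONCE(numa_distance > 2) above. */
        printf("diameter = %d hops\n", diameter);
        return 0;
}
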
@@ -42,11 +42,27 @@ struct cpu_stopper {
	struct list_head	works;		/* list of pending works */

	struct cpu_stop_work	stop_work;	/* for stop_cpus */
	unsigned long		caller;
	cpu_stop_fn_t		fn;
};

static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
static bool stop_machine_initialized = false;

void print_stop_info(const char *log_lvl, struct task_struct *task)
{
	/*
	 * If @task is a stopper task, it cannot migrate and task_cpu() is
	 * stable.
	 */
	struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task));

	if (task != stopper->thread)
		return;

	printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller);
}

/* static data for stop_cpus */
static DEFINE_MUTEX(stop_cpus_mutex);
static bool stop_cpus_in_progress;
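
print_stop_info() can report who queued the work only because the queueing paths below record _RET_IP_ alongside the callback. As a loose userspace approximation of that bookkeeping (using __builtin_return_address(0), which is what _RET_IP_ wraps, and invented names; not the kernel code):

#include <stdio.h>

/* Userspace stand-in for the kernel's _RET_IP_. */
#define RET_IP ((unsigned long)__builtin_return_address(0))

struct stop_work_model {
        void (*fn)(void *);
        unsigned long caller;           /* who queued the work */
};

static struct stop_work_model pending;

/* Queueing path: remember both the callback and the queuer's return address.
 * noinline keeps __builtin_return_address(0) pointing at the caller. */
static __attribute__((noinline)) void queue_stop_work(void (*fn)(void *))
{
        pending.fn = fn;
        pending.caller = RET_IP;
}

static void stop_fn(void *arg) { (void)arg; }

static void print_stop_info_model(void)
{
        /* The kernel feeds these through %pS to resolve symbol names. */
        printf("Stopper: %#lx <- %#lx\n",
               (unsigned long)pending.fn, pending.caller);
}

int main(void)
{
        queue_stop_work(stop_fn);
        print_stop_info_model();
        return 0;
}
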
@@ -123,7 +139,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
{
	struct cpu_stop_done done;
	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
	struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ };

	cpu_stop_init_done(&done, 1);
	if (!cpu_stop_queue_work(cpu, &work))

@@ -331,7 +347,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
	work1 = work2 = (struct cpu_stop_work){
		.fn = multi_cpu_stop,
		.arg = &msdata,
		.done = &done
		.done = &done,
		.caller = _RET_IP_,
	};

	cpu_stop_init_done(&done, 2);

@@ -367,7 +384,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *
bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
			struct cpu_stop_work *work_buf)
{
	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
	*work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, };
	return cpu_stop_queue_work(cpu, work_buf);
}


@@ -487,6 +504,8 @@ repeat:
		int ret;

		/* cpu stop callbacks must not sleep, make in_atomic() == T */
		stopper->caller = work->caller;
		stopper->fn = fn;
		preempt_count_inc();
		ret = fn(arg);
		if (done) {

@@ -495,6 +514,8 @@ repeat:
				cpu_stop_signal_done(done);
		}
		preempt_count_dec();
		stopper->fn = NULL;
		stopper->caller = 0;
		WARN_ONCE(preempt_count(),
			  "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg);
		goto repeat;


@@ -4908,6 +4908,10 @@ static void unbind_workers(int cpu)
		pool->flags |= POOL_DISASSOCIATED;

		raw_spin_unlock_irq(&pool->lock);

		for_each_pool_worker(worker, pool)
			WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);

		mutex_unlock(&wq_pool_attach_mutex);

		/*


@@ -267,3 +267,21 @@ int cpumask_any_and_distribute(const struct cpumask *src1p,
	return next;
}
EXPORT_SYMBOL(cpumask_any_and_distribute);

int cpumask_any_distribute(const struct cpumask *srcp)
{
	int next, prev;

	/* NOTE: our first selection will skip 0. */
	prev = __this_cpu_read(distribute_cpu_mask_prev);

	next = cpumask_next(prev, srcp);
	if (next >= nr_cpu_ids)
		next = cpumask_first(srcp);

	if (next < nr_cpu_ids)
		__this_cpu_write(distribute_cpu_mask_prev, next);

	return next;
}
EXPORT_SYMBOL(cpumask_any_distribute);
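
cpumask_any_distribute() starts each search after the previously returned CPU, so repeated callers are spread across the mask. A toy 64-bit userspace model of that rotation follows; a plain uint64_t stands in for struct cpumask, a single static variable for the per-CPU distribute_cpu_mask_prev, and mask_next/any_distribute are invented names.

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 64

/* Stand-in for the per-CPU distribute_cpu_mask_prev variable. */
static int prev;

/* First set bit strictly after 'start'; NR_CPUS when there is none. */
static int mask_next(int start, uint64_t mask)
{
        for (int cpu = start + 1; cpu < NR_CPUS; cpu++)
                if (mask & (1ULL << cpu))
                        return cpu;
        return NR_CPUS;
}

static int any_distribute(uint64_t mask)
{
        int next = mask_next(prev, mask);       /* first pick skips CPU 0 */

        if (next >= NR_CPUS)                    /* wrap to the first set bit */
                next = mask_next(-1, mask);
        if (next < NR_CPUS)
                prev = next;
        return next;
}

int main(void)
{
        uint64_t mask = (1ULL << 1) | (1ULL << 4) | (1ULL << 9);

        /* Successive calls rotate through the set bits: 1 4 9 1 4 */
        for (int i = 0; i < 5; i++)
                printf("%d ", any_distribute(mask));
        printf("\n");
        return 0;
}
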
@@ -12,6 +12,7 @@
#include <linux/atomic.h>
#include <linux/kexec.h>
#include <linux/utsname.h>
#include <linux/stop_machine.h>

static char dump_stack_arch_desc_str[128];

@@ -57,6 +58,7 @@ void dump_stack_print_info(const char *log_lvl)
		       log_lvl, dump_stack_arch_desc_str);

	print_worker_info(log_lvl, current);
	print_stop_info(log_lvl, current);
}

/**


@@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2)
	if (current->nr_cpus_allowed == 1)
		goto out;

#ifdef CONFIG_SMP
	if (current->migration_disabled)
		goto out;
#endif

	/*
	 * It is valid to assume CPU-locality during early bootup:
	 */