Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - power-aware scheduling improvements (Patrick Bellasi)

 - NUMA balancing improvements (Mel Gorman)

 - vCPU scheduling fixes (Rohit Jain)

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Update util_est before updating schedutil
  sched/cpufreq: Modify aggregate utilization to always include blocked FAIR utilization
  sched/deadline/Documentation: Add overrun signal and GRUB-PA documentation
  sched/core: Distinguish between idle_cpu() calls based on desired effect, introduce available_idle_cpu()
  sched/wait: Include <linux/wait.h> in <linux/swait.h>
  sched/numa: Stagger NUMA balancing scan periods for new threads
  sched/core: Don't schedule threads on pre-empted vCPUs
  sched/fair: Avoid calling sync_entity_load_avg() unnecessarily
  sched/fair: Rearrange select_task_rq_fair() to optimize it
This commit is contained in:
commit
f7f4e7fc6c
|
@ -49,7 +49,7 @@ CONTENTS
|
|||
2.1 Main algorithm
|
||||
------------------
|
||||
|
||||
SCHED_DEADLINE uses three parameters, named "runtime", "period", and
|
||||
SCHED_DEADLINE [18] uses three parameters, named "runtime", "period", and
|
||||
"deadline", to schedule tasks. A SCHED_DEADLINE task should receive
|
||||
"runtime" microseconds of execution time every "period" microseconds, and
|
||||
these "runtime" microseconds are available within "deadline" microseconds
|
||||
|
@ -117,6 +117,10 @@ CONTENTS
|
|||
scheduling deadline = scheduling deadline + period
|
||||
remaining runtime = remaining runtime + runtime
|
||||
|
||||
The SCHED_FLAG_DL_OVERRUN flag in sched_attr's sched_flags field allows a task
|
||||
to get informed about runtime overruns through the delivery of SIGXCPU
|
||||
signals.
|
||||
|
||||
|
||||
2.2 Bandwidth reclaiming
|
||||
------------------------
|
||||
|
@ -279,6 +283,19 @@ CONTENTS
|
|||
running_bw is incremented.
|
||||
|
||||
|
||||
2.3 Energy-aware scheduling
|
||||
---------------------------
|
||||
|
||||
When cpufreq's schedutil governor is selected, SCHED_DEADLINE implements the
|
||||
GRUB-PA [19] algorithm, reducing the CPU operating frequency to the minimum
|
||||
value that still allows the deadlines to be met. This behavior is currently
|
||||
implemented only for ARM architectures.
|
||||
|
||||
Particular care must be taken in case the time needed for changing frequency
|
||||
is of the same order of magnitude as the reservation period. In such cases,
|
||||
setting a fixed CPU frequency results in a lower number of deadline misses.
|
||||
|
||||
|
||||
3. Scheduling Real-Time Tasks
|
||||
=============================
|
||||
|
||||
|
@ -505,6 +522,12 @@ CONTENTS
|
|||
17 - L. Abeni, G. Lipari, A. Parri, Y. Sun, Multicore CPU reclaiming: parallel
|
||||
or sequential? In Proceedings of the 31st Annual ACM Symposium on Applied
|
||||
Computing, 2016.
|
||||
18 - J. Lelli, C. Scordino, L. Abeni, D. Faggioli, Deadline scheduling in the
|
||||
Linux kernel, Software: Practice and Experience, 46(6): 821-839, June
|
||||
2016.
|
||||
19 - C. Scordino, L. Abeni, J. Lelli, Energy-Aware Real-Time Scheduling in
|
||||
the Linux Kernel, 33rd ACM/SIGAPP Symposium On Applied Computing (SAC
|
||||
2018), Pau, France, April 2018.
|
||||
|
||||
|
||||
4. Bandwidth management
|
||||
|
|
|
@ -1512,6 +1512,7 @@ static inline int task_nice(const struct task_struct *p)
|
|||
extern int can_nice(const struct task_struct *p, const int nice);
|
||||
extern int task_curr(const struct task_struct *p);
|
||||
extern int idle_cpu(int cpu);
|
||||
extern int available_idle_cpu(int cpu);
|
||||
extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
|
||||
extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
|
||||
extern int sched_setattr(struct task_struct *, const struct sched_attr *);
|
||||
|
|
|
@ -5,6 +5,7 @@
|
|||
#include <linux/list.h>
|
||||
#include <linux/stddef.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/wait.h>
|
||||
#include <asm/current.h>
|
||||
|
||||
/*
|
||||
|
|
|
@ -2194,27 +2194,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
|
|||
INIT_HLIST_HEAD(&p->preempt_notifiers);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
|
||||
p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
|
||||
p->mm->numa_scan_seq = 0;
|
||||
}
|
||||
|
||||
if (clone_flags & CLONE_VM)
|
||||
p->numa_preferred_nid = current->numa_preferred_nid;
|
||||
else
|
||||
p->numa_preferred_nid = -1;
|
||||
|
||||
p->node_stamp = 0ULL;
|
||||
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
|
||||
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
|
||||
p->numa_work.next = &p->numa_work;
|
||||
p->numa_faults = NULL;
|
||||
p->last_task_numa_placement = 0;
|
||||
p->last_sum_exec_runtime = 0;
|
||||
|
||||
p->numa_group = NULL;
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
init_numa_balancing(clone_flags, p);
|
||||
}
|
||||
|
||||
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
|
||||
|
@ -4049,6 +4029,23 @@ int idle_cpu(int cpu)
|
|||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* available_idle_cpu - is a given CPU idle for enqueuing work.
|
||||
* @cpu: the CPU in question.
|
||||
*
|
||||
* Return: 1 if the CPU is currently idle and vcpu_is_preempted() does not report it as preempted. 0 otherwise.
|
||||
*/
|
||||
int available_idle_cpu(int cpu)
|
||||
{
|
||||
if (!idle_cpu(cpu))
|
||||
return 0;
|
||||
|
||||
if (vcpu_is_preempted(cpu))
|
||||
return 0;
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
/**
|
||||
* idle_task - return the idle task for a given CPU.
|
||||
* @cpu: the processor in question.
|
||||
|
|
|
@ -183,22 +183,21 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
|
|||
static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(sg_cpu->cpu);
|
||||
unsigned long util;
|
||||
|
||||
if (rq->rt.rt_nr_running) {
|
||||
util = sg_cpu->max;
|
||||
} else {
|
||||
util = sg_cpu->util_dl;
|
||||
if (rq->cfs.h_nr_running)
|
||||
util += sg_cpu->util_cfs;
|
||||
}
|
||||
if (rq->rt.rt_nr_running)
|
||||
return sg_cpu->max;
|
||||
|
||||
/*
|
||||
* Utilization required by DEADLINE must always be granted while, for
|
||||
* FAIR, we use blocked utilization of IDLE CPUs as a mechanism to
|
||||
* gracefully reduce the frequency when no tasks show up for longer
|
||||
* periods of time.
|
||||
*
|
||||
* Ideally we would like to set util_dl as min/guaranteed freq and
|
||||
* util_cfs + util_dl as requested freq. However, cpufreq is not yet
|
||||
* ready for such an interface. So, we only do the latter for now.
|
||||
*/
|
||||
return min(util, sg_cpu->max);
|
||||
return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));
|
||||
}
|
||||
|
||||
static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
|
||||
|
|
|
@ -1139,6 +1139,47 @@ static unsigned int task_scan_max(struct task_struct *p)
|
|||
return max(smin, smax);
|
||||
}
|
||||
|
||||
void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
|
||||
{
|
||||
int mm_users = 0;
|
||||
struct mm_struct *mm = p->mm;
|
||||
|
||||
if (mm) {
|
||||
mm_users = atomic_read(&mm->mm_users);
|
||||
if (mm_users == 1) {
|
||||
mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
|
||||
mm->numa_scan_seq = 0;
|
||||
}
|
||||
}
|
||||
p->node_stamp = 0;
|
||||
p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
|
||||
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
|
||||
p->numa_work.next = &p->numa_work;
|
||||
p->numa_faults = NULL;
|
||||
p->numa_group = NULL;
|
||||
p->last_task_numa_placement = 0;
|
||||
p->last_sum_exec_runtime = 0;
|
||||
|
||||
/* New address space, reset the preferred nid */
|
||||
if (!(clone_flags & CLONE_VM)) {
|
||||
p->numa_preferred_nid = -1;
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* New thread, keep existing numa_preferred_nid which should be copied
|
||||
* already by arch_dup_task_struct but stagger when scans start.
|
||||
*/
|
||||
if (mm) {
|
||||
unsigned int delay;
|
||||
|
||||
delay = min_t(unsigned int, task_scan_max(current),
|
||||
current->numa_scan_period * mm_users * NSEC_PER_MSEC);
|
||||
delay += 2 * TICK_NSEC;
|
||||
p->node_stamp = delay;
|
||||
}
|
||||
}
|
||||
|
||||
static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
|
||||
{
|
||||
rq->nr_numa_running += (p->numa_preferred_nid != -1);
|
||||
|
@ -5344,6 +5385,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
|||
struct cfs_rq *cfs_rq;
|
||||
struct sched_entity *se = &p->se;
|
||||
|
||||
/*
|
||||
* The code below (indirectly) updates schedutil which looks at
|
||||
* the cfs_rq utilization to select a frequency.
|
||||
* Let's add the task's estimated utilization to the cfs_rq's
|
||||
* estimated utilization, before we update schedutil.
|
||||
*/
|
||||
util_est_enqueue(&rq->cfs, p);
|
||||
|
||||
/*
|
||||
* If in_iowait is set, the code below may not trigger any cpufreq
|
||||
* utilization updates, so do it here explicitly with the IOWAIT flag
|
||||
|
@ -5385,7 +5434,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
|||
if (!se)
|
||||
add_nr_running(rq, 1);
|
||||
|
||||
util_est_enqueue(&rq->cfs, p);
|
||||
hrtick_update(rq);
|
||||
}
|
||||
|
||||
|
@ -5858,8 +5906,8 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
|
|||
* a cpufreq perspective, it's better to have higher utilisation
|
||||
* on one CPU.
|
||||
*/
|
||||
if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
|
||||
return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
|
||||
if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
|
||||
return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
|
||||
|
||||
if (sync && cpu_rq(this_cpu)->nr_running == 1)
|
||||
return this_cpu;
|
||||
|
@ -6102,7 +6150,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
|
|||
|
||||
/* Traverse only the allowed CPUs */
|
||||
for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
|
||||
if (idle_cpu(i)) {
|
||||
if (available_idle_cpu(i)) {
|
||||
struct rq *rq = cpu_rq(i);
|
||||
struct cpuidle_state *idle = idle_get_state(rq);
|
||||
if (idle && idle->exit_latency < min_exit_latency) {
|
||||
|
@ -6144,6 +6192,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
|
|||
if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
|
||||
return prev_cpu;
|
||||
|
||||
/*
|
||||
* We need task's util for capacity_spare_wake, sync it up to prev_cpu's
|
||||
* last_update_time.
|
||||
*/
|
||||
if (!(sd_flag & SD_BALANCE_FORK))
|
||||
sync_entity_load_avg(&p->se);
|
||||
|
||||
while (sd) {
|
||||
struct sched_group *group;
|
||||
struct sched_domain *tmp;
|
||||
|
@ -6224,7 +6279,7 @@ void __update_idle_core(struct rq *rq)
|
|||
if (cpu == core)
|
||||
continue;
|
||||
|
||||
if (!idle_cpu(cpu))
|
||||
if (!available_idle_cpu(cpu))
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
|
@ -6256,7 +6311,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
|
|||
|
||||
for_each_cpu(cpu, cpu_smt_mask(core)) {
|
||||
cpumask_clear_cpu(cpu, cpus);
|
||||
if (!idle_cpu(cpu))
|
||||
if (!available_idle_cpu(cpu))
|
||||
idle = false;
|
||||
}
|
||||
|
||||
|
@ -6285,7 +6340,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
|
|||
for_each_cpu(cpu, cpu_smt_mask(target)) {
|
||||
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
|
||||
continue;
|
||||
if (idle_cpu(cpu))
|
||||
if (available_idle_cpu(cpu))
|
||||
return cpu;
|
||||
}
|
||||
|
||||
|
@ -6348,7 +6403,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
|
|||
return -1;
|
||||
if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
|
||||
continue;
|
||||
if (idle_cpu(cpu))
|
||||
if (available_idle_cpu(cpu))
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -6368,13 +6423,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
|
|||
struct sched_domain *sd;
|
||||
int i, recent_used_cpu;
|
||||
|
||||
if (idle_cpu(target))
|
||||
if (available_idle_cpu(target))
|
||||
return target;
|
||||
|
||||
/*
|
||||
* If the previous CPU is cache affine and idle, don't be stupid:
|
||||
*/
|
||||
if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
|
||||
if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
|
||||
return prev;
|
||||
|
||||
/* Check a recently used CPU as a potential idle candidate: */
|
||||
|
@ -6382,7 +6437,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
|
|||
if (recent_used_cpu != prev &&
|
||||
recent_used_cpu != target &&
|
||||
cpus_share_cache(recent_used_cpu, target) &&
|
||||
idle_cpu(recent_used_cpu) &&
|
||||
available_idle_cpu(recent_used_cpu) &&
|
||||
cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
|
||||
/*
|
||||
* Replace recent_used_cpu with prev as it is a potential
|
||||
|
@ -6558,7 +6613,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
|
|||
static int
|
||||
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
|
||||
{
|
||||
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
|
||||
struct sched_domain *tmp, *sd = NULL;
|
||||
int cpu = smp_processor_id();
|
||||
int new_cpu = prev_cpu;
|
||||
int want_affine = 0;
|
||||
|
@ -6581,7 +6636,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
|
|||
*/
|
||||
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
|
||||
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
|
||||
affine_sd = tmp;
|
||||
if (cpu != prev_cpu)
|
||||
new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
|
||||
|
||||
sd = NULL; /* Prefer wake_affine over balance flags */
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -6591,33 +6649,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
|
|||
break;
|
||||
}
|
||||
|
||||
if (affine_sd) {
|
||||
sd = NULL; /* Prefer wake_affine over balance flags */
|
||||
if (cpu == prev_cpu)
|
||||
goto pick_cpu;
|
||||
|
||||
new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
|
||||
}
|
||||
|
||||
if (sd && !(sd_flag & SD_BALANCE_FORK)) {
|
||||
/*
|
||||
* We're going to need the task's util for capacity_spare_wake
|
||||
* in find_idlest_group. Sync it up to prev_cpu's
|
||||
* last_update_time.
|
||||
*/
|
||||
sync_entity_load_avg(&p->se);
|
||||
}
|
||||
|
||||
if (!sd) {
|
||||
pick_cpu:
|
||||
if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
|
||||
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
|
||||
|
||||
if (want_affine)
|
||||
current->recent_used_cpu = cpu;
|
||||
}
|
||||
} else {
|
||||
if (unlikely(sd)) {
|
||||
/* Slow path */
|
||||
new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
|
||||
} else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
|
||||
/* Fast path */
|
||||
|
||||
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
|
||||
|
||||
if (want_affine)
|
||||
current->recent_used_cpu = cpu;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
|
|
|
@ -1069,6 +1069,12 @@ enum numa_faults_stats {
|
|||
extern void sched_setnuma(struct task_struct *p, int node);
|
||||
extern int migrate_task_to(struct task_struct *p, int cpu);
|
||||
extern int migrate_swap(struct task_struct *, struct task_struct *);
|
||||
extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
|
||||
#else
|
||||
static inline void
|
||||
init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
|
Loading…
Reference in New Issue