Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - power-aware scheduling improvements (Patrick Bellasi)

 - NUMA balancing improvements (Mel Gorman)

 - vCPU scheduling fixes (Rohit Jain)

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/fair: Update util_est before updating schedutil
  sched/cpufreq: Modify aggregate utilization to always include blocked FAIR utilization
  sched/deadline/Documentation: Add overrun signal and GRUB-PA documentation
  sched/core: Distinguish between idle_cpu() calls based on desired effect, introduce available_idle_cpu()
  sched/wait: Include <linux/wait.h> in <linux/swait.h>
  sched/numa: Stagger NUMA balancing scan periods for new threads
  sched/core: Don't schedule threads on pre-empted vCPUs
  sched/fair: Avoid calling sync_entity_load_avg() unnecessarily
  sched/fair: Rearrange select_task_rq_fair() to optimize it
Linus Torvalds 2018-06-04 17:45:38 -07:00
commit f7f4e7fc6c
7 changed files with 138 additions and 70 deletions

View File

@@ -49,7 +49,7 @@ CONTENTS
 2.1 Main algorithm
 ------------------
 
-SCHED_DEADLINE uses three parameters, named "runtime", "period", and
+SCHED_DEADLINE [18] uses three parameters, named "runtime", "period", and
 "deadline", to schedule tasks. A SCHED_DEADLINE task should receive
 "runtime" microseconds of execution time every "period" microseconds, and
 these "runtime" microseconds are available within "deadline" microseconds
@@ -117,6 +117,10 @@ CONTENTS
     scheduling deadline = scheduling deadline + period
     remaining runtime = remaining runtime + runtime
 
+The SCHED_FLAG_DL_OVERRUN flag in sched_attr's sched_flags field allows a task
+to get informed about runtime overruns through the delivery of SIGXCPU
+signals.
+
 2.2 Bandwidth reclaiming
 ------------------------
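For reference, a minimal user-space sketch of opting in to the overrun notifications documented above. It is not part of the patch: glibc has no sched_setattr() wrapper, so the raw syscall is used; the struct sched_attr layout and the two constants mirror the UAPI header; the runtime/deadline/period values are arbitrary examples; admission to SCHED_DEADLINE requires CAP_SYS_NICE.

#define _GNU_SOURCE
#include <signal.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define SCHED_DEADLINE          6
#define SCHED_FLAG_DL_OVERRUN   0x04    /* value from include/uapi/linux/sched.h */

/* glibc has no sched_setattr() wrapper, so spell out the UAPI layout here. */
struct sched_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
};

static void overrun_handler(int sig)
{
        /* Async-signal-safe: just note that the reserved runtime was overrun. */
        (void)sig;
        write(STDERR_FILENO, "runtime overrun\n", 16);
}

int main(void)
{
        struct sched_attr attr = {
                .size           = sizeof(attr),
                .sched_policy   = SCHED_DEADLINE,
                .sched_flags    = SCHED_FLAG_DL_OVERRUN,
                .sched_runtime  =  10 * 1000 * 1000,    /* 10 ms of budget ...   */
                .sched_deadline =  30 * 1000 * 1000,    /* ... within 30 ms ...  */
                .sched_period   = 100 * 1000 * 1000,    /* ... every 100 ms      */
        };
        struct sigaction sa = { .sa_handler = overrun_handler };

        sigaction(SIGXCPU, &sa, NULL);          /* install before switching policy */

        if (syscall(SYS_sched_setattr, 0, &attr, 0)) {
                perror("sched_setattr");
                return 1;
        }

        /* ... periodic work goes here; overruns now deliver SIGXCPU ... */
        return 0;
}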
@@ -279,6 +283,19 @@ CONTENTS
 running_bw is incremented.
 
+2.3 Energy-aware scheduling
+------------------------
+
+When cpufreq's schedutil governor is selected, SCHED_DEADLINE implements the
+GRUB-PA [19] algorithm, reducing the CPU operating frequency to the minimum
+value that still allows the deadlines to be met. This behavior is currently
+implemented only for ARM architectures.
+
+Particular care must be taken when the time needed to change the frequency is
+of the same order of magnitude as the reservation period. In such cases,
+setting a fixed CPU frequency results in fewer deadline misses.
+
 3. Scheduling Real-Time Tasks
 =============================
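Since GRUB-PA only takes effect when schedutil drives cpufreq, here is a stand-alone sketch (not part of the patch) of selecting that governor for one CPU through the standard cpufreq sysfs interface; the path follows the usual layout, root privileges are assumed, and the CPU number should be adjusted as needed.

#include <stdio.h>

int main(void)
{
        /* Standard cpufreq sysfs layout; one scaling_governor file per CPU. */
        const char *path = "/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor";
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return 1;
        }
        fputs("schedutil\n", f);        /* requires CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
        return fclose(f) ? 1 : 0;
}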
@@ -505,6 +522,12 @@ CONTENTS
  17 - L. Abeni, G. Lipari, A. Parri, Y. Sun, Multicore CPU reclaiming: parallel
       or sequential?. In Proceedings of the 31st Annual ACM Symposium on Applied
       Computing, 2016.
+ 18 - J. Lelli, C. Scordino, L. Abeni, D. Faggioli, Deadline scheduling in the
+      Linux kernel, Software: Practice and Experience, 46(6): 821-839, June
+      2016.
+ 19 - C. Scordino, L. Abeni, J. Lelli, Energy-Aware Real-Time Scheduling in
+      the Linux Kernel, 33rd ACM/SIGAPP Symposium On Applied Computing (SAC
+      2018), Pau, France, April 2018.
 
 4. Bandwidth management

View File

@@ -1512,6 +1512,7 @@ static inline int task_nice(const struct task_struct *p)
 extern int can_nice(const struct task_struct *p, const int nice);
 extern int task_curr(const struct task_struct *p);
 extern int idle_cpu(int cpu);
+extern int available_idle_cpu(int cpu);
 extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *);
 extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *);
 extern int sched_setattr(struct task_struct *, const struct sched_attr *);

View File

@@ -5,6 +5,7 @@
 #include <linux/list.h>
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
+#include <linux/wait.h>
 #include <asm/current.h>
 
 /*

View File

@@ -2194,27 +2194,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
         INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
 
-#ifdef CONFIG_NUMA_BALANCING
-        if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
-                p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
-                p->mm->numa_scan_seq = 0;
-        }
-
-        if (clone_flags & CLONE_VM)
-                p->numa_preferred_nid = current->numa_preferred_nid;
-        else
-                p->numa_preferred_nid = -1;
-
-        p->node_stamp = 0ULL;
-        p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
-        p->numa_scan_period = sysctl_numa_balancing_scan_delay;
-        p->numa_work.next = &p->numa_work;
-        p->numa_faults = NULL;
-        p->last_task_numa_placement = 0;
-        p->last_sum_exec_runtime = 0;
-
-        p->numa_group = NULL;
-#endif /* CONFIG_NUMA_BALANCING */
+        init_numa_balancing(clone_flags, p);
 }
 
 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -4049,6 +4029,23 @@ int idle_cpu(int cpu)
         return 1;
 }
 
+/**
+ * available_idle_cpu - is a given CPU idle for enqueuing work.
+ * @cpu: the CPU in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int available_idle_cpu(int cpu)
+{
+        if (!idle_cpu(cpu))
+                return 0;
+
+        if (vcpu_is_preempted(cpu))
+                return 0;
+
+        return 1;
+}
+
 /**
  * idle_task - return the idle task for a given CPU.
  * @cpu: the processor in question.

View File

@@ -183,22 +183,21 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
 static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
 {
         struct rq *rq = cpu_rq(sg_cpu->cpu);
-        unsigned long util;
 
-        if (rq->rt.rt_nr_running) {
-                util = sg_cpu->max;
-        } else {
-                util = sg_cpu->util_dl;
-                if (rq->cfs.h_nr_running)
-                        util += sg_cpu->util_cfs;
-        }
+        if (rq->rt.rt_nr_running)
+                return sg_cpu->max;
 
         /*
+         * Utilization required by DEADLINE must always be granted while, for
+         * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to
+         * gracefully reduce the frequency when no tasks show up for longer
+         * periods of time.
+         *
          * Ideally we would like to set util_dl as min/guaranteed freq and
          * util_cfs + util_dl as requested freq. However, cpufreq is not yet
          * ready for such an interface. So, we only do the latter for now.
          */
-        return min(util, sg_cpu->max);
+        return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs));
 }
 
 static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags)
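For illustration only (not part of the patch): the new aggregation rule above, restated as a stand-alone user-space function with made-up capacity and utilization numbers.

#include <stdio.h>

/*
 * Mirror of the rule above: RT demands the maximum capacity; otherwise the
 * DEADLINE utilization plus the (possibly blocked) FAIR utilization is
 * requested, clamped to the CPU's capacity.
 */
static unsigned long aggregate_util(unsigned long max, unsigned long util_dl,
                                    unsigned long util_cfs, unsigned int rt_nr_running)
{
        unsigned long util = util_dl + util_cfs;

        if (rt_nr_running)
                return max;

        return util < max ? util : max;
}

int main(void)
{
        /* Capacity 1024, 100 units of DL and 300 units of CFS utilization. */
        printf("%lu\n", aggregate_util(1024, 100, 300, 0));     /* 400            */
        printf("%lu\n", aggregate_util(1024, 100, 300, 2));     /* 1024, RT runs  */
        printf("%lu\n", aggregate_util(1024, 900, 300, 0));     /* 1024, clamped  */
        return 0;
}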

View File

@@ -1139,6 +1139,47 @@ static unsigned int task_scan_max(struct task_struct *p)
         return max(smin, smax);
 }
 
+void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+        int mm_users = 0;
+        struct mm_struct *mm = p->mm;
+
+        if (mm) {
+                mm_users = atomic_read(&mm->mm_users);
+                if (mm_users == 1) {
+                        mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
+                        mm->numa_scan_seq = 0;
+                }
+        }
+        p->node_stamp = 0;
+        p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
+        p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+        p->numa_work.next = &p->numa_work;
+        p->numa_faults = NULL;
+        p->numa_group = NULL;
+        p->last_task_numa_placement = 0;
+        p->last_sum_exec_runtime = 0;
+
+        /* New address space, reset the preferred nid */
+        if (!(clone_flags & CLONE_VM)) {
+                p->numa_preferred_nid = -1;
+                return;
+        }
+
+        /*
+         * New thread, keep existing numa_preferred_nid which should be copied
+         * already by arch_dup_task_struct but stagger when scans start.
+         */
+        if (mm) {
+                unsigned int delay;
+
+                delay = min_t(unsigned int, task_scan_max(current),
+                        current->numa_scan_period * mm_users * NSEC_PER_MSEC);
+                delay += 2 * TICK_NSEC;
+                p->node_stamp = delay;
+        }
+}
+
 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
 {
         rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -5344,6 +5385,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         struct cfs_rq *cfs_rq;
         struct sched_entity *se = &p->se;
 
+        /*
+         * The code below (indirectly) updates schedutil which looks at
+         * the cfs_rq utilization to select a frequency.
+         * Let's add the task's estimated utilization to the cfs_rq's
+         * estimated utilization, before we update schedutil.
+         */
+        util_est_enqueue(&rq->cfs, p);
+
         /*
          * If in_iowait is set, the code below may not trigger any cpufreq
          * utilization updates, so do it here explicitly with the IOWAIT flag
@@ -5385,7 +5434,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         if (!se)
                 add_nr_running(rq, 1);
 
-        util_est_enqueue(&rq->cfs, p);
         hrtick_update(rq);
 }
@@ -5858,8 +5906,8 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
          * a cpufreq perspective, it's better to have higher utilisation
          * on one CPU.
          */
-        if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
-                return idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
+        if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu))
+                return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu;
 
         if (sync && cpu_rq(this_cpu)->nr_running == 1)
                 return this_cpu;
@@ -6102,7 +6150,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this
 
         /* Traverse only the allowed CPUs */
         for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) {
-                if (idle_cpu(i)) {
+                if (available_idle_cpu(i)) {
                         struct rq *rq = cpu_rq(i);
                         struct cpuidle_state *idle = idle_get_state(rq);
                         if (idle && idle->exit_latency < min_exit_latency) {
@@ -6144,6 +6192,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
         if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
                 return prev_cpu;
 
+        /*
+         * We need task's util for capacity_spare_wake, sync it up to prev_cpu's
+         * last_update_time.
+         */
+        if (!(sd_flag & SD_BALANCE_FORK))
+                sync_entity_load_avg(&p->se);
+
         while (sd) {
                 struct sched_group *group;
                 struct sched_domain *tmp;
@@ -6224,7 +6279,7 @@ void __update_idle_core(struct rq *rq)
                 if (cpu == core)
                         continue;
 
-                if (!idle_cpu(cpu))
+                if (!available_idle_cpu(cpu))
                         goto unlock;
         }
@@ -6256,7 +6311,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 
                 for_each_cpu(cpu, cpu_smt_mask(core)) {
                         cpumask_clear_cpu(cpu, cpus);
-                        if (!idle_cpu(cpu))
+                        if (!available_idle_cpu(cpu))
                                 idle = false;
                 }
@@ -6285,7 +6340,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
         for_each_cpu(cpu, cpu_smt_mask(target)) {
                 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
                         continue;
-                if (idle_cpu(cpu))
+                if (available_idle_cpu(cpu))
                         return cpu;
         }
@@ -6348,7 +6403,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
                         return -1;
                 if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
                         continue;
-                if (idle_cpu(cpu))
+                if (available_idle_cpu(cpu))
                         break;
         }
@@ -6368,13 +6423,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
         struct sched_domain *sd;
         int i, recent_used_cpu;
 
-        if (idle_cpu(target))
+        if (available_idle_cpu(target))
                 return target;
 
         /*
          * If the previous CPU is cache affine and idle, don't be stupid:
          */
-        if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
+        if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev))
                 return prev;
 
         /* Check a recently used CPU as a potential idle candidate: */
@@ -6382,7 +6437,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
         if (recent_used_cpu != prev &&
             recent_used_cpu != target &&
             cpus_share_cache(recent_used_cpu, target) &&
-            idle_cpu(recent_used_cpu) &&
+            available_idle_cpu(recent_used_cpu) &&
             cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
                 /*
                  * Replace recent_used_cpu with prev as it is a potential
@@ -6558,7 +6613,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 static int
 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
 {
-        struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+        struct sched_domain *tmp, *sd = NULL;
         int cpu = smp_processor_id();
         int new_cpu = prev_cpu;
         int want_affine = 0;
@@ -6581,7 +6636,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                  */
                 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
                     cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
-                        affine_sd = tmp;
+                        if (cpu != prev_cpu)
+                                new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync);
+
+                        sd = NULL; /* Prefer wake_affine over balance flags */
                         break;
                 }
@@ -6591,33 +6649,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                         break;
         }
 
-        if (affine_sd) {
-                sd = NULL; /* Prefer wake_affine over balance flags */
-                if (cpu == prev_cpu)
-                        goto pick_cpu;
-
-                new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
-        }
-
-        if (sd && !(sd_flag & SD_BALANCE_FORK)) {
-                /*
-                 * We're going to need the task's util for capacity_spare_wake
-                 * in find_idlest_group. Sync it up to prev_cpu's
-                 * last_update_time.
-                 */
-                sync_entity_load_avg(&p->se);
-        }
-
-        if (!sd) {
-pick_cpu:
-                if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
-                        new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
-
-                        if (want_affine)
-                                current->recent_used_cpu = cpu;
-                }
-        } else {
+        if (unlikely(sd)) {
+                /* Slow path */
                 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
+        } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
+                /* Fast path */
+                new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
+
+                if (want_affine)
+                        current->recent_used_cpu = cpu;
         }
 
         rcu_read_unlock();

View File

@@ -1069,6 +1069,12 @@ enum numa_faults_stats {
 extern void sched_setnuma(struct task_struct *p, int node);
 extern int migrate_task_to(struct task_struct *p, int cpu);
 extern int migrate_swap(struct task_struct *, struct task_struct *);
+extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
+#else
+static inline void
+init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
+{
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_SMP