Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The biggest changes in this cycle were:

   - Make kcpustat vtime aware (Frederic Weisbecker)

   - Rework the CFS load_balance() logic (Vincent Guittot)

   - Misc cleanups, smaller enhancements, fixes.

  The load-balancing rework is the most intrusive change: it replaces
  the old heuristics that have become less meaningful after the
  introduction of the PELT metrics, with a grounds-up load-balancing
  algorithm.

  As such it's not really an iterative series, but replaces the old
  load-balancing logic with the new one. We hope there are no
  performance regressions left - but statistically it's highly probable
  that there *is* going to be some workload that is hurting from these
  chnages. If so then we'd prefer to have a look at that workload and
  fix its scheduling, instead of reverting the changes"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (46 commits)
  rackmeter: Use vtime aware kcpustat accessor
  leds: Use all-in-one vtime aware kcpustat accessor
  cpufreq: Use vtime aware kcpustat accessors for user time
  procfs: Use all-in-one vtime aware kcpustat accessor
  sched/vtime: Bring up complete kcpustat accessor
  sched/cputime: Support other fields on kcpustat_field()
  sched/cpufreq: Move the cfs_rq_util_change() call to cpufreq_update_util()
  sched/fair: Add comments for group_type and balancing at SD_NUMA level
  sched/fair: Fix rework of find_idlest_group()
  sched/uclamp: Fix overzealous type replacement
  sched/Kconfig: Fix spelling mistake in user-visible help text
  sched/core: Further clarify sched_class::set_next_task()
  sched/fair: Use mul_u32_u32()
  sched/core: Simplify sched_class::pick_next_task()
  sched/core: Optimize pick_next_task()
  sched/core: Make pick_next_task_idle() more consistent
  sched/fair: Better document newidle_balance()
  leds: Use vtime aware kcpustat accessor to fetch CPUTIME_SYSTEM
  cpufreq: Use vtime aware kcpustat accessor to fetch CPUTIME_SYSTEM
  procfs: Use vtime aware kcpustat accessor to fetch CPUTIME_SYSTEM
  ...
This commit is contained in:
Linus Torvalds 2019-11-26 15:23:14 -08:00
commit 77a05940ee
28 changed files with 1334 additions and 758 deletions

View File

@ -132,7 +132,7 @@ static __u64 vtime_delta(struct task_struct *tsk)
return delta_stime;
}
void vtime_account_system(struct task_struct *tsk)
void vtime_account_kernel(struct task_struct *tsk)
{
struct thread_info *ti = task_thread_info(tsk);
__u64 stime = vtime_delta(tsk);
@ -146,7 +146,7 @@ void vtime_account_system(struct task_struct *tsk)
else
ti->stime += stime;
}
EXPORT_SYMBOL_GPL(vtime_account_system);
EXPORT_SYMBOL_GPL(vtime_account_kernel);
void vtime_account_idle(struct task_struct *tsk)
{

View File

@ -338,7 +338,7 @@ static unsigned long vtime_delta(struct task_struct *tsk,
return stime;
}
void vtime_account_system(struct task_struct *tsk)
void vtime_account_kernel(struct task_struct *tsk)
{
unsigned long stime, stime_scaled, steal_time;
struct cpu_accounting_data *acct = get_accounting(tsk);
@ -366,7 +366,7 @@ void vtime_account_system(struct task_struct *tsk)
#endif
}
}
EXPORT_SYMBOL_GPL(vtime_account_system);
EXPORT_SYMBOL_GPL(vtime_account_kernel);
void vtime_account_idle(struct task_struct *tsk)
{
@ -395,7 +395,7 @@ static void vtime_flush_scaled(struct task_struct *tsk,
/*
* Account the whole cputime accumulated in the paca
* Must be called with interrupts disabled.
* Assumes that vtime_account_system/idle() has been called
* Assumes that vtime_account_kernel/idle() has been called
* recently (i.e. since the last entry from usermode) so that
* get_paca()->user_time_scaled is up to date.
*/

View File

@ -247,9 +247,9 @@ void vtime_account_irq_enter(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
void vtime_account_system(struct task_struct *tsk)
void vtime_account_kernel(struct task_struct *tsk)
__attribute__((alias("vtime_account_irq_enter")));
EXPORT_SYMBOL_GPL(vtime_account_system);
EXPORT_SYMBOL_GPL(vtime_account_kernel);
/*
* Sorted add to a list. List is linear searched until first bigger

View File

@ -354,7 +354,7 @@ For 32-bit we have the following conventions - kernel is built with
.macro CALL_enter_from_user_mode
#ifdef CONFIG_CONTEXT_TRACKING
#ifdef CONFIG_JUMP_LABEL
STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0
STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_key, def=0
#endif
call enter_from_user_mode
.Lafter_call_\@:

View File

@ -113,18 +113,21 @@ EXPORT_SYMBOL_GPL(get_governor_parent_kobj);
static inline u64 get_cpu_idle_time_jiffy(unsigned int cpu, u64 *wall)
{
u64 idle_time;
struct kernel_cpustat kcpustat;
u64 cur_wall_time;
u64 idle_time;
u64 busy_time;
cur_wall_time = jiffies64_to_nsecs(get_jiffies_64());
busy_time = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_IRQ];
busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_SOFTIRQ];
busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_STEAL];
busy_time += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
kcpustat_cpu_fetch(&kcpustat, cpu);
busy_time = kcpustat.cpustat[CPUTIME_USER];
busy_time += kcpustat.cpustat[CPUTIME_SYSTEM];
busy_time += kcpustat.cpustat[CPUTIME_IRQ];
busy_time += kcpustat.cpustat[CPUTIME_SOFTIRQ];
busy_time += kcpustat.cpustat[CPUTIME_STEAL];
busy_time += kcpustat.cpustat[CPUTIME_NICE];
idle_time = cur_wall_time - busy_time;
if (wall)

View File

@ -105,7 +105,7 @@ void gov_update_cpu_data(struct dbs_data *dbs_data)
j_cdbs->prev_cpu_idle = get_cpu_idle_time(j, &j_cdbs->prev_update_time,
dbs_data->io_is_busy);
if (dbs_data->ignore_nice_load)
j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
j_cdbs->prev_cpu_nice = kcpustat_field(&kcpustat_cpu(j), CPUTIME_NICE, j);
}
}
}
@ -149,7 +149,7 @@ unsigned int dbs_update(struct cpufreq_policy *policy)
j_cdbs->prev_cpu_idle = cur_idle_time;
if (ignore_nice) {
u64 cur_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
u64 cur_nice = kcpustat_field(&kcpustat_cpu(j), CPUTIME_NICE, j);
idle_time += div_u64(cur_nice - j_cdbs->prev_cpu_nice, NSEC_PER_USEC);
j_cdbs->prev_cpu_nice = cur_nice;
@ -530,7 +530,7 @@ int cpufreq_dbs_governor_start(struct cpufreq_policy *policy)
j_cdbs->prev_load = 0;
if (ignore_nice)
j_cdbs->prev_cpu_nice = kcpustat_cpu(j).cpustat[CPUTIME_NICE];
j_cdbs->prev_cpu_nice = kcpustat_field(&kcpustat_cpu(j), CPUTIME_NICE, j);
}
gov->start(policy);

View File

@ -57,11 +57,15 @@ static void led_activity_function(struct timer_list *t)
curr_used = 0;
for_each_possible_cpu(i) {
curr_used += kcpustat_cpu(i).cpustat[CPUTIME_USER]
+ kcpustat_cpu(i).cpustat[CPUTIME_NICE]
+ kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM]
+ kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ]
+ kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
struct kernel_cpustat kcpustat;
kcpustat_cpu_fetch(&kcpustat, i);
curr_used += kcpustat.cpustat[CPUTIME_USER]
+ kcpustat.cpustat[CPUTIME_NICE]
+ kcpustat.cpustat[CPUTIME_SYSTEM]
+ kcpustat.cpustat[CPUTIME_SOFTIRQ]
+ kcpustat.cpustat[CPUTIME_IRQ];
cpus++;
}

View File

@ -81,13 +81,14 @@ static int rackmeter_ignore_nice;
*/
static inline u64 get_cpu_idle_time(unsigned int cpu)
{
struct kernel_cpustat *kcpustat = &kcpustat_cpu(cpu);
u64 retval;
retval = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE] +
kcpustat_cpu(cpu).cpustat[CPUTIME_IOWAIT];
retval = kcpustat->cpustat[CPUTIME_IDLE] +
kcpustat->cpustat[CPUTIME_IOWAIT];
if (rackmeter_ignore_nice)
retval += kcpustat_cpu(cpu).cpustat[CPUTIME_NICE];
retval += kcpustat_field(kcpustat, CPUTIME_NICE, cpu);
return retval;
}

View File

@ -120,20 +120,23 @@ static int show_stat(struct seq_file *p, void *v)
getboottime64(&boottime);
for_each_possible_cpu(i) {
struct kernel_cpustat *kcs = &kcpustat_cpu(i);
struct kernel_cpustat kcpustat;
u64 *cpustat = kcpustat.cpustat;
user += kcs->cpustat[CPUTIME_USER];
nice += kcs->cpustat[CPUTIME_NICE];
system += kcs->cpustat[CPUTIME_SYSTEM];
idle += get_idle_time(kcs, i);
iowait += get_iowait_time(kcs, i);
irq += kcs->cpustat[CPUTIME_IRQ];
softirq += kcs->cpustat[CPUTIME_SOFTIRQ];
steal += kcs->cpustat[CPUTIME_STEAL];
guest += kcs->cpustat[CPUTIME_GUEST];
guest_nice += kcs->cpustat[CPUTIME_GUEST_NICE];
sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);
kcpustat_cpu_fetch(&kcpustat, i);
user += cpustat[CPUTIME_USER];
nice += cpustat[CPUTIME_NICE];
system += cpustat[CPUTIME_SYSTEM];
idle += get_idle_time(&kcpustat, i);
iowait += get_iowait_time(&kcpustat, i);
irq += cpustat[CPUTIME_IRQ];
softirq += cpustat[CPUTIME_SOFTIRQ];
steal += cpustat[CPUTIME_STEAL];
guest += cpustat[CPUTIME_GUEST];
guest_nice += cpustat[CPUTIME_USER];
sum += kstat_cpu_irqs_sum(i);
sum += arch_irq_stat_cpu(i);
for (j = 0; j < NR_SOFTIRQS; j++) {
unsigned int softirq_stat = kstat_softirqs_cpu(j, i);
@ -157,19 +160,22 @@ static int show_stat(struct seq_file *p, void *v)
seq_putc(p, '\n');
for_each_online_cpu(i) {
struct kernel_cpustat *kcs = &kcpustat_cpu(i);
struct kernel_cpustat kcpustat;
u64 *cpustat = kcpustat.cpustat;
kcpustat_cpu_fetch(&kcpustat, i);
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
user = kcs->cpustat[CPUTIME_USER];
nice = kcs->cpustat[CPUTIME_NICE];
system = kcs->cpustat[CPUTIME_SYSTEM];
idle = get_idle_time(kcs, i);
iowait = get_iowait_time(kcs, i);
irq = kcs->cpustat[CPUTIME_IRQ];
softirq = kcs->cpustat[CPUTIME_SOFTIRQ];
steal = kcs->cpustat[CPUTIME_STEAL];
guest = kcs->cpustat[CPUTIME_GUEST];
guest_nice = kcs->cpustat[CPUTIME_GUEST_NICE];
user = cpustat[CPUTIME_USER];
nice = cpustat[CPUTIME_NICE];
system = cpustat[CPUTIME_SYSTEM];
idle = get_idle_time(&kcpustat, i);
iowait = get_iowait_time(&kcpustat, i);
irq = cpustat[CPUTIME_IRQ];
softirq = cpustat[CPUTIME_SOFTIRQ];
steal = cpustat[CPUTIME_STEAL];
guest = cpustat[CPUTIME_GUEST];
guest_nice = cpustat[CPUTIME_USER];
seq_printf(p, "cpu%d", i);
seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));

View File

@ -22,26 +22,26 @@ extern void context_tracking_user_exit(void);
static inline void user_enter(void)
{
if (context_tracking_is_enabled())
if (context_tracking_enabled())
context_tracking_enter(CONTEXT_USER);
}
static inline void user_exit(void)
{
if (context_tracking_is_enabled())
if (context_tracking_enabled())
context_tracking_exit(CONTEXT_USER);
}
/* Called with interrupts disabled. */
static inline void user_enter_irqoff(void)
{
if (context_tracking_is_enabled())
if (context_tracking_enabled())
__context_tracking_enter(CONTEXT_USER);
}
static inline void user_exit_irqoff(void)
{
if (context_tracking_is_enabled())
if (context_tracking_enabled())
__context_tracking_exit(CONTEXT_USER);
}
@ -49,7 +49,7 @@ static inline enum ctx_state exception_enter(void)
{
enum ctx_state prev_ctx;
if (!context_tracking_is_enabled())
if (!context_tracking_enabled())
return 0;
prev_ctx = this_cpu_read(context_tracking.state);
@ -61,7 +61,7 @@ static inline enum ctx_state exception_enter(void)
static inline void exception_exit(enum ctx_state prev_ctx)
{
if (context_tracking_is_enabled()) {
if (context_tracking_enabled()) {
if (prev_ctx != CONTEXT_KERNEL)
context_tracking_enter(prev_ctx);
}
@ -77,7 +77,7 @@ static inline void exception_exit(enum ctx_state prev_ctx)
*/
static inline enum ctx_state ct_state(void)
{
return context_tracking_is_enabled() ?
return context_tracking_enabled() ?
this_cpu_read(context_tracking.state) : CONTEXT_DISABLED;
}
#else
@ -90,7 +90,7 @@ static inline void exception_exit(enum ctx_state prev_ctx) { }
static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
#endif /* !CONFIG_CONTEXT_TRACKING */
#define CT_WARN_ON(cond) WARN_ON(context_tracking_is_enabled() && (cond))
#define CT_WARN_ON(cond) WARN_ON(context_tracking_enabled() && (cond))
#ifdef CONFIG_CONTEXT_TRACKING_FORCE
extern void context_tracking_init(void);
@ -103,12 +103,12 @@ static inline void context_tracking_init(void) { }
/* must be called with irqs disabled */
static inline void guest_enter_irqoff(void)
{
if (vtime_accounting_cpu_enabled())
if (vtime_accounting_enabled_this_cpu())
vtime_guest_enter(current);
else
current->flags |= PF_VCPU;
if (context_tracking_is_enabled())
if (context_tracking_enabled())
__context_tracking_enter(CONTEXT_GUEST);
/* KVM does not hold any references to rcu protected data when it
@ -118,16 +118,16 @@ static inline void guest_enter_irqoff(void)
* one time slice). Lets treat guest mode as quiescent state, just like
* we do with user-mode execution.
*/
if (!context_tracking_cpu_is_enabled())
if (!context_tracking_enabled_this_cpu())
rcu_virt_note_context_switch(smp_processor_id());
}
static inline void guest_exit_irqoff(void)
{
if (context_tracking_is_enabled())
if (context_tracking_enabled())
__context_tracking_exit(CONTEXT_GUEST);
if (vtime_accounting_cpu_enabled())
if (vtime_accounting_enabled_this_cpu())
vtime_guest_exit(current);
else
current->flags &= ~PF_VCPU;
@ -141,7 +141,7 @@ static inline void guest_enter_irqoff(void)
* to assume that it's the stime pending cputime
* to flush.
*/
vtime_account_system(current);
vtime_account_kernel(current);
current->flags |= PF_VCPU;
rcu_virt_note_context_switch(smp_processor_id());
}
@ -149,7 +149,7 @@ static inline void guest_enter_irqoff(void)
static inline void guest_exit_irqoff(void)
{
/* Flush the guest cputime we spent on the guest */
vtime_account_system(current);
vtime_account_kernel(current);
current->flags &= ~PF_VCPU;
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */

View File

@ -23,17 +23,22 @@ struct context_tracking {
};
#ifdef CONFIG_CONTEXT_TRACKING
extern struct static_key_false context_tracking_enabled;
extern struct static_key_false context_tracking_key;
DECLARE_PER_CPU(struct context_tracking, context_tracking);
static inline bool context_tracking_is_enabled(void)
static inline bool context_tracking_enabled(void)
{
return static_branch_unlikely(&context_tracking_enabled);
return static_branch_unlikely(&context_tracking_key);
}
static inline bool context_tracking_cpu_is_enabled(void)
static inline bool context_tracking_enabled_cpu(int cpu)
{
return __this_cpu_read(context_tracking.active);
return context_tracking_enabled() && per_cpu(context_tracking.active, cpu);
}
static inline bool context_tracking_enabled_this_cpu(void)
{
return context_tracking_enabled() && __this_cpu_read(context_tracking.active);
}
static inline bool context_tracking_in_user(void)
@ -42,9 +47,9 @@ static inline bool context_tracking_in_user(void)
}
#else
static inline bool context_tracking_in_user(void) { return false; }
static inline bool context_tracking_active(void) { return false; }
static inline bool context_tracking_is_enabled(void) { return false; }
static inline bool context_tracking_cpu_is_enabled(void) { return false; }
static inline bool context_tracking_enabled(void) { return false; }
static inline bool context_tracking_enabled_cpu(int cpu) { return false; }
static inline bool context_tracking_enabled_this_cpu(void) { return false; }
#endif /* CONFIG_CONTEXT_TRACKING */
#endif

View File

@ -78,6 +78,24 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu)
return kstat_cpu(cpu).irqs_sum;
}
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
extern u64 kcpustat_field(struct kernel_cpustat *kcpustat,
enum cpu_usage_stat usage, int cpu);
extern void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu);
#else
static inline u64 kcpustat_field(struct kernel_cpustat *kcpustat,
enum cpu_usage_stat usage, int cpu)
{
return kcpustat->cpustat[usage];
}
static inline void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
{
*dst = kcpustat_cpu(cpu);
}
#endif
extern void account_user_time(struct task_struct *, u64);
extern void account_guest_time(struct task_struct *, u64);
extern void account_system_time(struct task_struct *, int, u64);

View File

@ -250,16 +250,21 @@ struct prev_cputime {
enum vtime_state {
/* Task is sleeping or running in a CPU with VTIME inactive: */
VTIME_INACTIVE = 0,
/* Task runs in userspace in a CPU with VTIME active: */
VTIME_USER,
/* Task is idle */
VTIME_IDLE,
/* Task runs in kernelspace in a CPU with VTIME active: */
VTIME_SYS,
/* Task runs in userspace in a CPU with VTIME active: */
VTIME_USER,
/* Task runs as guests in a CPU with VTIME active: */
VTIME_GUEST,
};
struct vtime {
seqcount_t seqcount;
unsigned long long starttime;
enum vtime_state state;
unsigned int cpu;
u64 utime;
u64 stime;
u64 gtime;

View File

@ -174,7 +174,7 @@ extern cpumask_var_t tick_nohz_full_mask;
static inline bool tick_nohz_full_enabled(void)
{
if (!context_tracking_is_enabled())
if (!context_tracking_enabled())
return false;
return tick_nohz_full_running;

View File

@ -11,11 +11,15 @@
struct task_struct;
/*
* vtime_accounting_cpu_enabled() definitions/declarations
* vtime_accounting_enabled_this_cpu() definitions/declarations
*/
#if defined(CONFIG_VIRT_CPU_ACCOUNTING_NATIVE)
static inline bool vtime_accounting_cpu_enabled(void) { return true; }
static inline bool vtime_accounting_enabled_this_cpu(void) { return true; }
extern void vtime_task_switch(struct task_struct *prev);
#elif defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN)
/*
* Checks if vtime is enabled on some CPU. Cputime readers want to be careful
* in that case and compute the tickless cputime.
@ -24,46 +28,43 @@ static inline bool vtime_accounting_cpu_enabled(void) { return true; }
*/
static inline bool vtime_accounting_enabled(void)
{
return context_tracking_is_enabled();
return context_tracking_enabled();
}
static inline bool vtime_accounting_cpu_enabled(void)
static inline bool vtime_accounting_enabled_cpu(int cpu)
{
if (vtime_accounting_enabled()) {
if (context_tracking_cpu_is_enabled())
return true;
}
return false;
return context_tracking_enabled_cpu(cpu);
}
#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
static inline bool vtime_accounting_cpu_enabled(void) { return false; }
#endif
static inline bool vtime_accounting_enabled_this_cpu(void)
{
return context_tracking_enabled_this_cpu();
}
extern void vtime_task_switch_generic(struct task_struct *prev);
static inline void vtime_task_switch(struct task_struct *prev)
{
if (vtime_accounting_enabled_this_cpu())
vtime_task_switch_generic(prev);
}
#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
static inline bool vtime_accounting_enabled_cpu(int cpu) {return false; }
static inline bool vtime_accounting_enabled_this_cpu(void) { return false; }
static inline void vtime_task_switch(struct task_struct *prev) { }
#endif
/*
* Common vtime APIs
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
#ifdef __ARCH_HAS_VTIME_TASK_SWITCH
extern void vtime_task_switch(struct task_struct *prev);
#else
extern void vtime_common_task_switch(struct task_struct *prev);
static inline void vtime_task_switch(struct task_struct *prev)
{
if (vtime_accounting_cpu_enabled())
vtime_common_task_switch(prev);
}
#endif /* __ARCH_HAS_VTIME_TASK_SWITCH */
extern void vtime_account_system(struct task_struct *tsk);
extern void vtime_account_kernel(struct task_struct *tsk);
extern void vtime_account_idle(struct task_struct *tsk);
#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
static inline void vtime_task_switch(struct task_struct *prev) { }
static inline void vtime_account_system(struct task_struct *tsk) { }
static inline void vtime_account_kernel(struct task_struct *tsk) { }
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@ -86,7 +87,7 @@ extern void vtime_account_irq_enter(struct task_struct *tsk);
static inline void vtime_account_irq_exit(struct task_struct *tsk)
{
/* On hard|softirq exit we always account to hard|softirq cputime */
vtime_account_system(tsk);
vtime_account_kernel(tsk);
}
extern void vtime_flush(struct task_struct *tsk);
#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */

View File

@ -65,7 +65,7 @@ config PREEMPT_RT
preemptible priority-inheritance aware variants, enforcing
interrupt threading and introducing mechanisms to break up long
non-preemptible sections. This makes the kernel, except for very
low level and critical code pathes (entry code, scheduler, low
low level and critical code paths (entry code, scheduler, low
level interrupt handling) fully preemptible and brings most
execution contexts under scheduler control.

View File

@ -25,8 +25,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
DEFINE_STATIC_KEY_FALSE(context_tracking_enabled);
EXPORT_SYMBOL_GPL(context_tracking_enabled);
DEFINE_STATIC_KEY_FALSE(context_tracking_key);
EXPORT_SYMBOL_GPL(context_tracking_key);
DEFINE_PER_CPU(struct context_tracking, context_tracking);
EXPORT_SYMBOL_GPL(context_tracking);
@ -192,7 +192,7 @@ void __init context_tracking_cpu_set(int cpu)
if (!per_cpu(context_tracking.active, cpu)) {
per_cpu(context_tracking.active, cpu) = true;
static_branch_inc(&context_tracking_enabled);
static_branch_inc(&context_tracking_key);
}
if (initialized)

View File

@ -811,7 +811,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
}
static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id)
static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
return 0;
@ -854,7 +854,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
}
static inline
enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id,
unsigned int clamp_value)
{
struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
@ -919,7 +919,7 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id)
return uc_req;
}
enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
{
struct uclamp_se uc_eff;
@ -3918,13 +3918,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
prev->sched_class == &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = fair_sched_class.pick_next_task(rq, prev, rf);
p = pick_next_task_fair(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
goto restart;
/* Assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
p = idle_sched_class.pick_next_task(rq, prev, rf);
if (!p) {
put_prev_task(rq, prev);
p = pick_next_task_idle(rq);
}
return p;
}
@ -3948,7 +3950,7 @@ restart:
put_prev_task(rq, prev);
for_each_class(class) {
p = class->pick_next_task(rq, NULL, NULL);
p = class->pick_next_task(rq);
if (p)
return p;
}
@ -6217,7 +6219,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq)
struct task_struct *next;
for_each_class(class) {
next = class->pick_next_task(rq, NULL, NULL);
next = class->pick_next_task(rq);
if (next) {
next->sched_class->put_prev_task(rq, next);
return next;

View File

@ -405,27 +405,25 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_
/*
* Use precise platform statistics if available:
*/
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
void vtime_common_task_switch(struct task_struct *prev)
void vtime_task_switch(struct task_struct *prev)
{
if (is_idle_task(prev))
vtime_account_idle(prev);
else
vtime_account_system(prev);
vtime_account_kernel(prev);
vtime_flush(prev);
arch_vtime_task_switch(prev);
}
# endif
#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Archs that account the whole time spent in the idle task
* (outside irq) as idle time can rely on this and just implement
* vtime_account_system() and vtime_account_idle(). Archs that
* vtime_account_kernel() and vtime_account_idle(). Archs that
* have other meaning of the idle time (s390 only includes the
* time spent by the CPU when it's in low power mode) must override
* vtime_account().
@ -436,7 +434,7 @@ void vtime_account_irq_enter(struct task_struct *tsk)
if (!in_interrupt() && is_idle_task(tsk))
vtime_account_idle(tsk);
else
vtime_account_system(tsk);
vtime_account_kernel(tsk);
}
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
@ -477,7 +475,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
u64 cputime, steal;
struct rq *rq = this_rq();
if (vtime_accounting_cpu_enabled())
if (vtime_accounting_enabled_this_cpu())
return;
if (sched_clock_irqtime) {
@ -711,8 +709,8 @@ static u64 get_vtime_delta(struct vtime *vtime)
return delta - other;
}
static void __vtime_account_system(struct task_struct *tsk,
struct vtime *vtime)
static void vtime_account_system(struct task_struct *tsk,
struct vtime *vtime)
{
vtime->stime += get_vtime_delta(vtime);
if (vtime->stime >= TICK_NSEC) {
@ -731,7 +729,17 @@ static void vtime_account_guest(struct task_struct *tsk,
}
}
void vtime_account_system(struct task_struct *tsk)
static void __vtime_account_kernel(struct task_struct *tsk,
struct vtime *vtime)
{
/* We might have scheduled out from guest path */
if (vtime->state == VTIME_GUEST)
vtime_account_guest(tsk, vtime);
else
vtime_account_system(tsk, vtime);
}
void vtime_account_kernel(struct task_struct *tsk)
{
struct vtime *vtime = &tsk->vtime;
@ -739,11 +747,7 @@ void vtime_account_system(struct task_struct *tsk)
return;
write_seqcount_begin(&vtime->seqcount);
/* We might have scheduled out from guest path */
if (tsk->flags & PF_VCPU)
vtime_account_guest(tsk, vtime);
else
__vtime_account_system(tsk, vtime);
__vtime_account_kernel(tsk, vtime);
write_seqcount_end(&vtime->seqcount);
}
@ -752,7 +756,7 @@ void vtime_user_enter(struct task_struct *tsk)
struct vtime *vtime = &tsk->vtime;
write_seqcount_begin(&vtime->seqcount);
__vtime_account_system(tsk, vtime);
vtime_account_system(tsk, vtime);
vtime->state = VTIME_USER;
write_seqcount_end(&vtime->seqcount);
}
@ -782,8 +786,9 @@ void vtime_guest_enter(struct task_struct *tsk)
* that can thus safely catch up with a tickless delta.
*/
write_seqcount_begin(&vtime->seqcount);
__vtime_account_system(tsk, vtime);
vtime_account_system(tsk, vtime);
tsk->flags |= PF_VCPU;
vtime->state = VTIME_GUEST;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
@ -795,6 +800,7 @@ void vtime_guest_exit(struct task_struct *tsk)
write_seqcount_begin(&vtime->seqcount);
vtime_account_guest(tsk, vtime);
tsk->flags &= ~PF_VCPU;
vtime->state = VTIME_SYS;
write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
@ -804,19 +810,30 @@ void vtime_account_idle(struct task_struct *tsk)
account_idle_time(get_vtime_delta(&tsk->vtime));
}
void arch_vtime_task_switch(struct task_struct *prev)
void vtime_task_switch_generic(struct task_struct *prev)
{
struct vtime *vtime = &prev->vtime;
write_seqcount_begin(&vtime->seqcount);
if (vtime->state == VTIME_IDLE)
vtime_account_idle(prev);
else
__vtime_account_kernel(prev, vtime);
vtime->state = VTIME_INACTIVE;
vtime->cpu = -1;
write_seqcount_end(&vtime->seqcount);
vtime = &current->vtime;
write_seqcount_begin(&vtime->seqcount);
vtime->state = VTIME_SYS;
if (is_idle_task(current))
vtime->state = VTIME_IDLE;
else if (current->flags & PF_VCPU)
vtime->state = VTIME_GUEST;
else
vtime->state = VTIME_SYS;
vtime->starttime = sched_clock();
vtime->cpu = smp_processor_id();
write_seqcount_end(&vtime->seqcount);
}
@ -827,8 +844,9 @@ void vtime_init_idle(struct task_struct *t, int cpu)
local_irq_save(flags);
write_seqcount_begin(&vtime->seqcount);
vtime->state = VTIME_SYS;
vtime->state = VTIME_IDLE;
vtime->starttime = sched_clock();
vtime->cpu = cpu;
write_seqcount_end(&vtime->seqcount);
local_irq_restore(flags);
}
@ -846,7 +864,7 @@ u64 task_gtime(struct task_struct *t)
seq = read_seqcount_begin(&vtime->seqcount);
gtime = t->gtime;
if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
if (vtime->state == VTIME_GUEST)
gtime += vtime->gtime + vtime_delta(vtime);
} while (read_seqcount_retry(&vtime->seqcount, seq));
@ -877,20 +895,230 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
*utime = t->utime;
*stime = t->stime;
/* Task is sleeping, nothing to add */
if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
/* Task is sleeping or idle, nothing to add */
if (vtime->state < VTIME_SYS)
continue;
delta = vtime_delta(vtime);
/*
* Task runs either in user or kernel space, add pending nohz time to
* the right place.
* Task runs either in user (including guest) or kernel space,
* add pending nohz time to the right place.
*/
if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
*utime += vtime->utime + delta;
else if (vtime->state == VTIME_SYS)
if (vtime->state == VTIME_SYS)
*stime += vtime->stime + delta;
else
*utime += vtime->utime + delta;
} while (read_seqcount_retry(&vtime->seqcount, seq));
}
static int vtime_state_check(struct vtime *vtime, int cpu)
{
/*
* We raced against a context switch, fetch the
* kcpustat task again.
*/
if (vtime->cpu != cpu && vtime->cpu != -1)
return -EAGAIN;
/*
* Two possible things here:
* 1) We are seeing the scheduling out task (prev) or any past one.
* 2) We are seeing the scheduling in task (next) but it hasn't
* passed though vtime_task_switch() yet so the pending
* cputime of the prev task may not be flushed yet.
*
* Case 1) is ok but 2) is not. So wait for a safe VTIME state.
*/
if (vtime->state == VTIME_INACTIVE)
return -EAGAIN;
return 0;
}
static u64 kcpustat_user_vtime(struct vtime *vtime)
{
if (vtime->state == VTIME_USER)
return vtime->utime + vtime_delta(vtime);
else if (vtime->state == VTIME_GUEST)
return vtime->gtime + vtime_delta(vtime);
return 0;
}
static int kcpustat_field_vtime(u64 *cpustat,
struct task_struct *tsk,
enum cpu_usage_stat usage,
int cpu, u64 *val)
{
struct vtime *vtime = &tsk->vtime;
unsigned int seq;
int err;
do {
seq = read_seqcount_begin(&vtime->seqcount);
err = vtime_state_check(vtime, cpu);
if (err < 0)
return err;
*val = cpustat[usage];
/*
* Nice VS unnice cputime accounting may be inaccurate if
* the nice value has changed since the last vtime update.
* But proper fix would involve interrupting target on nice
* updates which is a no go on nohz_full (although the scheduler
* may still interrupt the target if rescheduling is needed...)
*/
switch (usage) {
case CPUTIME_SYSTEM:
if (vtime->state == VTIME_SYS)
*val += vtime->stime + vtime_delta(vtime);
break;
case CPUTIME_USER:
if (task_nice(tsk) <= 0)
*val += kcpustat_user_vtime(vtime);
break;
case CPUTIME_NICE:
if (task_nice(tsk) > 0)
*val += kcpustat_user_vtime(vtime);
break;
case CPUTIME_GUEST:
if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0)
*val += vtime->gtime + vtime_delta(vtime);
break;
case CPUTIME_GUEST_NICE:
if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0)
*val += vtime->gtime + vtime_delta(vtime);
break;
default:
break;
}
} while (read_seqcount_retry(&vtime->seqcount, seq));
return 0;
}
u64 kcpustat_field(struct kernel_cpustat *kcpustat,
enum cpu_usage_stat usage, int cpu)
{
u64 *cpustat = kcpustat->cpustat;
struct rq *rq;
u64 val;
int err;
if (!vtime_accounting_enabled_cpu(cpu))
return cpustat[usage];
rq = cpu_rq(cpu);
for (;;) {
struct task_struct *curr;
rcu_read_lock();
curr = rcu_dereference(rq->curr);
if (WARN_ON_ONCE(!curr)) {
rcu_read_unlock();
return cpustat[usage];
}
err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val);
rcu_read_unlock();
if (!err)
return val;
cpu_relax();
}
}
EXPORT_SYMBOL_GPL(kcpustat_field);
static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst,
const struct kernel_cpustat *src,
struct task_struct *tsk, int cpu)
{
struct vtime *vtime = &tsk->vtime;
unsigned int seq;
int err;
do {
u64 *cpustat;
u64 delta;
seq = read_seqcount_begin(&vtime->seqcount);
err = vtime_state_check(vtime, cpu);
if (err < 0)
return err;
*dst = *src;
cpustat = dst->cpustat;
/* Task is sleeping, dead or idle, nothing to add */
if (vtime->state < VTIME_SYS)
continue;
delta = vtime_delta(vtime);
/*
* Task runs either in user (including guest) or kernel space,
* add pending nohz time to the right place.
*/
if (vtime->state == VTIME_SYS) {
cpustat[CPUTIME_SYSTEM] += vtime->stime + delta;
} else if (vtime->state == VTIME_USER) {
if (task_nice(tsk) > 0)
cpustat[CPUTIME_NICE] += vtime->utime + delta;
else
cpustat[CPUTIME_USER] += vtime->utime + delta;
} else {
WARN_ON_ONCE(vtime->state != VTIME_GUEST);
if (task_nice(tsk) > 0) {
cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta;
cpustat[CPUTIME_NICE] += vtime->gtime + delta;
} else {
cpustat[CPUTIME_GUEST] += vtime->gtime + delta;
cpustat[CPUTIME_USER] += vtime->gtime + delta;
}
}
} while (read_seqcount_retry(&vtime->seqcount, seq));
return err;
}
void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu)
{
const struct kernel_cpustat *src = &kcpustat_cpu(cpu);
struct rq *rq;
int err;
if (!vtime_accounting_enabled_cpu(cpu)) {
*dst = *src;
return;
}
rq = cpu_rq(cpu);
for (;;) {
struct task_struct *curr;
rcu_read_lock();
curr = rcu_dereference(rq->curr);
if (WARN_ON_ONCE(!curr)) {
rcu_read_unlock();
*dst = *src;
return;
}
err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu);
rcu_read_unlock();
if (!err)
return;
cpu_relax();
}
}
EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch);
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */

View File

@ -1743,13 +1743,16 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
}
#endif
static void set_next_task_dl(struct rq *rq, struct task_struct *p)
static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first)
{
p->se.exec_start = rq_clock_task(rq);
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
if (!first)
return;
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
@ -1770,22 +1773,19 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
static struct task_struct *
pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
static struct task_struct *pick_next_task_dl(struct rq *rq)
{
struct sched_dl_entity *dl_se;
struct dl_rq *dl_rq = &rq->dl;
struct task_struct *p;
WARN_ON_ONCE(prev || rf);
if (!sched_dl_runnable(rq))
return NULL;
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
p = dl_task_of(dl_se);
set_next_task_dl(rq, p);
set_next_task_dl(rq, p, true);
return p;
}

File diff suppressed because it is too large Load Diff

View File

@ -89,3 +89,4 @@ SCHED_FEAT(WA_BIAS, true)
* UtilEstimation. Use estimated CPU utilization.
*/
SCHED_FEAT(UTIL_EST, true)
SCHED_FEAT(UTIL_EST_FASTUP, true)

View File

@ -385,21 +385,17 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
}
static void set_next_task_idle(struct rq *rq, struct task_struct *next)
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
}
static struct task_struct *
pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
struct task_struct *pick_next_task_idle(struct rq *rq)
{
struct task_struct *next = rq->idle;
if (prev)
put_prev_task(rq, prev);
set_next_task_idle(rq, next);
set_next_task_idle(rq, next, true);
return next;
}

View File

@ -1515,13 +1515,16 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
#endif
}
static inline void set_next_task_rt(struct rq *rq, struct task_struct *p)
static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{
p->se.exec_start = rq_clock_task(rq);
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
if (!first)
return;
/*
* If prev task was rt, put_prev_task() has already updated the
* utilization. We only care of the case where we start to schedule a
@ -1564,18 +1567,15 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
return rt_task_of(rt_se);
}
static struct task_struct *
pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
static struct task_struct *pick_next_task_rt(struct rq *rq)
{
struct task_struct *p;
WARN_ON_ONCE(prev || rf);
if (!sched_rt_runnable(rq))
return NULL;
p = _pick_next_task_rt(rq);
set_next_task_rt(rq, p);
set_next_task_rt(rq, p, true);
return p;
}

View File

@ -1713,22 +1713,10 @@ struct sched_class {
void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
/*
* Both @prev and @rf are optional and may be NULL, in which case the
* caller must already have invoked put_prev_task(rq, prev, rf).
*
* Otherwise it is the responsibility of the pick_next_task() to call
* put_prev_task() on the @prev task or something equivalent, IFF it
* returns a next task.
*
* In that case (@rf != NULL) it may return RETRY_TASK when it finds a
* higher prio class has runnable tasks.
*/
struct task_struct * (*pick_next_task)(struct rq *rq,
struct task_struct *prev,
struct rq_flags *rf);
struct task_struct *(*pick_next_task)(struct rq *rq);
void (*put_prev_task)(struct rq *rq, struct task_struct *p);
void (*set_next_task)(struct rq *rq, struct task_struct *p);
void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first);
#ifdef CONFIG_SMP
int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
@ -1780,7 +1768,7 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
static inline void set_next_task(struct rq *rq, struct task_struct *next)
{
WARN_ON_ONCE(rq->curr != next);
next->sched_class->set_next_task(rq, next);
next->sched_class->set_next_task(rq, next, false);
}
#ifdef CONFIG_SMP
@ -1821,6 +1809,9 @@ static inline bool sched_fair_runnable(struct rq *rq)
return rq->cfs.nr_running > 0;
}
extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf);
extern struct task_struct *pick_next_task_idle(struct rq *rq);
#ifdef CONFIG_SMP
extern void update_group_capacity(struct sched_domain *sd, int cpu);
@ -2309,7 +2300,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
#endif /* CONFIG_CPU_FREQ */
#ifdef CONFIG_UCLAMP_TASK
enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id);
static __always_inline
unsigned int uclamp_util_with(struct rq *rq, unsigned int util,

View File

@ -29,20 +29,17 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
/* we're never preempted */
}
static void set_next_task_stop(struct rq *rq, struct task_struct *stop)
static void set_next_task_stop(struct rq *rq, struct task_struct *stop, bool first)
{
stop->se.exec_start = rq_clock_task(rq);
}
static struct task_struct *
pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
static struct task_struct *pick_next_task_stop(struct rq *rq)
{
WARN_ON_ONCE(prev || rf);
if (!sched_stop_runnable(rq))
return NULL;
set_next_task_stop(rq, rq->stop);
set_next_task_stop(rq, rq->stop, true);
return rq->stop;
}

View File

@ -1201,16 +1201,13 @@ static void set_domain_attribute(struct sched_domain *sd,
if (!attr || attr->relax_domain_level < 0) {
if (default_relax_domain_level < 0)
return;
else
request = default_relax_domain_level;
request = default_relax_domain_level;
} else
request = attr->relax_domain_level;
if (request < sd->level) {
if (sd->level > request) {
/* Turn off idle balance on this domain: */
sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
} else {
/* Turn on idle balance on this domain: */
sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
}
}

View File

@ -1119,7 +1119,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
unsigned long ticks;
if (vtime_accounting_cpu_enabled())
if (vtime_accounting_enabled_this_cpu())
return;
/*
* We stopped the tick in idle. Update process times would miss the