Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The main updates in this cycle were:

   - Group balancing enhancements and cleanups (Brendan Jackman)

   - Move CPU isolation related functionality into its separate
     kernel/sched/isolation.c file, with related 'housekeeping_*()'
     namespace and nomenclature et al. (Frederic Weisbecker)

   - Improve the interactive/cpu-intense fairness calculation (Josef
     Bacik)

   - Improve the PELT code and related cleanups (Peter Zijlstra)

   - Improve the logic of pick_next_task_fair() (Uladzislau Rezki)

   - Improve the RT IPI based balancing logic (Steven Rostedt)

   - Various micro-optimizations:

   - better !CONFIG_SCHED_DEBUG optimizations (Patrick Bellasi)

   - better idle loop (Cheng Jian)

   - ... plus misc fixes, cleanups and updates"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (54 commits)
  sched/core: Optimize sched_feat() for !CONFIG_SCHED_DEBUG builds
  sched/sysctl: Fix attributes of some extern declarations
  sched/isolation: Document isolcpus= boot parameter flags, mark it deprecated
  sched/isolation: Add basic isolcpus flags
  sched/isolation: Move isolcpus= handling to the housekeeping code
  sched/isolation: Handle the nohz_full= parameter
  sched/isolation: Introduce housekeeping flags
  sched/isolation: Split out new CONFIG_CPU_ISOLATION=y config from CONFIG_NO_HZ_FULL
  sched/isolation: Rename is_housekeeping_cpu() to housekeeping_cpu()
  sched/isolation: Use its own static key
  sched/isolation: Make the housekeeping cpumask private
  sched/isolation: Provide a dynamic off-case to housekeeping_any_cpu()
  sched/isolation, watchdog: Use housekeeping_cpumask() instead of ad-hoc version
  sched/isolation: Move housekeeping related code to its own file
  sched/idle: Micro-optimize the idle loop
  sched/isolcpus: Fix "isolcpus=" boot parameter handling when !CONFIG_CPUMASK_OFFSTACK
  x86/tsc: Append the 'tsc=' description for the 'tsc=unstable' boot parameter
  sched/rt: Simplify the IPI based RT balancing logic
  block/ioprio: Use a helper to check for RT prio
  sched/rt: Add a helper to test for a RT task
  ...
This commit is contained in:
Linus Torvalds 2017-11-13 13:37:52 -08:00
commit 3e2014637c
31 changed files with 1294 additions and 798 deletions

View File

@ -1730,20 +1730,33 @@
isapnp= [ISAPNP]
Format: <RDP>,<reset>,<pci_scan>,<verbosity>
isolcpus= [KNL,SMP] Isolate CPUs from the general scheduler.
The argument is a cpu list, as described above.
isolcpus= [KNL,SMP] Isolate a given set of CPUs from disturbance.
[Deprecated - use cpusets instead]
Format: [flag-list,]<cpu-list>
Specify one or more CPUs to isolate from disturbances
specified in the flag list (default: domain):
nohz
Disable the tick when a single task runs.
domain
Isolate from the general SMP balancing and scheduling
algorithms. Note that performing domain isolation this way
is irreversible: it's not possible to bring back a CPU to
the domains once isolated through isolcpus. It's strongly
advised to use cpusets instead to disable scheduler load
balancing through the "cpuset.sched_load_balance" file.
It offers a much more flexible interface where CPUs can
move in and out of an isolated set anytime.
You can move a process onto or off an "isolated" CPU via
the CPU affinity syscalls or cpuset.
<cpu number> begins at 0 and the maximum value is
"number of CPUs in system - 1".
The format of <cpu-list> is described above.
This option can be used to specify one or more CPUs
to isolate from the general SMP balancing and scheduling
algorithms. You can move a process onto or off an
"isolated" CPU via the CPU affinity syscalls or cpuset.
<cpu number> begins at 0 and the maximum value is
"number of CPUs in system - 1".
This option is the preferred way to isolate CPUs. The
alternative -- manually setting the CPU mask of all
tasks in the system -- can cause problems and
suboptimal load balancer performance.
iucv= [HW,NET]
@ -4209,6 +4222,9 @@
Used to run time disable IRQ_TIME_ACCOUNTING on any
platforms where RDTSC is slow and this accounting
can add overhead.
[x86] unstable: mark the TSC clocksource as unstable, this
marks the TSC unconditionally unstable at bootup and
avoids any further wobbles once the TSC watchdog notices.
turbografx.map[2|3]= [HW,JOY]
TurboGraFX parallel port interface

View File

@ -18,6 +18,7 @@
#include <linux/cpufeature.h>
#include <linux/tick.h>
#include <linux/pm_qos.h>
#include <linux/sched/isolation.h>
#include "base.h"
@ -271,8 +272,16 @@ static ssize_t print_cpus_isolated(struct device *dev,
struct device_attribute *attr, char *buf)
{
int n = 0, len = PAGE_SIZE-2;
cpumask_var_t isolated;
n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(cpu_isolated_map));
if (!alloc_cpumask_var(&isolated, GFP_KERNEL))
return -ENOMEM;
cpumask_andnot(isolated, cpu_possible_mask,
housekeeping_cpumask(HK_FLAG_DOMAIN));
n = scnprintf(buf, len, "%*pbl\n", cpumask_pr_args(isolated));
free_cpumask_var(isolated);
return n;
}

View File

@ -40,7 +40,7 @@
#include <linux/tcp.h>
#include <linux/net_tstamp.h>
#include <linux/ptp_clock_kernel.h>
#include <linux/tick.h>
#include <linux/sched/isolation.h>
#include <asm/checksum.h>
#include <asm/homecache.h>
@ -2270,8 +2270,8 @@ static int __init tile_net_init_module(void)
tile_net_dev_init(name, mac);
if (!network_cpus_init())
cpumask_and(&network_cpus_map, housekeeping_cpumask(),
cpu_online_mask);
cpumask_and(&network_cpus_map,
housekeeping_cpumask(HK_FLAG_MISC), cpu_online_mask);
return 0;
}

View File

@ -138,7 +138,7 @@ static const char * const task_state_array[] = {
static inline const char *get_task_state(struct task_struct *tsk)
{
BUILD_BUG_ON(1 + ilog2(TASK_REPORT_MAX) != ARRAY_SIZE(task_state_array));
return task_state_array[__get_task_state(tsk)];
return task_state_array[task_state_index(tsk)];
}
static inline int get_task_umask(struct task_struct *tsk)

View File

@ -131,6 +131,11 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
return 0;
}
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
return 0;
}
/* Valid inputs for n are -1 and 0. */
static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
@ -179,6 +184,17 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits);
}
/**
* cpumask_last - get the last CPU in a cpumask
* @srcp: - the cpumask pointer
*
* Returns >= nr_cpumask_bits if no CPUs set.
*/
static inline unsigned int cpumask_last(const struct cpumask *srcp)
{
return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
}
unsigned int cpumask_next(int n, const struct cpumask *srcp);
/**

View File

@ -3,6 +3,7 @@
#define IOPRIO_H
#include <linux/sched.h>
#include <linux/sched/rt.h>
#include <linux/iocontext.h>
/*
@ -63,7 +64,7 @@ static inline int task_nice_ioclass(struct task_struct *task)
{
if (task->policy == SCHED_IDLE)
return IOPRIO_CLASS_IDLE;
else if (task->policy == SCHED_FIFO || task->policy == SCHED_RR)
else if (task_is_realtime(task))
return IOPRIO_CLASS_RT;
else
return IOPRIO_CLASS_BE;

View File

@ -166,8 +166,6 @@ struct task_group;
/* Task command name length: */
#define TASK_COMM_LEN 16
extern cpumask_var_t cpu_isolated_map;
extern void scheduler_tick(void);
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
@ -332,9 +330,11 @@ struct load_weight {
struct sched_avg {
u64 last_update_time;
u64 load_sum;
u64 runnable_load_sum;
u32 util_sum;
u32 period_contrib;
unsigned long load_avg;
unsigned long runnable_load_avg;
unsigned long util_avg;
};
@ -377,6 +377,7 @@ struct sched_statistics {
struct sched_entity {
/* For load-balancing: */
struct load_weight load;
unsigned long runnable_weight;
struct rb_node run_node;
struct list_head group_node;
unsigned int on_rq;
@ -472,10 +473,10 @@ struct sched_dl_entity {
* conditions between the inactive timer handler and the wakeup
* code.
*/
int dl_throttled;
int dl_boosted;
int dl_yielded;
int dl_non_contending;
int dl_throttled : 1;
int dl_boosted : 1;
int dl_yielded : 1;
int dl_non_contending : 1;
/*
* Bandwidth enforcement timer. Each -deadline task has its
@ -1246,7 +1247,7 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
#define TASK_REPORT_IDLE (TASK_REPORT + 1)
#define TASK_REPORT_MAX (TASK_REPORT_IDLE << 1)
static inline unsigned int __get_task_state(struct task_struct *tsk)
static inline unsigned int task_state_index(struct task_struct *tsk)
{
unsigned int tsk_state = READ_ONCE(tsk->state);
unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
@ -1259,7 +1260,7 @@ static inline unsigned int __get_task_state(struct task_struct *tsk)
return fls(state);
}
static inline char __task_state_to_char(unsigned int state)
static inline char task_index_to_char(unsigned int state)
{
static const char state_char[] = "RSDTtXZPI";
@ -1270,7 +1271,7 @@ static inline char __task_state_to_char(unsigned int state)
static inline char task_state_to_char(struct task_struct *tsk)
{
return __task_state_to_char(__get_task_state(tsk));
return task_index_to_char(task_state_index(tsk));
}
/**

View File

@ -0,0 +1,51 @@
#ifndef _LINUX_SCHED_ISOLATION_H
#define _LINUX_SCHED_ISOLATION_H
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/tick.h>
enum hk_flags {
HK_FLAG_TIMER = 1,
HK_FLAG_RCU = (1 << 1),
HK_FLAG_MISC = (1 << 2),
HK_FLAG_SCHED = (1 << 3),
HK_FLAG_TICK = (1 << 4),
HK_FLAG_DOMAIN = (1 << 5),
};
#ifdef CONFIG_CPU_ISOLATION
DECLARE_STATIC_KEY_FALSE(housekeeping_overriden);
extern int housekeeping_any_cpu(enum hk_flags flags);
extern const struct cpumask *housekeeping_cpumask(enum hk_flags flags);
extern void housekeeping_affine(struct task_struct *t, enum hk_flags flags);
extern bool housekeeping_test_cpu(int cpu, enum hk_flags flags);
extern void __init housekeeping_init(void);
#else
static inline int housekeeping_any_cpu(enum hk_flags flags)
{
return smp_processor_id();
}
static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
{
return cpu_possible_mask;
}
static inline void housekeeping_affine(struct task_struct *t,
enum hk_flags flags) { }
static inline void housekeeping_init(void) { }
#endif /* CONFIG_CPU_ISOLATION */
static inline bool housekeeping_cpu(int cpu, enum hk_flags flags)
{
#ifdef CONFIG_CPU_ISOLATION
if (static_branch_unlikely(&housekeeping_overriden))
return housekeeping_test_cpu(cpu, flags);
#endif
return true;
}
#endif /* _LINUX_SCHED_ISOLATION_H */

View File

@ -18,6 +18,17 @@ static inline int rt_task(struct task_struct *p)
return rt_prio(p->prio);
}
static inline bool task_is_realtime(struct task_struct *tsk)
{
int policy = tsk->policy;
if (policy == SCHED_FIFO || policy == SCHED_RR)
return true;
if (policy == SCHED_DEADLINE)
return true;
return false;
}
#ifdef CONFIG_RT_MUTEXES
/*
* Must hold either p->pi_lock or task_rq(p)->lock.

View File

@ -38,9 +38,9 @@ extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_time_avg;
extern __read_mostly unsigned int sysctl_sched_migration_cost;
extern __read_mostly unsigned int sysctl_sched_nr_migrate;
extern __read_mostly unsigned int sysctl_sched_time_avg;
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,

View File

@ -138,7 +138,6 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; }
#ifdef CONFIG_NO_HZ_FULL
extern bool tick_nohz_full_running;
extern cpumask_var_t tick_nohz_full_mask;
extern cpumask_var_t housekeeping_mask;
static inline bool tick_nohz_full_enabled(void)
{
@ -162,11 +161,6 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask)
cpumask_or(mask, mask, tick_nohz_full_mask);
}
static inline int housekeeping_any_cpu(void)
{
return cpumask_any_and(housekeeping_mask, cpu_online_mask);
}
extern void tick_nohz_dep_set(enum tick_dep_bits bit);
extern void tick_nohz_dep_clear(enum tick_dep_bits bit);
extern void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit);
@ -235,11 +229,8 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
extern void tick_nohz_full_kick_cpu(int cpu);
extern void __tick_nohz_task_switch(void);
extern void __init tick_nohz_full_setup(cpumask_var_t cpumask);
#else
static inline int housekeeping_any_cpu(void)
{
return smp_processor_id();
}
static inline bool tick_nohz_full_enabled(void) { return false; }
static inline bool tick_nohz_full_cpu(int cpu) { return false; }
static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { }
@ -259,35 +250,9 @@ static inline void tick_dep_clear_signal(struct signal_struct *signal,
static inline void tick_nohz_full_kick_cpu(int cpu) { }
static inline void __tick_nohz_task_switch(void) { }
static inline void tick_nohz_full_setup(cpumask_var_t cpumask) { }
#endif
static inline const struct cpumask *housekeeping_cpumask(void)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled())
return housekeeping_mask;
#endif
return cpu_possible_mask;
}
static inline bool is_housekeeping_cpu(int cpu)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled())
return cpumask_test_cpu(cpu, housekeeping_mask);
#endif
return true;
}
static inline void housekeeping_affine(struct task_struct *t)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled())
set_cpus_allowed_ptr(t, housekeeping_mask);
#endif
}
static inline void tick_nohz_task_switch(void)
{
if (tick_nohz_full_enabled())

View File

@ -118,7 +118,7 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
if (preempt)
return TASK_STATE_MAX;
return __get_task_state(p);
return task_state_index(p);
}
#endif /* CREATE_TRACE_POINTS */

View File

@ -472,6 +472,13 @@ config TASK_IO_ACCOUNTING
endmenu # "CPU/Task time and stats accounting"
config CPU_ISOLATION
bool "CPU isolation"
help
Make sure that CPUs running critical tasks are not disturbed by
any source of "noise" such as unbound workqueues, timers, kthreads...
Unbound jobs get offloaded to housekeeping CPUs.
source "kernel/rcu/Kconfig"
config BUILD_BIN2C

View File

@ -46,6 +46,7 @@
#include <linux/cgroup.h>
#include <linux/efi.h>
#include <linux/tick.h>
#include <linux/sched/isolation.h>
#include <linux/interrupt.h>
#include <linux/taskstats_kern.h>
#include <linux/delayacct.h>
@ -606,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void)
early_irq_init();
init_IRQ();
tick_init();
housekeeping_init();
rcu_init_nohz();
init_timers();
hrtimers_init();

View File

@ -57,7 +57,7 @@
#include <linux/backing-dev.h>
#include <linux/sort.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
@ -656,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
@ -666,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
dattr = NULL;
csa = NULL;
if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
goto done;
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
ndoms = 1;
@ -683,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
non_isolated_cpus);
housekeeping_cpumask(HK_FLAG_DOMAIN));
goto done;
}
@ -707,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
cpumask_intersects(cp->cpus_allowed,
housekeeping_cpumask(HK_FLAG_DOMAIN))))
continue;
if (is_sched_load_balance(cp))
@ -789,7 +785,7 @@ restart:
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
cpumask_and(dp, dp, non_isolated_cpus);
cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@ -802,7 +798,6 @@ restart:
BUG_ON(nslot != ndoms);
done:
free_cpumask_var(non_isolated_cpus);
kfree(csa);
/*

View File

@ -29,6 +29,7 @@
#include <linux/oom.h>
#include <linux/sched/debug.h>
#include <linux/smpboot.h>
#include <linux/sched/isolation.h>
#include <uapi/linux/sched/types.h>
#include "../time/tick-internal.h"
@ -2587,7 +2588,7 @@ static void rcu_bind_gp_kthread(void)
if (!tick_nohz_full_enabled())
return;
housekeeping_affine(current);
housekeeping_affine(current, HK_FLAG_RCU);
}
/* Record the current task on dyntick-idle entry. */

View File

@ -51,6 +51,7 @@
#include <linux/kthread.h>
#include <linux/tick.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/isolation.h>
#define CREATE_TRACE_POINTS
@ -714,7 +715,7 @@ static int __noreturn rcu_tasks_kthread(void *arg)
LIST_HEAD(rcu_tasks_holdouts);
/* Run on housekeeping CPUs by default. Sysadm can move if desired. */
housekeeping_affine(current);
housekeeping_affine(current, HK_FLAG_RCU);
/*
* Each pass through the following loop makes one check for

View File

@ -27,3 +27,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_CPU_ISOLATION) += isolation.o

View File

@ -26,6 +26,7 @@
#include <linux/profile.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/sched/isolation.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@ -42,18 +43,21 @@
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
/*
* Debugging: various feature bits
*
* If SCHED_DEBUG is disabled, each compilation unit has its own copy of
* sysctl_sched_features, defined in sched.h, to allow constants propagation
* at compile time and compiler optimization based on features default.
*/
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
const_debug unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
#endif
/*
* Number of tasks to iterate in a single balance run.
@ -83,9 +87,6 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
/* CPUs with isolated domains */
cpumask_var_t cpu_isolated_map;
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@ -525,7 +526,7 @@ int get_nohz_timer_target(void)
int i, cpu = smp_processor_id();
struct sched_domain *sd;
if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
return cpu;
rcu_read_lock();
@ -534,15 +535,15 @@ int get_nohz_timer_target(void)
if (cpu == i)
continue;
if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
cpu = i;
goto unlock;
}
}
}
if (!is_housekeeping_cpu(cpu))
cpu = housekeeping_any_cpu();
if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
unlock:
rcu_read_unlock();
return cpu;
@ -732,7 +733,7 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
static void set_load_weight(struct task_struct *p)
static void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;
@ -746,8 +747,16 @@ static void set_load_weight(struct task_struct *p)
return;
}
load->weight = scale_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
/*
* SCHED_OTHER tasks have to update their load when changing their
* weight
*/
if (update_load && p->sched_class == &fair_sched_class) {
reweight_task(p, prio);
} else {
load->weight = scale_load(sched_prio_to_weight[prio]);
load->inv_weight = sched_prio_to_wmult[prio];
}
}
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@ -2357,7 +2366,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->static_prio = NICE_TO_PRIO(0);
p->prio = p->normal_prio = __normal_prio(p);
set_load_weight(p);
set_load_weight(p, false);
/*
* We don't need the reset flag anymore after the fork. It has
@ -3804,7 +3813,7 @@ void set_user_nice(struct task_struct *p, long nice)
put_prev_task(rq, p);
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
delta = p->prio - old_prio;
@ -3961,7 +3970,7 @@ static void __setscheduler_params(struct task_struct *p,
*/
p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
set_load_weight(p);
set_load_weight(p, true);
}
/* Actually do priority change: must hold pi & rq lock. */
@ -5727,10 +5736,6 @@ static inline void sched_init_smt(void) { }
void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
sched_init_numa();
/*
@ -5740,16 +5745,12 @@ void __init sched_init_smp(void)
*/
mutex_lock(&sched_domains_mutex);
sched_init_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
/* Move init over to a non-isolated CPU */
if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
BUG();
sched_init_granularity();
free_cpumask_var(non_isolated_cpus);
init_sched_rt_class();
init_sched_dl_class();
@ -5934,7 +5935,7 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
}
set_load_weight(&init_task);
set_load_weight(&init_task, false);
/*
* The boot idle thread does lazy MMU switching as well:
@ -5953,9 +5954,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
/* May be allocated at isolcpus cmdline parse time */
if (cpu_isolated_map == NULL)
zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
set_cpu_rq_start_time(smp_processor_id());
#endif

View File

@ -243,7 +243,7 @@ static void task_non_contending(struct task_struct *p)
if (p->state == TASK_DEAD)
sub_rq_bw(p->dl.dl_bw, &rq->dl);
raw_spin_lock(&dl_b->lock);
__dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_clear_params(p);
raw_spin_unlock(&dl_b->lock);
}
@ -1210,7 +1210,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
}
raw_spin_lock(&dl_b->lock);
__dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&dl_b->lock);
__dl_clear_params(p);
@ -1365,6 +1365,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
update_dl_entity(dl_se, pi_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se, pi_se);
} else if ((flags & ENQUEUE_RESTORE) &&
dl_time_before(dl_se->deadline,
rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
setup_new_dl_entity(dl_se);
}
__enqueue_dl_entity(dl_se);
@ -2167,7 +2171,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
* until we complete the update.
*/
raw_spin_lock(&src_dl_b->lock);
__dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&src_dl_b->lock);
}
@ -2256,13 +2260,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
return;
}
/*
* If p is boosted we already updated its params in
* rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
* p's deadline being now already after rq_clock(rq).
*/
if (dl_time_before(p->dl.deadline, rq_clock(rq)))
setup_new_dl_entity(&p->dl);
if (rq->curr != p) {
#ifdef CONFIG_SMP
@ -2452,7 +2449,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
if (dl_policy(policy) && !task_has_dl_policy(p) &&
!__dl_overflow(dl_b, cpus, 0, new_bw)) {
if (hrtimer_active(&p->dl.inactive_timer))
__dl_clear(dl_b, p->dl.dl_bw, cpus);
__dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
@ -2464,7 +2461,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
* But this would require to set the task's "inactive
* timer" when the task is not inactive.
*/
__dl_clear(dl_b, p->dl.dl_bw, cpus);
__dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
dl_change_utilization(p, new_bw);
err = 0;

View File

@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P_SCHEDSTAT(se->statistics.wait_count);
}
P(se->load.weight);
P(se->runnable_weight);
#ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
P(se->avg.runnable_load_avg);
#endif
#undef PN_SCHEDSTAT
@ -558,16 +560,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight);
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
cfs_rq->avg.load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
cfs_rq->runnable_load_avg);
cfs_rq->avg.runnable_load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
atomic_long_read(&cfs_rq->removed_load_avg));
SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
atomic_long_read(&cfs_rq->removed_util_avg));
SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
cfs_rq->removed.load_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
cfs_rq->removed.util_avg);
SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
cfs_rq->removed.runnable_sum);
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
@ -1004,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
"nr_involuntary_switches", (long long)p->nivcsw);
P(se.load.weight);
P(se.runnable_weight);
#ifdef CONFIG_SMP
P(se.avg.load_sum);
P(se.avg.runnable_load_sum);
P(se.avg.util_sum);
P(se.avg.load_avg);
P(se.avg.runnable_load_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
#endif

File diff suppressed because it is too large Load Diff

View File

@ -209,6 +209,7 @@ exit_idle:
*/
static void do_idle(void)
{
int cpu = smp_processor_id();
/*
* If the arch has a polling bit, we maintain an invariant:
*
@ -219,14 +220,13 @@ static void do_idle(void)
*/
__current_set_polling();
quiet_vmstat();
tick_nohz_idle_enter();
while (!need_resched()) {
check_pgt_cache();
rmb();
if (cpu_is_offline(smp_processor_id())) {
if (cpu_is_offline(cpu)) {
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}

155
kernel/sched/isolation.c Normal file
View File

@ -0,0 +1,155 @@
/*
* Housekeeping management. Manage the targets for routine code that can run on
* any CPU: unbound workqueues, timers, kthreads and any offloadable work.
*
* Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
*
*/
#include <linux/sched/isolation.h>
#include <linux/tick.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/static_key.h>
#include <linux/ctype.h>
DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
EXPORT_SYMBOL_GPL(housekeeping_overriden);
static cpumask_var_t housekeeping_mask;
static unsigned int housekeeping_flags;
int housekeeping_any_cpu(enum hk_flags flags)
{
if (static_branch_unlikely(&housekeeping_overriden))
if (housekeeping_flags & flags)
return cpumask_any_and(housekeeping_mask, cpu_online_mask);
return smp_processor_id();
}
EXPORT_SYMBOL_GPL(housekeeping_any_cpu);
const struct cpumask *housekeeping_cpumask(enum hk_flags flags)
{
if (static_branch_unlikely(&housekeeping_overriden))
if (housekeeping_flags & flags)
return housekeeping_mask;
return cpu_possible_mask;
}
EXPORT_SYMBOL_GPL(housekeeping_cpumask);
void housekeeping_affine(struct task_struct *t, enum hk_flags flags)
{
if (static_branch_unlikely(&housekeeping_overriden))
if (housekeeping_flags & flags)
set_cpus_allowed_ptr(t, housekeeping_mask);
}
EXPORT_SYMBOL_GPL(housekeeping_affine);
bool housekeeping_test_cpu(int cpu, enum hk_flags flags)
{
if (static_branch_unlikely(&housekeeping_overriden))
if (housekeeping_flags & flags)
return cpumask_test_cpu(cpu, housekeeping_mask);
return true;
}
EXPORT_SYMBOL_GPL(housekeeping_test_cpu);
void __init housekeeping_init(void)
{
if (!housekeeping_flags)
return;
static_branch_enable(&housekeeping_overriden);
/* We need at least one CPU to handle housekeeping work */
WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
}
static int __init housekeeping_setup(char *str, enum hk_flags flags)
{
cpumask_var_t non_housekeeping_mask;
int err;
alloc_bootmem_cpumask_var(&non_housekeeping_mask);
err = cpulist_parse(str, non_housekeeping_mask);
if (err < 0 || cpumask_last(non_housekeeping_mask) >= nr_cpu_ids) {
pr_warn("Housekeeping: nohz_full= or isolcpus= incorrect CPU range\n");
free_bootmem_cpumask_var(non_housekeeping_mask);
return 0;
}
if (!housekeeping_flags) {
alloc_bootmem_cpumask_var(&housekeeping_mask);
cpumask_andnot(housekeeping_mask,
cpu_possible_mask, non_housekeeping_mask);
if (cpumask_empty(housekeeping_mask))
cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
} else {
cpumask_var_t tmp;
alloc_bootmem_cpumask_var(&tmp);
cpumask_andnot(tmp, cpu_possible_mask, non_housekeeping_mask);
if (!cpumask_equal(tmp, housekeeping_mask)) {
pr_warn("Housekeeping: nohz_full= must match isolcpus=\n");
free_bootmem_cpumask_var(tmp);
free_bootmem_cpumask_var(non_housekeeping_mask);
return 0;
}
free_bootmem_cpumask_var(tmp);
}
if ((flags & HK_FLAG_TICK) && !(housekeeping_flags & HK_FLAG_TICK)) {
if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
tick_nohz_full_setup(non_housekeeping_mask);
} else {
pr_warn("Housekeeping: nohz unsupported."
" Build with CONFIG_NO_HZ_FULL\n");
free_bootmem_cpumask_var(non_housekeeping_mask);
return 0;
}
}
housekeeping_flags |= flags;
free_bootmem_cpumask_var(non_housekeeping_mask);
return 1;
}
static int __init housekeeping_nohz_full_setup(char *str)
{
unsigned int flags;
flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
return housekeeping_setup(str, flags);
}
__setup("nohz_full=", housekeeping_nohz_full_setup);
static int __init housekeeping_isolcpus_setup(char *str)
{
unsigned int flags = 0;
while (isalpha(*str)) {
if (!strncmp(str, "nohz,", 5)) {
str += 5;
flags |= HK_FLAG_TICK;
continue;
}
if (!strncmp(str, "domain,", 7)) {
str += 7;
flags |= HK_FLAG_DOMAIN;
continue;
}
pr_warn("isolcpus: Error, unknown flag\n");
return 0;
}
/* Default behaviour for isolcpus without flags */
if (!flags)
flags |= HK_FLAG_DOMAIN;
return housekeeping_setup(str, flags);
}
__setup("isolcpus=", housekeeping_isolcpus_setup);

View File

@ -74,10 +74,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
static void push_irq_work_func(struct irq_work *work);
#endif
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
@ -97,13 +93,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
#ifdef HAVE_RT_PUSH_IPI
rt_rq->push_flags = 0;
rt_rq->push_cpu = nr_cpu_ids;
raw_spin_lock_init(&rt_rq->push_lock);
init_irq_work(&rt_rq->push_work, push_irq_work_func);
#endif
#endif /* CONFIG_SMP */
/* We start is dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
@ -1876,68 +1865,6 @@ static void push_rt_tasks(struct rq *rq)
}
#ifdef HAVE_RT_PUSH_IPI
/*
* The search for the next cpu always starts at rq->cpu and ends
* when we reach rq->cpu again. It will never return rq->cpu.
* This returns the next cpu to check, or nr_cpu_ids if the loop
* is complete.
*
* rq->rt.push_cpu holds the last cpu returned by this function,
* or if this is the first instance, it must hold rq->cpu.
*/
static int rto_next_cpu(struct rq *rq)
{
int prev_cpu = rq->rt.push_cpu;
int cpu;
cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
/*
* If the previous cpu is less than the rq's CPU, then it already
* passed the end of the mask, and has started from the beginning.
* We end if the next CPU is greater or equal to rq's CPU.
*/
if (prev_cpu < rq->cpu) {
if (cpu >= rq->cpu)
return nr_cpu_ids;
} else if (cpu >= nr_cpu_ids) {
/*
* We passed the end of the mask, start at the beginning.
* If the result is greater or equal to the rq's CPU, then
* the loop is finished.
*/
cpu = cpumask_first(rq->rd->rto_mask);
if (cpu >= rq->cpu)
return nr_cpu_ids;
}
rq->rt.push_cpu = cpu;
/* Return cpu to let the caller know if the loop is finished or not */
return cpu;
}
static int find_next_push_cpu(struct rq *rq)
{
struct rq *next_rq;
int cpu;
while (1) {
cpu = rto_next_cpu(rq);
if (cpu >= nr_cpu_ids)
break;
next_rq = cpu_rq(cpu);
/* Make sure the next rq can push to this rq */
if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
break;
}
return cpu;
}
#define RT_PUSH_IPI_EXECUTING 1
#define RT_PUSH_IPI_RESTART 2
/*
* When a high priority task schedules out from a CPU and a lower priority
@ -1947,170 +1874,157 @@ static int find_next_push_cpu(struct rq *rq)
* tasks queued on it (overloaded) needs to be notified that a CPU has opened
* up that may be able to run one of its non-running queued RT tasks.
*
* On large CPU boxes, there's the case that several CPUs could schedule
* a lower priority task at the same time, in which case it will look for
* any overloaded CPUs that it could pull a task from. To do this, the runqueue
* lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
* for a single overloaded CPU's runqueue lock can produce a large latency.
* (This has actually been observed on large boxes running cyclictest).
* Instead of taking the runqueue lock of the overloaded CPU, each of the
* CPUs that scheduled a lower priority task simply sends an IPI to the
* overloaded CPU. An IPI is much cheaper than taking an runqueue lock with
* lots of contention. The overloaded CPU will look to push its non-running
* RT task off, and if it does, it can then ignore the other IPIs coming
* in, and just pass those IPIs off to any other overloaded CPU.
* All CPUs with overloaded RT tasks need to be notified as there is currently
* no way to know which of these CPUs have the highest priority task waiting
* to run. Instead of trying to take a spinlock on each of these CPUs,
* which has shown to cause large latency when done on machines with many
* CPUs, sending an IPI to the CPUs to have them push off the overloaded
* RT tasks waiting to run.
*
* When a CPU schedules a lower priority task, it only sends an IPI to
* the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
* as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
* RT overloaded tasks, would cause 100 IPIs to go out at once.
* Just sending an IPI to each of the CPUs is also an issue, as on large
* count CPU machines, this can cause an IPI storm on a CPU, especially
* if its the only CPU with multiple RT tasks queued, and a large number
* of CPUs scheduling a lower priority task at the same time.
*
* The overloaded RT CPU, when receiving an IPI, will try to push off its
* overloaded RT tasks and then send an IPI to the next CPU that has
* overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
* have completed. Just because a CPU may have pushed off its own overloaded
* RT task does not mean it should stop sending the IPI around to other
* overloaded CPUs. There may be another RT task waiting to run on one of
* those CPUs that are of higher priority than the one that was just
* pushed.
* Each root domain has its own irq work function that can iterate over
* all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
* tassk must be checked if there's one or many CPUs that are lowering
* their priority, there's a single irq work iterator that will try to
* push off RT tasks that are waiting to run.
*
* An optimization that could possibly be made is to make a CPU array similar
* to the cpupri array mask of all running RT tasks, but for the overloaded
* case, then the IPI could be sent to only the CPU with the highest priority
* RT task waiting, and that CPU could send off further IPIs to the CPU with
* the next highest waiting task. Since the overloaded case is much less likely
* to happen, the complexity of this implementation may not be worth it.
* Instead, just send an IPI around to all overloaded CPUs.
* When a CPU schedules a lower priority task, it will kick off the
* irq work iterator that will jump to each CPU with overloaded RT tasks.
* As it only takes the first CPU that schedules a lower priority task
* to start the process, the rto_start variable is incremented and if
* the atomic result is one, then that CPU will try to take the rto_lock.
* This prevents high contention on the lock as the process handles all
* CPUs scheduling lower priority tasks.
*
* The rq->rt.push_flags holds the status of the IPI that is going around.
* A run queue can only send out a single IPI at a time. The possible flags
* for rq->rt.push_flags are:
* All CPUs that are scheduling a lower priority task will increment the
* rt_loop_next variable. This will make sure that the irq work iterator
* checks all RT overloaded CPUs whenever a CPU schedules a new lower
* priority task, even if the iterator is in the middle of a scan. Incrementing
* the rt_loop_next will cause the iterator to perform another scan.
*
* (None or zero): No IPI is going around for the current rq
* RT_PUSH_IPI_EXECUTING: An IPI for the rq is being passed around
* RT_PUSH_IPI_RESTART: The priority of the running task for the rq
* has changed, and the IPI should restart
* circulating the overloaded CPUs again.
*
* rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
* before sending to the next CPU.
*
* Instead of having all CPUs that schedule a lower priority task send
* an IPI to the same "first" CPU in the RT overload mask, they send it
* to the next overloaded CPU after their own CPU. This helps distribute
* the work when there's more than one overloaded CPU and multiple CPUs
* scheduling in lower priority tasks.
*
* When a rq schedules a lower priority task than what was currently
* running, the next CPU with overloaded RT tasks is examined first.
* That is, if CPU 1 and 5 are overloaded, and CPU 3 schedules a lower
* priority task, it will send an IPI first to CPU 5, then CPU 5 will
* send to CPU 1 if it is still overloaded. CPU 1 will clear the
* rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
*
* The first CPU to notice IPI_RESTART is set, will clear that flag and then
* send an IPI to the next overloaded CPU after the rq->cpu and not the next
* CPU after push_cpu. That is, if CPU 1, 4 and 5 are overloaded when CPU 3
* schedules a lower priority task, and the IPI_RESTART gets set while the
* handling is being done on CPU 5, it will clear the flag and send it back to
* CPU 4 instead of CPU 1.
*
* Note, the above logic can be disabled by turning off the sched_feature
* RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
* taken by the CPU requesting a pull and the waiting RT task will be pulled
* by that CPU. This may be fine for machines with few CPUs.
*/
static void tell_cpu_to_push(struct rq *rq)
static int rto_next_cpu(struct rq *rq)
{
struct root_domain *rd = rq->rd;
int next;
int cpu;
if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
raw_spin_lock(&rq->rt.push_lock);
/* Make sure it's still executing */
if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
/*
* Tell the IPI to restart the loop as things have
* changed since it started.
*/
rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
raw_spin_unlock(&rq->rt.push_lock);
return;
}
raw_spin_unlock(&rq->rt.push_lock);
/*
* When starting the IPI RT pushing, the rto_cpu is set to -1,
* rt_next_cpu() will simply return the first CPU found in
* the rto_mask.
*
* If rto_next_cpu() is called with rto_cpu is a valid cpu, it
* will return the next CPU found in the rto_mask.
*
* If there are no more CPUs left in the rto_mask, then a check is made
* against rto_loop and rto_loop_next. rto_loop is only updated with
* the rto_lock held, but any CPU may increment the rto_loop_next
* without any locking.
*/
for (;;) {
/* When rto_cpu is -1 this acts like cpumask_first() */
cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
rd->rto_cpu = cpu;
if (cpu < nr_cpu_ids)
return cpu;
rd->rto_cpu = -1;
/*
* ACQUIRE ensures we see the @rto_mask changes
* made prior to the @next value observed.
*
* Matches WMB in rt_set_overload().
*/
next = atomic_read_acquire(&rd->rto_loop_next);
if (rd->rto_loop == next)
break;
rd->rto_loop = next;
}
/* When here, there's no IPI going around */
return -1;
}
rq->rt.push_cpu = rq->cpu;
cpu = find_next_push_cpu(rq);
if (cpu >= nr_cpu_ids)
static inline bool rto_start_trylock(atomic_t *v)
{
return !atomic_cmpxchg_acquire(v, 0, 1);
}
static inline void rto_start_unlock(atomic_t *v)
{
atomic_set_release(v, 0);
}
static void tell_cpu_to_push(struct rq *rq)
{
int cpu = -1;
/* Keep the loop going if the IPI is currently active */
atomic_inc(&rq->rd->rto_loop_next);
/* Only one CPU can initiate a loop at a time */
if (!rto_start_trylock(&rq->rd->rto_loop_start))
return;
rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
raw_spin_lock(&rq->rd->rto_lock);
irq_work_queue_on(&rq->rt.push_work, cpu);
/*
* The rto_cpu is updated under the lock, if it has a valid cpu
* then the IPI is still running and will continue due to the
* update to loop_next, and nothing needs to be done here.
* Otherwise it is finishing up and an ipi needs to be sent.
*/
if (rq->rd->rto_cpu < 0)
cpu = rto_next_cpu(rq);
raw_spin_unlock(&rq->rd->rto_lock);
rto_start_unlock(&rq->rd->rto_loop_start);
if (cpu >= 0)
irq_work_queue_on(&rq->rd->rto_push_work, cpu);
}
/* Called from hardirq context */
static void try_to_push_tasks(void *arg)
void rto_push_irq_work_func(struct irq_work *work)
{
struct rt_rq *rt_rq = arg;
struct rq *rq, *src_rq;
int this_cpu;
struct rq *rq;
int cpu;
this_cpu = rt_rq->push_cpu;
rq = this_rq();
/* Paranoid check */
BUG_ON(this_cpu != smp_processor_id());
rq = cpu_rq(this_cpu);
src_rq = rq_of_rt_rq(rt_rq);
again:
/*
* We do not need to grab the lock to check for has_pushable_tasks.
* When it gets updated, a check is made if a push is possible.
*/
if (has_pushable_tasks(rq)) {
raw_spin_lock(&rq->lock);
push_rt_task(rq);
push_rt_tasks(rq);
raw_spin_unlock(&rq->lock);
}
raw_spin_lock(&rq->rd->rto_lock);
/* Pass the IPI to the next rt overloaded queue */
raw_spin_lock(&rt_rq->push_lock);
/*
* If the source queue changed since the IPI went out,
* we need to restart the search from that CPU again.
*/
if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
rt_rq->push_cpu = src_rq->cpu;
}
cpu = rto_next_cpu(rq);
cpu = find_next_push_cpu(src_rq);
raw_spin_unlock(&rq->rd->rto_lock);
if (cpu >= nr_cpu_ids)
rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
raw_spin_unlock(&rt_rq->push_lock);
if (cpu >= nr_cpu_ids)
if (cpu < 0)
return;
/*
* It is possible that a restart caused this CPU to be
* chosen again. Don't bother with an IPI, just see if we
* have more to push.
*/
if (unlikely(cpu == rq->cpu))
goto again;
/* Try the next RT overloaded CPU */
irq_work_queue_on(&rt_rq->push_work, cpu);
}
static void push_irq_work_func(struct irq_work *work)
{
struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
try_to_push_tasks(rt_rq);
irq_work_queue_on(&rq->rd->rto_push_work, cpu);
}
#endif /* HAVE_RT_PUSH_IPI */

View File

@ -227,7 +227,7 @@ struct dl_bw {
static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
static inline
void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
void __dl_sub(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
{
dl_b->total_bw -= tsk_bw;
__dl_update(dl_b, (s32)tsk_bw / cpus);
@ -256,7 +256,6 @@ extern int sched_dl_overflow(struct task_struct *p, int policy,
extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern void __dl_clear_params(struct task_struct *p);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_task_can_attach(struct task_struct *p,
const struct cpumask *cs_cpus_allowed);
@ -419,6 +418,7 @@ struct cfs_bandwidth { };
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
unsigned long runnable_weight;
unsigned int nr_running, h_nr_running;
u64 exec_clock;
@ -444,18 +444,22 @@ struct cfs_rq {
* CFS load tracking
*/
struct sched_avg avg;
u64 runnable_load_sum;
unsigned long runnable_load_avg;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
unsigned long propagate_avg;
#endif
atomic_long_t removed_load_avg, removed_util_avg;
#ifndef CONFIG_64BIT
u64 load_last_update_time_copy;
#endif
struct {
raw_spinlock_t lock ____cacheline_aligned;
int nr;
unsigned long load_avg;
unsigned long util_avg;
unsigned long runnable_sum;
} removed;
#ifdef CONFIG_FAIR_GROUP_SCHED
unsigned long tg_load_avg_contrib;
long propagate;
long prop_runnable_sum;
/*
* h_load = weight * f(tg)
*
@ -502,7 +506,7 @@ static inline int rt_bandwidth_enabled(void)
}
/* RT IPI pull logic requires IRQ_WORK */
#ifdef CONFIG_IRQ_WORK
#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
# define HAVE_RT_PUSH_IPI
#endif
@ -524,12 +528,6 @@ struct rt_rq {
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
#ifdef HAVE_RT_PUSH_IPI
int push_flags;
int push_cpu;
struct irq_work push_work;
raw_spinlock_t push_lock;
#endif
#endif /* CONFIG_SMP */
int rt_queued;
@ -638,6 +636,19 @@ struct root_domain {
struct dl_bw dl_bw;
struct cpudl cpudl;
#ifdef HAVE_RT_PUSH_IPI
/*
* For IPI pull requests, loop across the rto_mask.
*/
struct irq_work rto_push_work;
raw_spinlock_t rto_lock;
/* These are only updated and read within rto_lock */
int rto_loop;
int rto_cpu;
/* These atomics are updated outside of a lock */
atomic_t rto_loop_next;
atomic_t rto_loop_start;
#endif
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
@ -655,6 +666,9 @@ extern void init_defrootdomain(void);
extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
#ifdef HAVE_RT_PUSH_IPI
extern void rto_push_irq_work_func(struct irq_work *work);
#endif
#endif /* CONFIG_SMP */
/*
@ -1219,8 +1233,6 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
# define const_debug const
#endif
extern const_debug unsigned int sysctl_sched_features;
#define SCHED_FEAT(name, enabled) \
__SCHED_FEAT_##name ,
@ -1232,6 +1244,13 @@ enum {
#undef SCHED_FEAT
#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
/*
* To support run-time toggling of sched features, all the translation units
* (but core.c) reference the sysctl_sched_features defined in core.c.
*/
extern const_debug unsigned int sysctl_sched_features;
#define SCHED_FEAT(name, enabled) \
static __always_inline bool static_branch_##name(struct static_key *key) \
{ \
@ -1239,13 +1258,27 @@ static __always_inline bool static_branch_##name(struct static_key *key) \
}
#include "features.h"
#undef SCHED_FEAT
extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
/*
* Each translation unit has its own copy of sysctl_sched_features to allow
* constants propagation at compile time and compiler optimization based on
* features default.
*/
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
static const_debug __maybe_unused unsigned int sysctl_sched_features =
#include "features.h"
0;
#undef SCHED_FEAT
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
extern struct static_key_false sched_numa_balancing;
@ -1530,6 +1563,8 @@ extern void init_sched_dl_class(void);
extern void init_sched_rt_class(void);
extern void init_sched_fair_class(void);
extern void reweight_task(struct task_struct *p, int prio);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);

View File

@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/sched/isolation.h>
#include "sched.h"
@ -269,6 +270,12 @@ static int init_rootdomain(struct root_domain *rd)
if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
goto free_dlo_mask;
#ifdef HAVE_RT_PUSH_IPI
rd->rto_cpu = -1;
raw_spin_lock_init(&rd->rto_lock);
init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
#endif
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_rto_mask;
@ -464,21 +471,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
update_top_cache_domain(cpu);
}
/* Setup the mask of CPUs configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
{
int ret;
alloc_bootmem_cpumask_var(&cpu_isolated_map);
ret = cpulist_parse(str, cpu_isolated_map);
if (ret) {
pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids);
return 0;
}
return 1;
}
__setup("isolcpus=", isolated_cpu_setup);
struct s_data {
struct sched_domain ** __percpu sd;
struct root_domain *rd;
@ -1158,6 +1150,7 @@ sd_init(struct sched_domain_topology_level *tl,
sd->smt_gain = 1178; /* ~15% */
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
sd->flags |= SD_PREFER_SIBLING;
sd->imbalance_pct = 117;
sd->cache_nice_tries = 1;
sd->busy_idx = 2;
@ -1332,6 +1325,10 @@ void sched_init_numa(void)
if (!sched_domains_numa_distance)
return;
/* Includes NUMA identity node at level 0. */
sched_domains_numa_distance[level++] = curr_distance;
sched_domains_numa_levels = level;
/*
* O(nr_nodes^2) deduplicating selection sort -- in order to find the
* unique distances in the node_distance() table.
@ -1379,8 +1376,7 @@ void sched_init_numa(void)
return;
/*
* 'level' contains the number of unique distances, excluding the
* identity distance node_distance(i,i).
* 'level' contains the number of unique distances
*
* The sched_domains_numa_distance[] array includes the actual distance
* numbers.
@ -1441,10 +1437,19 @@ void sched_init_numa(void)
for (i = 0; sched_domain_topology[i].mask; i++)
tl[i] = sched_domain_topology[i];
/*
* Add the NUMA identity distance, aka single NODE.
*/
tl[i++] = (struct sched_domain_topology_level){
.mask = sd_numa_mask,
.numa_level = 0,
SD_INIT_NAME(NODE)
};
/*
* .. and append 'j' levels of NUMA goodness.
*/
for (j = 0; j < level; i++, j++) {
for (j = 1; j < level; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
.mask = sd_numa_mask,
.sd_flags = cpu_numa_flags,
@ -1774,7 +1779,7 @@ int sched_init_domains(const struct cpumask *cpu_map)
doms_cur = alloc_sched_domains(ndoms_cur);
if (!doms_cur)
doms_cur = &fallback_doms;
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
@ -1857,7 +1862,8 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
doms_new = alloc_sched_domains(1);
if (doms_new) {
n = 1;
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
cpumask_and(doms_new[0], cpu_active_mask,
housekeeping_cpumask(HK_FLAG_DOMAIN));
}
} else {
n = ndoms_new;
@ -1880,7 +1886,8 @@ match1:
if (!doms_new) {
n = 0;
doms_new = &fallback_doms;
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
cpumask_and(doms_new[0], cpu_active_mask,
housekeeping_cpumask(HK_FLAG_DOMAIN));
}
/* Build new domains: */

View File

@ -27,6 +27,7 @@
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/context_tracking.h>
#include <linux/mm.h>
#include <asm/irq_regs.h>
@ -165,7 +166,6 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
cpumask_var_t housekeeping_mask;
bool tick_nohz_full_running;
static atomic_t tick_dep_mask;
@ -385,20 +385,13 @@ out:
local_irq_restore(flags);
}
/* Parse the boot-time nohz CPU list from the kernel parameters. */
static int __init tick_nohz_full_setup(char *str)
/* Get the boot-time nohz CPU list from the kernel parameters. */
void __init tick_nohz_full_setup(cpumask_var_t cpumask)
{
alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
pr_warn("NO_HZ: Incorrect nohz_full cpumask\n");
free_bootmem_cpumask_var(tick_nohz_full_mask);
return 1;
}
cpumask_copy(tick_nohz_full_mask, cpumask);
tick_nohz_full_running = true;
return 1;
}
__setup("nohz_full=", tick_nohz_full_setup);
static int tick_nohz_cpu_down(unsigned int cpu)
{
@ -437,13 +430,6 @@ void __init tick_nohz_init(void)
return;
}
if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
cpumask_clear(tick_nohz_full_mask);
tick_nohz_full_running = false;
return;
}
/*
* Full dynticks uses irq work to drive the tick rescheduling on safe
* locking contexts. But then we need irq work to raise its own
@ -452,7 +438,6 @@ void __init tick_nohz_init(void)
if (!arch_irq_work_has_interrupt()) {
pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n");
cpumask_clear(tick_nohz_full_mask);
cpumask_copy(housekeeping_mask, cpu_possible_mask);
tick_nohz_full_running = false;
return;
}
@ -465,9 +450,6 @@ void __init tick_nohz_init(void)
cpumask_clear_cpu(cpu, tick_nohz_full_mask);
}
cpumask_andnot(housekeeping_mask,
cpu_possible_mask, tick_nohz_full_mask);
for_each_cpu(cpu, tick_nohz_full_mask)
context_tracking_cpu_set(cpu);
@ -477,12 +459,6 @@ void __init tick_nohz_init(void)
WARN_ON(ret < 0);
pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n",
cpumask_pr_args(tick_nohz_full_mask));
/*
* We need at least one CPU to handle housekeeping work such
* as timekeeping, unbound timers, workqueues, ...
*/
WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
}
#endif
@ -787,6 +763,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
if (!ts->tick_stopped) {
calc_load_nohz_start();
cpu_load_update_nohz_start();
quiet_vmstat();
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;

View File

@ -921,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
trace_assign_type(field, iter->ent);
T = __task_state_to_char(field->next_state);
S = __task_state_to_char(field->prev_state);
T = task_index_to_char(field->next_state);
S = task_index_to_char(field->prev_state);
trace_find_cmdline(field->next_pid, comm);
trace_seq_printf(&iter->seq,
" %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
@ -957,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
S = __task_state_to_char(field->prev_state);
T = __task_state_to_char(field->next_state);
S = task_index_to_char(field->prev_state);
T = task_index_to_char(field->next_state);
trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
field->prev_pid,
field->prev_prio,
@ -993,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);
if (!S)
S = __task_state_to_char(field->prev_state);
T = __task_state_to_char(field->next_state);
S = task_index_to_char(field->prev_state);
T = task_index_to_char(field->next_state);
SEQ_PUT_HEX_FIELD(s, field->prev_pid);
SEQ_PUT_HEX_FIELD(s, field->prev_prio);

View File

@ -398,10 +398,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = prev->pid;
entry->prev_prio = prev->prio;
entry->prev_state = __get_task_state(prev);
entry->prev_state = task_state_index(prev);
entry->next_pid = next->pid;
entry->next_prio = next->prio;
entry->next_state = __get_task_state(next);
entry->next_state = task_state_index(next);
entry->next_cpu = task_cpu(next);
if (!call_filter_check_discard(call, entry, buffer, event))
@ -426,10 +426,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = curr->pid;
entry->prev_prio = curr->prio;
entry->prev_state = __get_task_state(curr);
entry->prev_state = task_state_index(curr);
entry->next_pid = wakee->pid;
entry->next_prio = wakee->prio;
entry->next_state = __get_task_state(wakee);
entry->next_state = task_state_index(wakee);
entry->next_cpu = task_cpu(wakee);
if (!call_filter_check_discard(call, entry, buffer, event))

View File

@ -25,6 +25,7 @@
#include <linux/workqueue.h>
#include <linux/sched/clock.h>
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
@ -774,15 +775,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
void __init lockup_detector_init(void)
{
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled()) {
if (tick_nohz_full_enabled())
pr_info("Disabling watchdog on nohz_full cores by default\n");
cpumask_copy(&watchdog_cpumask, housekeeping_mask);
} else
cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#else
cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#endif
cpumask_copy(&watchdog_cpumask,
housekeeping_cpumask(HK_FLAG_TIMER));
if (!watchdog_nmi_probe())
nmi_watchdog_available = true;