Merge tag 'sched-core-2021-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - Clean up SCHED_DEBUG: move the decades old mess of sysctl, procfs and
   debugfs interfaces to a unified debugfs interface.

 - Signals: Allow caching one sigqueue object per task, to improve
   performance & latencies.

 - Improve newidle_balance() irq-off latencies on systems with a large
   number of CPU cgroups.

 - Improve energy-aware scheduling

 - Improve the PELT metrics for certain workloads

 - Reintroduce select_idle_smt() to improve load-balancing locality -
   but without the previous regressions

 - Add 'scheduler latency debugging': warn after long periods of pending
   need_resched. This is an opt-in feature that requires the enabling of
   the LATENCY_WARN scheduler feature, or the use of the
   resched_latency_warn_ms=xx boot parameter.

 - CPU hotplug fixes for HP-rollback, and for the 'fail' interface. Fix
   remaining balance_push() vs. hotplug holes/races

 - PSI fixes, plus allow /proc/pressure/ files to be written by
   CAP_SYS_RESOURCE tasks as well

 - Fix/improve various load-balancing corner cases vs. capacity margins

 - Fix sched topology on systems with NUMA diameter of 3 or above

 - Fix PF_KTHREAD vs to_kthread() race

 - Minor rseq optimizations

 - Misc cleanups, optimizations, fixes and smaller updates

Signed-off-by: Ingo Molnar <mingo@kernel.org>

* tag 'sched-core-2021-04-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (61 commits)
  cpumask/hotplug: Fix cpu_dying() state tracking
  kthread: Fix PF_KTHREAD vs to_kthread() race
  sched/debug: Fix cgroup_path[] serialization
  sched,psi: Handle potential task count underflow bugs more gracefully
  sched: Warn on long periods of pending need_resched
  sched/fair: Move update_nohz_stats() to the CONFIG_NO_HZ_COMMON block to simplify the code & fix an unused function warning
  sched/debug: Rename the sched_debug parameter to sched_verbose
  sched,fair: Alternative sched_slice()
  sched: Move /proc/sched_debug to debugfs
  sched,debug: Convert sysctl sched_domains to debugfs
  debugfs: Implement debugfs_create_str()
  sched,preempt: Move preempt_dynamic to debug.c
  sched: Move SCHED_DEBUG sysctl to debugfs
  sched: Don't make LATENCYTOP select SCHED_DEBUG
  sched: Remove sched_schedstats sysctl out from under SCHED_DEBUG
  sched/numa: Allow runtime enabling/disabling of NUMA balance without SCHED_DEBUG
  sched: Use cpu_dying() to fix balance_push vs hotplug-rollback
  cpumask: Introduce DYING mask
  cpumask: Make cpu_{online,possible,present,active}() inline
  rseq: Optimise rseq_get_rseq_cs() and clear_rseq_cs()
  ...
commit 16b3d0cf5b
@@ -4754,7 +4754,7 @@
	sbni=		[NET]	Granch SBNI12 leased line adapter

	sched_debug	[KNL] Enables verbose scheduler debug messages.
	sched_verbose	[KNL] Enables verbose scheduler debug messages.

	schedstats=	[KNL,X86] Enable or disable scheduled statistics.
			Allowed values are enable and disable. This feature
@@ -74,8 +74,8 @@ for a given topology level by creating a sched_domain_topology_level array and
calling set_sched_topology() with this array as the parameter.

The sched-domains debugging infrastructure can be enabled by enabling
CONFIG_SCHED_DEBUG and adding 'sched_debug' to your cmdline. If you forgot to
tweak your cmdline, you can also flip the /sys/kernel/debug/sched_debug
knob. This enables an error checking parse of the sched domains which should
catch most possible errors (described above). It also prints out the domain
structure in a visual format.

CONFIG_SCHED_DEBUG and adding 'sched_debug_verbose' to your cmdline. If you
forgot to tweak your cmdline, you can also flip the
/sys/kernel/debug/sched/verbose knob. This enables an error checking parse of
the sched domains which should catch most possible errors (described above). It
also prints out the domain structure in a visual format.
@@ -18,6 +18,7 @@
#include <linux/usb.h>
#include <linux/wait.h>
#include <linux/sched/task.h>
#include <linux/kcov.h>
#include <uapi/linux/usbip.h>

#undef pr_fmt
@@ -864,6 +864,97 @@ struct dentry *debugfs_create_bool(const char *name, umode_t mode,
}
EXPORT_SYMBOL_GPL(debugfs_create_bool);

ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf,
			      size_t count, loff_t *ppos)
{
	struct dentry *dentry = F_DENTRY(file);
	char *str, *copy = NULL;
	int copy_len, len;
	ssize_t ret;

	ret = debugfs_file_get(dentry);
	if (unlikely(ret))
		return ret;

	str = *(char **)file->private_data;
	len = strlen(str) + 1;
	copy = kmalloc(len, GFP_KERNEL);
	if (!copy) {
		debugfs_file_put(dentry);
		return -ENOMEM;
	}

	copy_len = strscpy(copy, str, len);
	debugfs_file_put(dentry);
	if (copy_len < 0) {
		kfree(copy);
		return copy_len;
	}

	copy[copy_len] = '\n';

	ret = simple_read_from_buffer(user_buf, count, ppos, copy, copy_len);
	kfree(copy);

	return ret;
}

static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf,
				      size_t count, loff_t *ppos)
{
	/* This is really only for read-only strings */
	return -EINVAL;
}

static const struct file_operations fops_str = {
	.read =		debugfs_read_file_str,
	.write =	debugfs_write_file_str,
	.open =		simple_open,
	.llseek =	default_llseek,
};

static const struct file_operations fops_str_ro = {
	.read =		debugfs_read_file_str,
	.open =		simple_open,
	.llseek =	default_llseek,
};

static const struct file_operations fops_str_wo = {
	.write =	debugfs_write_file_str,
	.open =		simple_open,
	.llseek =	default_llseek,
};

/**
 * debugfs_create_str - create a debugfs file that is used to read and write a string value
 * @name: a pointer to a string containing the name of the file to create.
 * @mode: the permission that the file should have
 * @parent: a pointer to the parent dentry for this file. This should be a
 *          directory dentry if set. If this parameter is %NULL, then the
 *          file will be created in the root of the debugfs filesystem.
 * @value: a pointer to the variable that the file should read to and write
 *         from.
 *
 * This function creates a file in debugfs with the given name that
 * contains the value of the variable @value. If the @mode variable is so
 * set, it can be read from, and written to.
 *
 * This function will return a pointer to a dentry if it succeeds. This
 * pointer must be passed to the debugfs_remove() function when the file is
 * to be removed (no automatic cleanup happens if your module is unloaded,
 * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be
 * returned.
 *
 * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will
 * be returned.
 */
void debugfs_create_str(const char *name, umode_t mode,
			struct dentry *parent, char **value)
{
	debugfs_create_mode_unsafe(name, mode, parent, value, &fops_str,
				   &fops_str_ro, &fops_str_wo);
}

static ssize_t read_file_blob(struct file *file, char __user *user_buf,
			      size_t count, loff_t *ppos)
{
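For reference, a minimal sketch of how a module could consume the new debugfs_create_str() interface above. The "demo" directory, the demo_fw_version variable and its "1.2.3" value are invented for illustration; the file is created read-only since the generic write handler above currently rejects updates. Reading /sys/kernel/debug/demo/fw_version then returns the string with a trailing newline appended, matching debugfs_read_file_str().

#include <linux/debugfs.h>
#include <linux/module.h>

/* Hypothetical string to expose; debugfs only keeps the (char **) handle. */
static char *demo_fw_version = "1.2.3";
static struct dentry *demo_dir;

static int __init demo_init(void)
{
	demo_dir = debugfs_create_dir("demo", NULL);
	/* 0444: read-only; debugfs_write_file_str() returns -EINVAL anyway. */
	debugfs_create_str("fw_version", 0444, demo_dir, &demo_fw_version);
	return 0;
}

static void __exit demo_exit(void)
{
	debugfs_remove(demo_dir);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");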
@@ -91,44 +91,15 @@ extern struct cpumask __cpu_possible_mask;
extern struct cpumask __cpu_online_mask;
extern struct cpumask __cpu_present_mask;
extern struct cpumask __cpu_active_mask;
extern struct cpumask __cpu_dying_mask;
#define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
#define cpu_online_mask   ((const struct cpumask *)&__cpu_online_mask)
#define cpu_present_mask  ((const struct cpumask *)&__cpu_present_mask)
#define cpu_active_mask   ((const struct cpumask *)&__cpu_active_mask)
#define cpu_dying_mask    ((const struct cpumask *)&__cpu_dying_mask)

extern atomic_t __num_online_cpus;

#if NR_CPUS > 1
/**
 * num_online_cpus() - Read the number of online CPUs
 *
 * Despite the fact that __num_online_cpus is of type atomic_t, this
 * interface gives only a momentary snapshot and is not protected against
 * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
 * region.
 */
static inline unsigned int num_online_cpus(void)
{
	return atomic_read(&__num_online_cpus);
}
#define num_possible_cpus()	cpumask_weight(cpu_possible_mask)
#define num_present_cpus()	cpumask_weight(cpu_present_mask)
#define num_active_cpus()	cpumask_weight(cpu_active_mask)
#define cpu_online(cpu)		cpumask_test_cpu((cpu), cpu_online_mask)
#define cpu_possible(cpu)	cpumask_test_cpu((cpu), cpu_possible_mask)
#define cpu_present(cpu)	cpumask_test_cpu((cpu), cpu_present_mask)
#define cpu_active(cpu)		cpumask_test_cpu((cpu), cpu_active_mask)
#else
#define num_online_cpus()	1U
#define num_possible_cpus()	1U
#define num_present_cpus()	1U
#define num_active_cpus()	1U
#define cpu_online(cpu)		((cpu) == 0)
#define cpu_possible(cpu)	((cpu) == 0)
#define cpu_present(cpu)	((cpu) == 0)
#define cpu_active(cpu)		((cpu) == 0)
#endif

extern cpumask_t cpus_booted_once_mask;

static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)

@@ -857,6 +828,14 @@ set_cpu_active(unsigned int cpu, bool active)
	cpumask_clear_cpu(cpu, &__cpu_active_mask);
}

static inline void
set_cpu_dying(unsigned int cpu, bool dying)
{
	if (dying)
		cpumask_set_cpu(cpu, &__cpu_dying_mask);
	else
		cpumask_clear_cpu(cpu, &__cpu_dying_mask);
}

/**
 * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *

@@ -894,6 +873,82 @@ static inline const struct cpumask *get_cpu_mask(unsigned int cpu)
	return to_cpumask(p);
}

#if NR_CPUS > 1
/**
 * num_online_cpus() - Read the number of online CPUs
 *
 * Despite the fact that __num_online_cpus is of type atomic_t, this
 * interface gives only a momentary snapshot and is not protected against
 * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held
 * region.
 */
static inline unsigned int num_online_cpus(void)
{
	return atomic_read(&__num_online_cpus);
}
#define num_possible_cpus()	cpumask_weight(cpu_possible_mask)
#define num_present_cpus()	cpumask_weight(cpu_present_mask)
#define num_active_cpus()	cpumask_weight(cpu_active_mask)

static inline bool cpu_online(unsigned int cpu)
{
	return cpumask_test_cpu(cpu, cpu_online_mask);
}

static inline bool cpu_possible(unsigned int cpu)
{
	return cpumask_test_cpu(cpu, cpu_possible_mask);
}

static inline bool cpu_present(unsigned int cpu)
{
	return cpumask_test_cpu(cpu, cpu_present_mask);
}

static inline bool cpu_active(unsigned int cpu)
{
	return cpumask_test_cpu(cpu, cpu_active_mask);
}

static inline bool cpu_dying(unsigned int cpu)
{
	return cpumask_test_cpu(cpu, cpu_dying_mask);
}

#else

#define num_online_cpus()	1U
#define num_possible_cpus()	1U
#define num_present_cpus()	1U
#define num_active_cpus()	1U

static inline bool cpu_online(unsigned int cpu)
{
	return cpu == 0;
}

static inline bool cpu_possible(unsigned int cpu)
{
	return cpu == 0;
}

static inline bool cpu_present(unsigned int cpu)
{
	return cpu == 0;
}

static inline bool cpu_active(unsigned int cpu)
{
	return cpu == 0;
}

static inline bool cpu_dying(unsigned int cpu)
{
	return false;
}

#endif /* NR_CPUS > 1 */

#define cpu_is_offline(cpu)	unlikely(!cpu_online(cpu))

#if NR_CPUS <= BITS_PER_LONG
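A short, hypothetical sketch of how hotplug-aware code might consult the new cpu_dying() test introduced above. The pick_stable_cpu() helper is invented for illustration; a real caller would hold cpus_read_lock() to keep the answer stable against concurrent hotplug.

#include <linux/cpumask.h>
#include <linux/smp.h>

/*
 * Illustrative helper (not from this merge): prefer an online CPU that is
 * not currently being torn down, using the new cpu_dying() test.
 */
static int pick_stable_cpu(void)
{
	int cpu;

	for_each_online_cpu(cpu) {
		if (!cpu_dying(cpu))
			return cpu;
	}

	/* Every online CPU is on its way down; fall back to wherever we run. */
	return raw_smp_processor_id();
}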
@@ -128,6 +128,8 @@ void debugfs_create_atomic_t(const char *name, umode_t mode,
			     struct dentry *parent, atomic_t *value);
struct dentry *debugfs_create_bool(const char *name, umode_t mode,
				   struct dentry *parent, bool *value);
void debugfs_create_str(const char *name, umode_t mode,
			struct dentry *parent, char **value);

struct dentry *debugfs_create_blob(const char *name, umode_t mode,
				   struct dentry *parent,

@@ -156,6 +158,9 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf,
ssize_t debugfs_write_file_bool(struct file *file, const char __user *user_buf,
				size_t count, loff_t *ppos);

ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf,
			      size_t count, loff_t *ppos);

#else

#include <linux/err.h>

@@ -297,6 +302,11 @@ static inline struct dentry *debugfs_create_bool(const char *name, umode_t mode,
	return ERR_PTR(-ENODEV);
}

static inline void debugfs_create_str(const char *name, umode_t mode,
				      struct dentry *parent,
				      char **value)
{ }

static inline struct dentry *debugfs_create_blob(const char *name, umode_t mode,
						 struct dentry *parent,
						 struct debugfs_blob_wrapper *blob)

@@ -348,6 +358,13 @@ static inline ssize_t debugfs_write_file_bool(struct file *file,
	return -ENODEV;
}

static inline ssize_t debugfs_read_file_str(struct file *file,
					    char __user *user_buf,
					    size_t count, loff_t *ppos)
{
	return -ENODEV;
}

#endif

/**
@@ -2,6 +2,7 @@
#ifndef _LINUX_KCOV_H
#define _LINUX_KCOV_H

#include <linux/sched.h>
#include <uapi/linux/kcov.h>

struct task_struct;
@@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
		     bool sleep);

void psi_memstall_tick(struct task_struct *task, int cpu);
void psi_memstall_enter(unsigned long *flags);
void psi_memstall_leave(unsigned long *flags);
@@ -50,9 +50,10 @@ enum psi_states {
	PSI_MEM_SOME,
	PSI_MEM_FULL,
	PSI_CPU_SOME,
	PSI_CPU_FULL,
	/* Only per-CPU, to weigh the CPU in the global average: */
	PSI_NONIDLE,
	NR_PSI_STATES = 6,
	NR_PSI_STATES = 7,
};

enum psi_aggregators {
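The PSI_CPU_FULL state added above extends the pressure accounting exported under /proc/pressure/. As a userspace illustration (not part of this merge), the sketch below registers a PSI trigger by writing to /proc/pressure/cpu and waits for it to fire, following the interface described in Documentation/accounting/psi.rst; the 150 ms per 1 s threshold is an arbitrary example, and per the pull message CAP_SYS_RESOURCE is now sufficient for the write.

/* Userspace sketch, not kernel code. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* 150 ms of "some" CPU stall within any 1 s window. */
	const char trig[] = "some 150000 1000000";
	struct pollfd fds;
	int fd = open("/proc/pressure/cpu", O_RDWR | O_NONBLOCK);

	if (fd < 0 || write(fd, trig, strlen(trig) + 1) < 0) {
		perror("psi trigger");
		return 1;
	}

	fds.fd = fd;
	fds.events = POLLPRI;	/* trigger notifications arrive as POLLPRI */
	while (poll(&fds, 1, -1) > 0)
		printf("CPU pressure threshold crossed\n");

	close(fd);
	return 0;
}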
@@ -14,7 +14,6 @@
#include <linux/pid.h>
#include <linux/sem.h>
#include <linux/shm.h>
#include <linux/kcov.h>
#include <linux/mutex.h>
#include <linux/plist.h>
#include <linux/hrtimer.h>

@@ -985,6 +984,7 @@ struct task_struct {
	/* Signal handlers: */
	struct signal_struct *signal;
	struct sighand_struct __rcu *sighand;
	struct sigqueue *sigqueue_cache;
	sigset_t blocked;
	sigset_t real_blocked;
	/* Restored if set_restore_sigmask() was used: */

@@ -1101,7 +1101,7 @@ struct task_struct {
#ifdef CONFIG_CPUSETS
	/* Protected by ->alloc_lock: */
	nodemask_t mems_allowed;
	/* Seqence number to catch updates: */
	/* Sequence number to catch updates: */
	seqcount_spinlock_t mems_allowed_seq;
	int cpuset_mem_spread_rotor;
	int cpuset_slab_spread_rotor;
@@ -26,10 +26,11 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
enum { sysctl_hung_task_timeout_secs = 0 };
#endif

extern unsigned int sysctl_sched_child_runs_first;

extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_child_runs_first;

enum sched_tunable_scaling {
	SCHED_TUNABLESCALING_NONE,

@@ -37,7 +38,7 @@ enum sched_tunable_scaling {
	SCHED_TUNABLESCALING_LINEAR,
	SCHED_TUNABLESCALING_END,
};
extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
extern unsigned int sysctl_sched_tunable_scaling;

extern unsigned int sysctl_numa_balancing_scan_delay;
extern unsigned int sysctl_numa_balancing_scan_period_min;

@@ -48,8 +49,8 @@ extern unsigned int sysctl_numa_balancing_scan_size;
extern __read_mostly unsigned int sysctl_sched_migration_cost;
extern __read_mostly unsigned int sysctl_sched_nr_migrate;

int sched_proc_update_handler(struct ctl_table *table, int write,
			      void *buffer, size_t *length, loff_t *ppos);
extern int sysctl_resched_latency_warn_ms;
extern int sysctl_resched_latency_warn_once;
#endif

/*
@@ -266,6 +266,7 @@ static inline void init_sigpending(struct sigpending *sig)
}

extern void flush_sigqueue(struct sigpending *queue);
extern void exit_task_sigqueue_cache(struct task_struct *tsk);

/* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */
static inline int valid_signal(unsigned long sig)
@@ -102,6 +102,16 @@ struct ptrace_syscall_info {
	};
};

#define PTRACE_GET_RSEQ_CONFIGURATION	0x420f

struct ptrace_rseq_configuration {
	__u64 rseq_abi_pointer;
	__u32 rseq_abi_size;
	__u32 signature;
	__u32 flags;
	__u32 pad;
};

/*
 * These values are stored in task->ptrace_message
 * by tracehook_report_syscall_* to describe the current syscall-stop.
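A hypothetical userspace sketch of the new PTRACE_GET_RSEQ_CONFIGURATION request defined above: it attaches to a target pid, reads the tracee's rseq registration into the struct from this header, and detaches. Header interplay between <sys/ptrace.h> and <linux/ptrace.h> varies by libc version, so on older systems the request number and struct may need to be declared locally.

#include <stdio.h>
#include <stdlib.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <linux/ptrace.h>

static void dump_rseq(pid_t pid)
{
	struct ptrace_rseq_configuration conf;
	/* addr = size of the buffer, data = destination buffer */
	long ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, pid,
			  (void *)sizeof(conf), &conf);

	if (ret < 0) {
		perror("PTRACE_GET_RSEQ_CONFIGURATION");
		return;
	}
	printf("rseq abi @ 0x%llx, size %u, signature 0x%x\n",
	       (unsigned long long)conf.rseq_abi_pointer,
	       conf.rseq_abi_size, conf.signature);
}

int main(int argc, char **argv)
{
	pid_t pid;

	if (argc < 2)
		return 1;
	pid = (pid_t)atoi(argv[1]);

	if (ptrace(PTRACE_ATTACH, pid, NULL, NULL) < 0) {
		perror("PTRACE_ATTACH");
		return 1;
	}
	waitpid(pid, NULL, 0);	/* wait for the attach stop */
	dump_rseq(pid);
	ptrace(PTRACE_DETACH, pid, NULL, NULL);
	return 0;
}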
210	kernel/cpu.c
@ -63,6 +63,7 @@ struct cpuhp_cpu_state {
|
|||
bool rollback;
|
||||
bool single;
|
||||
bool bringup;
|
||||
int cpu;
|
||||
struct hlist_node *node;
|
||||
struct hlist_node *last;
|
||||
enum cpuhp_state cb_state;
|
||||
|
@ -135,6 +136,11 @@ static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state)
|
|||
return cpuhp_hp_states + state;
|
||||
}
|
||||
|
||||
static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
|
||||
{
|
||||
return bringup ? !step->startup.single : !step->teardown.single;
|
||||
}
|
||||
|
||||
/**
|
||||
* cpuhp_invoke_callback _ Invoke the callbacks for a given state
|
||||
* @cpu: The cpu for which the callback should be invoked
|
||||
|
@ -157,26 +163,24 @@ static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
|
|||
|
||||
if (st->fail == state) {
|
||||
st->fail = CPUHP_INVALID;
|
||||
|
||||
if (!(bringup ? step->startup.single : step->teardown.single))
|
||||
return 0;
|
||||
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
if (cpuhp_step_empty(bringup, step)) {
|
||||
WARN_ON_ONCE(1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (!step->multi_instance) {
|
||||
WARN_ON_ONCE(lastp && *lastp);
|
||||
cb = bringup ? step->startup.single : step->teardown.single;
|
||||
if (!cb)
|
||||
return 0;
|
||||
|
||||
trace_cpuhp_enter(cpu, st->target, state, cb);
|
||||
ret = cb(cpu);
|
||||
trace_cpuhp_exit(cpu, st->state, state, ret);
|
||||
return ret;
|
||||
}
|
||||
cbm = bringup ? step->startup.multi : step->teardown.multi;
|
||||
if (!cbm)
|
||||
return 0;
|
||||
|
||||
/* Single invocation for instance add/remove */
|
||||
if (node) {
|
||||
|
@ -461,13 +465,16 @@ static inline enum cpuhp_state
|
|||
cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
|
||||
{
|
||||
enum cpuhp_state prev_state = st->state;
|
||||
bool bringup = st->state < target;
|
||||
|
||||
st->rollback = false;
|
||||
st->last = NULL;
|
||||
|
||||
st->target = target;
|
||||
st->single = false;
|
||||
st->bringup = st->state < target;
|
||||
st->bringup = bringup;
|
||||
if (cpu_dying(st->cpu) != !bringup)
|
||||
set_cpu_dying(st->cpu, !bringup);
|
||||
|
||||
return prev_state;
|
||||
}
|
||||
|
@ -475,6 +482,17 @@ cpuhp_set_state(struct cpuhp_cpu_state *st, enum cpuhp_state target)
|
|||
static inline void
|
||||
cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
|
||||
{
|
||||
bool bringup = !st->bringup;
|
||||
|
||||
st->target = prev_state;
|
||||
|
||||
/*
|
||||
* Already rolling back. No need invert the bringup value or to change
|
||||
* the current state.
|
||||
*/
|
||||
if (st->rollback)
|
||||
return;
|
||||
|
||||
st->rollback = true;
|
||||
|
||||
/*
|
||||
|
@ -488,8 +506,9 @@ cpuhp_reset_state(struct cpuhp_cpu_state *st, enum cpuhp_state prev_state)
|
|||
st->state++;
|
||||
}
|
||||
|
||||
st->target = prev_state;
|
||||
st->bringup = !st->bringup;
|
||||
st->bringup = bringup;
|
||||
if (cpu_dying(st->cpu) != !bringup)
|
||||
set_cpu_dying(st->cpu, !bringup);
|
||||
}
|
||||
|
||||
/* Regular hotplug invocation of the AP hotplug thread */
|
||||
|
@ -591,10 +610,53 @@ static int finish_cpu(unsigned int cpu)
|
|||
* Hotplug state machine related functions
|
||||
*/
|
||||
|
||||
static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
|
||||
/*
|
||||
* Get the next state to run. Empty ones will be skipped. Returns true if a
|
||||
* state must be run.
|
||||
*
|
||||
* st->state will be modified ahead of time, to match state_to_run, as if it
|
||||
* has already ran.
|
||||
*/
|
||||
static bool cpuhp_next_state(bool bringup,
|
||||
enum cpuhp_state *state_to_run,
|
||||
struct cpuhp_cpu_state *st,
|
||||
enum cpuhp_state target)
|
||||
{
|
||||
for (st->state--; st->state > st->target; st->state--)
|
||||
cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
|
||||
do {
|
||||
if (bringup) {
|
||||
if (st->state >= target)
|
||||
return false;
|
||||
|
||||
*state_to_run = ++st->state;
|
||||
} else {
|
||||
if (st->state <= target)
|
||||
return false;
|
||||
|
||||
*state_to_run = st->state--;
|
||||
}
|
||||
|
||||
if (!cpuhp_step_empty(bringup, cpuhp_get_step(*state_to_run)))
|
||||
break;
|
||||
} while (true);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int cpuhp_invoke_callback_range(bool bringup,
|
||||
unsigned int cpu,
|
||||
struct cpuhp_cpu_state *st,
|
||||
enum cpuhp_state target)
|
||||
{
|
||||
enum cpuhp_state state;
|
||||
int err = 0;
|
||||
|
||||
while (cpuhp_next_state(bringup, &state, st, target)) {
|
||||
err = cpuhp_invoke_callback(cpu, state, bringup, NULL, NULL);
|
||||
if (err)
|
||||
break;
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
|
||||
|
@ -617,16 +679,12 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
|
|||
enum cpuhp_state prev_state = st->state;
|
||||
int ret = 0;
|
||||
|
||||
while (st->state < target) {
|
||||
st->state++;
|
||||
ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
|
||||
if (ret) {
|
||||
if (can_rollback_cpu(st)) {
|
||||
st->target = prev_state;
|
||||
undo_cpu_up(cpu, st);
|
||||
}
|
||||
break;
|
||||
}
|
||||
ret = cpuhp_invoke_callback_range(true, cpu, st, target);
|
||||
if (ret) {
|
||||
cpuhp_reset_state(st, prev_state);
|
||||
if (can_rollback_cpu(st))
|
||||
WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
|
||||
prev_state));
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
@ -640,6 +698,7 @@ static void cpuhp_create(unsigned int cpu)
|
|||
|
||||
init_completion(&st->done_up);
|
||||
init_completion(&st->done_down);
|
||||
st->cpu = cpu;
|
||||
}
|
||||
|
||||
static int cpuhp_should_run(unsigned int cpu)
|
||||
|
@ -690,17 +749,9 @@ static void cpuhp_thread_fun(unsigned int cpu)
|
|||
state = st->cb_state;
|
||||
st->should_run = false;
|
||||
} else {
|
||||
if (bringup) {
|
||||
st->state++;
|
||||
state = st->state;
|
||||
st->should_run = (st->state < st->target);
|
||||
WARN_ON_ONCE(st->state > st->target);
|
||||
} else {
|
||||
state = st->state;
|
||||
st->state--;
|
||||
st->should_run = (st->state > st->target);
|
||||
WARN_ON_ONCE(st->state < st->target);
|
||||
}
|
||||
st->should_run = cpuhp_next_state(bringup, &state, st, st->target);
|
||||
if (!st->should_run)
|
||||
goto end;
|
||||
}
|
||||
|
||||
WARN_ON_ONCE(!cpuhp_is_ap_state(state));
|
||||
|
@ -728,6 +779,7 @@ static void cpuhp_thread_fun(unsigned int cpu)
|
|||
st->should_run = false;
|
||||
}
|
||||
|
||||
end:
|
||||
cpuhp_lock_release(bringup);
|
||||
lockdep_release_cpus_lock();
|
||||
|
||||
|
@ -881,19 +933,18 @@ static int take_cpu_down(void *_param)
|
|||
return err;
|
||||
|
||||
/*
|
||||
* We get here while we are in CPUHP_TEARDOWN_CPU state and we must not
|
||||
* do this step again.
|
||||
* Must be called from CPUHP_TEARDOWN_CPU, which means, as we are going
|
||||
* down, that the current state is CPUHP_TEARDOWN_CPU - 1.
|
||||
*/
|
||||
WARN_ON(st->state != CPUHP_TEARDOWN_CPU);
|
||||
st->state--;
|
||||
WARN_ON(st->state != (CPUHP_TEARDOWN_CPU - 1));
|
||||
|
||||
/* Invoke the former CPU_DYING callbacks */
|
||||
for (; st->state > target; st->state--) {
|
||||
ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
|
||||
/*
|
||||
* DYING must not fail!
|
||||
*/
|
||||
WARN_ON_ONCE(ret);
|
||||
}
|
||||
ret = cpuhp_invoke_callback_range(false, cpu, st, target);
|
||||
|
||||
/*
|
||||
* DYING must not fail!
|
||||
*/
|
||||
WARN_ON_ONCE(ret);
|
||||
|
||||
/* Give up timekeeping duties */
|
||||
tick_handover_do_timer();
|
||||
|
@ -975,27 +1026,22 @@ void cpuhp_report_idle_dead(void)
|
|||
cpuhp_complete_idle_dead, st, 0);
|
||||
}
|
||||
|
||||
static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st)
|
||||
{
|
||||
for (st->state++; st->state < st->target; st->state++)
|
||||
cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
|
||||
}
|
||||
|
||||
static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
|
||||
enum cpuhp_state target)
|
||||
{
|
||||
enum cpuhp_state prev_state = st->state;
|
||||
int ret = 0;
|
||||
|
||||
for (; st->state > target; st->state--) {
|
||||
ret = cpuhp_invoke_callback(cpu, st->state, false, NULL, NULL);
|
||||
if (ret) {
|
||||
st->target = prev_state;
|
||||
if (st->state < prev_state)
|
||||
undo_cpu_down(cpu, st);
|
||||
break;
|
||||
}
|
||||
ret = cpuhp_invoke_callback_range(false, cpu, st, target);
|
||||
if (ret) {
|
||||
|
||||
cpuhp_reset_state(st, prev_state);
|
||||
|
||||
if (st->state < prev_state)
|
||||
WARN_ON(cpuhp_invoke_callback_range(true, cpu, st,
|
||||
prev_state));
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -1045,9 +1091,13 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
|
|||
* to do the further cleanups.
|
||||
*/
|
||||
ret = cpuhp_down_callbacks(cpu, st, target);
|
||||
if (ret && st->state == CPUHP_TEARDOWN_CPU && st->state < prev_state) {
|
||||
cpuhp_reset_state(st, prev_state);
|
||||
__cpuhp_kick_ap(st);
|
||||
if (ret && st->state < prev_state) {
|
||||
if (st->state == CPUHP_TEARDOWN_CPU) {
|
||||
cpuhp_reset_state(st, prev_state);
|
||||
__cpuhp_kick_ap(st);
|
||||
} else {
|
||||
WARN(1, "DEAD callback error for CPU%d", cpu);
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
|
@ -1164,14 +1214,12 @@ void notify_cpu_starting(unsigned int cpu)
|
|||
|
||||
rcu_cpu_starting(cpu); /* Enables RCU usage on this CPU. */
|
||||
cpumask_set_cpu(cpu, &cpus_booted_once_mask);
|
||||
while (st->state < target) {
|
||||
st->state++;
|
||||
ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
|
||||
/*
|
||||
* STARTING must not fail!
|
||||
*/
|
||||
WARN_ON_ONCE(ret);
|
||||
}
|
||||
ret = cpuhp_invoke_callback_range(true, cpu, st, target);
|
||||
|
||||
/*
|
||||
* STARTING must not fail!
|
||||
*/
|
||||
WARN_ON_ONCE(ret);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -1777,8 +1825,7 @@ static int cpuhp_issue_call(int cpu, enum cpuhp_state state, bool bringup,
|
|||
* If there's nothing to do, we done.
|
||||
* Relies on the union for multi_instance.
|
||||
*/
|
||||
if ((bringup && !sp->startup.single) ||
|
||||
(!bringup && !sp->teardown.single))
|
||||
if (cpuhp_step_empty(bringup, sp))
|
||||
return 0;
|
||||
/*
|
||||
* The non AP bound callbacks can fail on bringup. On teardown
|
||||
|
@ -2207,6 +2254,11 @@ static ssize_t write_cpuhp_fail(struct device *dev,
|
|||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (fail == CPUHP_INVALID) {
|
||||
st->fail = fail;
|
||||
return count;
|
||||
}
|
||||
|
||||
if (fail < CPUHP_OFFLINE || fail > CPUHP_ONLINE)
|
||||
return -EINVAL;
|
||||
|
||||
|
@ -2216,6 +2268,15 @@ static ssize_t write_cpuhp_fail(struct device *dev,
|
|||
if (cpuhp_is_atomic_state(fail))
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* DEAD callbacks cannot fail...
|
||||
* ... neither can CPUHP_BRINGUP_CPU during hotunplug. The latter
|
||||
* triggering STARTING callbacks, a failure in this state would
|
||||
* hinder rollback.
|
||||
*/
|
||||
if (fail <= CPUHP_BRINGUP_CPU && st->state > CPUHP_BRINGUP_CPU)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* Cannot fail anything that doesn't have callbacks.
|
||||
*/
|
||||
|
@ -2460,6 +2521,9 @@ EXPORT_SYMBOL(__cpu_present_mask);
|
|||
struct cpumask __cpu_active_mask __read_mostly;
|
||||
EXPORT_SYMBOL(__cpu_active_mask);
|
||||
|
||||
struct cpumask __cpu_dying_mask __read_mostly;
|
||||
EXPORT_SYMBOL(__cpu_dying_mask);
|
||||
|
||||
atomic_t __num_online_cpus __read_mostly;
|
||||
EXPORT_SYMBOL(__num_online_cpus);
|
||||
|
||||
|
|
|
@ -162,6 +162,7 @@ static void __exit_signal(struct task_struct *tsk)
|
|||
flush_sigqueue(&sig->shared_pending);
|
||||
tty_kref_put(tty);
|
||||
}
|
||||
exit_task_sigqueue_cache(tsk);
|
||||
}
|
||||
|
||||
static void delayed_put_task_struct(struct rcu_head *rhp)
|
||||
|
|
|
@ -2009,6 +2009,7 @@ static __latent_entropy struct task_struct *copy_process(
|
|||
spin_lock_init(&p->alloc_lock);
|
||||
|
||||
init_sigpending(&p->pending);
|
||||
p->sigqueue_cache = NULL;
|
||||
|
||||
p->utime = p->stime = p->gtime = 0;
|
||||
#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME
|
||||
|
|
|
@ -84,6 +84,25 @@ static inline struct kthread *to_kthread(struct task_struct *k)
|
|||
return (__force void *)k->set_child_tid;
|
||||
}
|
||||
|
||||
/*
|
||||
* Variant of to_kthread() that doesn't assume @p is a kthread.
|
||||
*
|
||||
* Per construction; when:
|
||||
*
|
||||
* (p->flags & PF_KTHREAD) && p->set_child_tid
|
||||
*
|
||||
* the task is both a kthread and struct kthread is persistent. However
|
||||
* PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and
|
||||
* begin_new_exec()).
|
||||
*/
|
||||
static inline struct kthread *__to_kthread(struct task_struct *p)
|
||||
{
|
||||
void *kthread = (__force void *)p->set_child_tid;
|
||||
if (kthread && !(p->flags & PF_KTHREAD))
|
||||
kthread = NULL;
|
||||
return kthread;
|
||||
}
|
||||
|
||||
void free_kthread_struct(struct task_struct *k)
|
||||
{
|
||||
struct kthread *kthread;
|
||||
|
@ -168,8 +187,9 @@ EXPORT_SYMBOL_GPL(kthread_freezable_should_stop);
|
|||
*/
|
||||
void *kthread_func(struct task_struct *task)
|
||||
{
|
||||
if (task->flags & PF_KTHREAD)
|
||||
return to_kthread(task)->threadfn;
|
||||
struct kthread *kthread = __to_kthread(task);
|
||||
if (kthread)
|
||||
return kthread->threadfn;
|
||||
return NULL;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kthread_func);
|
||||
|
@ -199,10 +219,11 @@ EXPORT_SYMBOL_GPL(kthread_data);
|
|||
*/
|
||||
void *kthread_probe_data(struct task_struct *task)
|
||||
{
|
||||
struct kthread *kthread = to_kthread(task);
|
||||
struct kthread *kthread = __to_kthread(task);
|
||||
void *data = NULL;
|
||||
|
||||
copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
|
||||
if (kthread)
|
||||
copy_from_kernel_nofault(&data, &kthread->data, sizeof(data));
|
||||
return data;
|
||||
}
|
||||
|
||||
|
@ -514,9 +535,9 @@ void kthread_set_per_cpu(struct task_struct *k, int cpu)
|
|||
set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
|
||||
}
|
||||
|
||||
bool kthread_is_per_cpu(struct task_struct *k)
|
||||
bool kthread_is_per_cpu(struct task_struct *p)
|
||||
{
|
||||
struct kthread *kthread = to_kthread(k);
|
||||
struct kthread *kthread = __to_kthread(p);
|
||||
if (!kthread)
|
||||
return false;
|
||||
|
||||
|
|
|
@ -31,6 +31,7 @@
|
|||
#include <linux/cn_proc.h>
|
||||
#include <linux/compat.h>
|
||||
#include <linux/sched/signal.h>
|
||||
#include <linux/minmax.h>
|
||||
|
||||
#include <asm/syscall.h> /* for syscall_get_* */
|
||||
|
||||
|
@ -779,6 +780,24 @@ static int ptrace_peek_siginfo(struct task_struct *child,
|
|||
return ret;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_RSEQ
|
||||
static long ptrace_get_rseq_configuration(struct task_struct *task,
|
||||
unsigned long size, void __user *data)
|
||||
{
|
||||
struct ptrace_rseq_configuration conf = {
|
||||
.rseq_abi_pointer = (u64)(uintptr_t)task->rseq,
|
||||
.rseq_abi_size = sizeof(*task->rseq),
|
||||
.signature = task->rseq_sig,
|
||||
.flags = 0,
|
||||
};
|
||||
|
||||
size = min_t(unsigned long, size, sizeof(conf));
|
||||
if (copy_to_user(data, &conf, size))
|
||||
return -EFAULT;
|
||||
return sizeof(conf);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef PTRACE_SINGLESTEP
|
||||
#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
|
||||
#else
|
||||
|
@ -1222,6 +1241,12 @@ int ptrace_request(struct task_struct *child, long request,
|
|||
ret = seccomp_get_metadata(child, addr, datavp);
|
||||
break;
|
||||
|
||||
#ifdef CONFIG_RSEQ
|
||||
case PTRACE_GET_RSEQ_CONFIGURATION:
|
||||
ret = ptrace_get_rseq_configuration(child, addr, datavp);
|
||||
break;
|
||||
#endif
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -84,13 +84,20 @@
|
|||
static int rseq_update_cpu_id(struct task_struct *t)
|
||||
{
|
||||
u32 cpu_id = raw_smp_processor_id();
|
||||
struct rseq __user *rseq = t->rseq;
|
||||
|
||||
if (put_user(cpu_id, &t->rseq->cpu_id_start))
|
||||
return -EFAULT;
|
||||
if (put_user(cpu_id, &t->rseq->cpu_id))
|
||||
return -EFAULT;
|
||||
if (!user_write_access_begin(rseq, sizeof(*rseq)))
|
||||
goto efault;
|
||||
unsafe_put_user(cpu_id, &rseq->cpu_id_start, efault_end);
|
||||
unsafe_put_user(cpu_id, &rseq->cpu_id, efault_end);
|
||||
user_write_access_end();
|
||||
trace_rseq_update(t);
|
||||
return 0;
|
||||
|
||||
efault_end:
|
||||
user_write_access_end();
|
||||
efault:
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
static int rseq_reset_rseq_cpu_id(struct task_struct *t)
|
||||
|
@ -120,8 +127,13 @@ static int rseq_get_rseq_cs(struct task_struct *t, struct rseq_cs *rseq_cs)
|
|||
u32 sig;
|
||||
int ret;
|
||||
|
||||
#ifdef CONFIG_64BIT
|
||||
if (get_user(ptr, &t->rseq->rseq_cs.ptr64))
|
||||
return -EFAULT;
|
||||
#else
|
||||
if (copy_from_user(&ptr, &t->rseq->rseq_cs.ptr64, sizeof(ptr)))
|
||||
return -EFAULT;
|
||||
#endif
|
||||
if (!ptr) {
|
||||
memset(rseq_cs, 0, sizeof(*rseq_cs));
|
||||
return 0;
|
||||
|
@ -204,9 +216,13 @@ static int clear_rseq_cs(struct task_struct *t)
|
|||
*
|
||||
* Set rseq_cs to NULL.
|
||||
*/
|
||||
#ifdef CONFIG_64BIT
|
||||
return put_user(0UL, &t->rseq->rseq_cs.ptr64);
|
||||
#else
|
||||
if (clear_user(&t->rseq->rseq_cs.ptr64, sizeof(t->rseq->rseq_cs.ptr64)))
|
||||
return -EFAULT;
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -266,8 +282,6 @@ void __rseq_handle_notify_resume(struct ksignal *ksig, struct pt_regs *regs)
|
|||
|
||||
if (unlikely(t->flags & PF_EXITING))
|
||||
return;
|
||||
if (unlikely(!access_ok(t->rseq, sizeof(*t->rseq))))
|
||||
goto error;
|
||||
ret = rseq_ip_fixup(regs);
|
||||
if (unlikely(ret < 0))
|
||||
goto error;
|
||||
|
@ -294,8 +308,7 @@ void rseq_syscall(struct pt_regs *regs)
|
|||
|
||||
if (!t->rseq)
|
||||
return;
|
||||
if (!access_ok(t->rseq, sizeof(*t->rseq)) ||
|
||||
rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
|
||||
if (rseq_get_rseq_cs(t, &rseq_cs) || in_rseq_cs(ip, &rseq_cs))
|
||||
force_sig(SIGSEGV);
|
||||
}
|
||||
|
||||
|
|
|
@ -41,7 +41,7 @@
|
|||
* Otherwise it tries to create a semi stable clock from a mixture of other
|
||||
* clocks, including:
|
||||
*
|
||||
* - GTOD (clock monotomic)
|
||||
* - GTOD (clock monotonic)
|
||||
* - sched_clock()
|
||||
* - explicit idle events
|
||||
*
|
||||
|
|
|
@ -58,7 +58,17 @@ const_debug unsigned int sysctl_sched_features =
|
|||
#include "features.h"
|
||||
0;
|
||||
#undef SCHED_FEAT
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Print a warning if need_resched is set for the given duration (if
|
||||
* LATENCY_WARN is enabled).
|
||||
*
|
||||
* If sysctl_resched_latency_warn_once is set, only one warning will be shown
|
||||
* per boot.
|
||||
*/
|
||||
__read_mostly int sysctl_resched_latency_warn_ms = 100;
|
||||
__read_mostly int sysctl_resched_latency_warn_once = 1;
|
||||
#endif /* CONFIG_SCHED_DEBUG */
|
||||
|
||||
/*
|
||||
* Number of tasks to iterate in a single balance run.
|
||||
|
@ -737,7 +747,7 @@ static void nohz_csd_func(void *info)
|
|||
/*
|
||||
* Release the rq::nohz_csd.
|
||||
*/
|
||||
flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
|
||||
flags = atomic_fetch_andnot(NOHZ_KICK_MASK | NOHZ_NEWILB_KICK, nohz_flags(cpu));
|
||||
WARN_ON(!(flags & NOHZ_KICK_MASK));
|
||||
|
||||
rq->idle_balance = idle_cpu(cpu);
|
||||
|
@ -1811,7 +1821,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
|
|||
return cpu_online(cpu);
|
||||
|
||||
/* Regular kernel threads don't get to stay during offline. */
|
||||
if (cpu_rq(cpu)->balance_push)
|
||||
if (cpu_dying(cpu))
|
||||
return false;
|
||||
|
||||
/* But are allowed during online. */
|
||||
|
@ -1926,6 +1936,12 @@ static int migration_cpu_stop(void *data)
|
|||
raw_spin_lock(&p->pi_lock);
|
||||
rq_lock(rq, &rf);
|
||||
|
||||
/*
|
||||
* If we were passed a pending, then ->stop_pending was set, thus
|
||||
* p->migration_pending must have remained stable.
|
||||
*/
|
||||
WARN_ON_ONCE(pending && pending != p->migration_pending);
|
||||
|
||||
/*
|
||||
* If task_rq(p) != rq, it cannot be migrated here, because we're
|
||||
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
|
||||
|
@ -1936,8 +1952,7 @@ static int migration_cpu_stop(void *data)
|
|||
goto out;
|
||||
|
||||
if (pending) {
|
||||
if (p->migration_pending == pending)
|
||||
p->migration_pending = NULL;
|
||||
p->migration_pending = NULL;
|
||||
complete = true;
|
||||
}
|
||||
|
||||
|
@ -1976,8 +1991,7 @@ static int migration_cpu_stop(void *data)
|
|||
* somewhere allowed, we're done.
|
||||
*/
|
||||
if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
|
||||
if (p->migration_pending == pending)
|
||||
p->migration_pending = NULL;
|
||||
p->migration_pending = NULL;
|
||||
complete = true;
|
||||
goto out;
|
||||
}
|
||||
|
@ -2165,16 +2179,21 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
|
|||
*
|
||||
* (1) In the cases covered above. There is one more where the completion is
|
||||
* signaled within affine_move_task() itself: when a subsequent affinity request
|
||||
* cancels the need for an active migration. Consider:
|
||||
* occurs after the stopper bailed out due to the targeted task still being
|
||||
* Migrate-Disable. Consider:
|
||||
*
|
||||
* Initial conditions: P0->cpus_mask = [0, 1]
|
||||
*
|
||||
* P0@CPU0 P1 P2
|
||||
*
|
||||
* migrate_disable();
|
||||
* <preempted>
|
||||
* CPU0 P1 P2
|
||||
* <P0>
|
||||
* migrate_disable();
|
||||
* <preempted>
|
||||
* set_cpus_allowed_ptr(P0, [1]);
|
||||
* <blocks>
|
||||
* <migration/0>
|
||||
* migration_cpu_stop()
|
||||
* is_migration_disabled()
|
||||
* <bails>
|
||||
* set_cpus_allowed_ptr(P0, [0, 1]);
|
||||
* <signal completion>
|
||||
* <awakes>
|
||||
|
@ -4244,8 +4263,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
|
|||
asmlinkage __visible void schedule_tail(struct task_struct *prev)
|
||||
__releases(rq->lock)
|
||||
{
|
||||
struct rq *rq;
|
||||
|
||||
/*
|
||||
* New tasks start with FORK_PREEMPT_COUNT, see there and
|
||||
* finish_task_switch() for details.
|
||||
|
@ -4255,7 +4272,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
|
|||
* PREEMPT_COUNT kernels).
|
||||
*/
|
||||
|
||||
rq = finish_task_switch(prev);
|
||||
finish_task_switch(prev);
|
||||
preempt_enable();
|
||||
|
||||
if (current->set_child_tid)
|
||||
|
@ -4520,6 +4537,55 @@ unsigned long long task_sched_runtime(struct task_struct *p)
|
|||
return ns;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
static u64 cpu_resched_latency(struct rq *rq)
|
||||
{
|
||||
int latency_warn_ms = READ_ONCE(sysctl_resched_latency_warn_ms);
|
||||
u64 resched_latency, now = rq_clock(rq);
|
||||
static bool warned_once;
|
||||
|
||||
if (sysctl_resched_latency_warn_once && warned_once)
|
||||
return 0;
|
||||
|
||||
if (!need_resched() || !latency_warn_ms)
|
||||
return 0;
|
||||
|
||||
if (system_state == SYSTEM_BOOTING)
|
||||
return 0;
|
||||
|
||||
if (!rq->last_seen_need_resched_ns) {
|
||||
rq->last_seen_need_resched_ns = now;
|
||||
rq->ticks_without_resched = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
rq->ticks_without_resched++;
|
||||
resched_latency = now - rq->last_seen_need_resched_ns;
|
||||
if (resched_latency <= latency_warn_ms * NSEC_PER_MSEC)
|
||||
return 0;
|
||||
|
||||
warned_once = true;
|
||||
|
||||
return resched_latency;
|
||||
}
|
||||
|
||||
static int __init setup_resched_latency_warn_ms(char *str)
|
||||
{
|
||||
long val;
|
||||
|
||||
if ((kstrtol(str, 0, &val))) {
|
||||
pr_warn("Unable to set resched_latency_warn_ms\n");
|
||||
return 1;
|
||||
}
|
||||
|
||||
sysctl_resched_latency_warn_ms = val;
|
||||
return 1;
|
||||
}
|
||||
__setup("resched_latency_warn_ms=", setup_resched_latency_warn_ms);
|
||||
#else
|
||||
static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
|
||||
#endif /* CONFIG_SCHED_DEBUG */
|
||||
|
||||
/*
|
||||
* This function gets called by the timer code, with HZ frequency.
|
||||
* We call it with interrupts disabled.
|
||||
|
@ -4531,6 +4597,7 @@ void scheduler_tick(void)
|
|||
struct task_struct *curr = rq->curr;
|
||||
struct rq_flags rf;
|
||||
unsigned long thermal_pressure;
|
||||
u64 resched_latency;
|
||||
|
||||
arch_scale_freq_tick();
|
||||
sched_clock_tick();
|
||||
|
@ -4541,11 +4608,15 @@ void scheduler_tick(void)
|
|||
thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
|
||||
update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
|
||||
curr->sched_class->task_tick(rq, curr, 0);
|
||||
if (sched_feat(LATENCY_WARN))
|
||||
resched_latency = cpu_resched_latency(rq);
|
||||
calc_global_load_tick(rq);
|
||||
psi_task_tick(rq);
|
||||
|
||||
rq_unlock(rq, &rf);
|
||||
|
||||
if (sched_feat(LATENCY_WARN) && resched_latency)
|
||||
resched_latency_warn(cpu, resched_latency);
|
||||
|
||||
perf_event_task_tick();
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
|
@ -5040,6 +5111,9 @@ static void __sched notrace __schedule(bool preempt)
|
|||
next = pick_next_task(rq, prev, &rf);
|
||||
clear_tsk_need_resched(prev);
|
||||
clear_preempt_need_resched();
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
rq->last_seen_need_resched_ns = 0;
|
||||
#endif
|
||||
|
||||
if (likely(prev != next)) {
|
||||
rq->nr_switches++;
|
||||
|
@ -5365,23 +5439,23 @@ enum {
|
|||
preempt_dynamic_full,
|
||||
};
|
||||
|
||||
static int preempt_dynamic_mode = preempt_dynamic_full;
|
||||
int preempt_dynamic_mode = preempt_dynamic_full;
|
||||
|
||||
static int sched_dynamic_mode(const char *str)
|
||||
int sched_dynamic_mode(const char *str)
|
||||
{
|
||||
if (!strcmp(str, "none"))
|
||||
return 0;
|
||||
return preempt_dynamic_none;
|
||||
|
||||
if (!strcmp(str, "voluntary"))
|
||||
return 1;
|
||||
return preempt_dynamic_voluntary;
|
||||
|
||||
if (!strcmp(str, "full"))
|
||||
return 2;
|
||||
return preempt_dynamic_full;
|
||||
|
||||
return -1;
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static void sched_dynamic_update(int mode)
|
||||
void sched_dynamic_update(int mode)
|
||||
{
|
||||
/*
|
||||
* Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
|
||||
|
@ -5438,77 +5512,8 @@ static int __init setup_preempt_mode(char *str)
|
|||
}
|
||||
__setup("preempt=", setup_preempt_mode);
|
||||
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
|
||||
static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
|
||||
size_t cnt, loff_t *ppos)
|
||||
{
|
||||
char buf[16];
|
||||
int mode;
|
||||
|
||||
if (cnt > 15)
|
||||
cnt = 15;
|
||||
|
||||
if (copy_from_user(&buf, ubuf, cnt))
|
||||
return -EFAULT;
|
||||
|
||||
buf[cnt] = 0;
|
||||
mode = sched_dynamic_mode(strstrip(buf));
|
||||
if (mode < 0)
|
||||
return mode;
|
||||
|
||||
sched_dynamic_update(mode);
|
||||
|
||||
*ppos += cnt;
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
static int sched_dynamic_show(struct seq_file *m, void *v)
|
||||
{
|
||||
static const char * preempt_modes[] = {
|
||||
"none", "voluntary", "full"
|
||||
};
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
|
||||
if (preempt_dynamic_mode == i)
|
||||
seq_puts(m, "(");
|
||||
seq_puts(m, preempt_modes[i]);
|
||||
if (preempt_dynamic_mode == i)
|
||||
seq_puts(m, ")");
|
||||
|
||||
seq_puts(m, " ");
|
||||
}
|
||||
|
||||
seq_puts(m, "\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sched_dynamic_open(struct inode *inode, struct file *filp)
|
||||
{
|
||||
return single_open(filp, sched_dynamic_show, NULL);
|
||||
}
|
||||
|
||||
static const struct file_operations sched_dynamic_fops = {
|
||||
.open = sched_dynamic_open,
|
||||
.write = sched_dynamic_write,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = single_release,
|
||||
};
|
||||
|
||||
static __init int sched_init_debug_dynamic(void)
|
||||
{
|
||||
debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops);
|
||||
return 0;
|
||||
}
|
||||
late_initcall(sched_init_debug_dynamic);
|
||||
|
||||
#endif /* CONFIG_SCHED_DEBUG */
|
||||
#endif /* CONFIG_PREEMPT_DYNAMIC */
|
||||
|
||||
|
||||
/*
|
||||
* This is the entry point to schedule() from kernel preemption
|
||||
* off of irq context.
|
||||
|
@ -7633,6 +7638,9 @@ static DEFINE_PER_CPU(struct cpu_stop_work, push_work);
|
|||
|
||||
/*
|
||||
* Ensure we only run per-cpu kthreads once the CPU goes !active.
|
||||
*
|
||||
* This is enabled below SCHED_AP_ACTIVE; when !cpu_active(), but only
|
||||
* effective when the hotplug motion is down.
|
||||
*/
|
||||
static void balance_push(struct rq *rq)
|
||||
{
|
||||
|
@ -7640,11 +7648,18 @@ static void balance_push(struct rq *rq)
|
|||
|
||||
lockdep_assert_held(&rq->lock);
|
||||
SCHED_WARN_ON(rq->cpu != smp_processor_id());
|
||||
|
||||
/*
|
||||
* Ensure the thing is persistent until balance_push_set(.on = false);
|
||||
*/
|
||||
rq->balance_callback = &balance_push_callback;
|
||||
|
||||
/*
|
||||
* Only active while going offline.
|
||||
*/
|
||||
if (!cpu_dying(rq->cpu))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Both the cpu-hotplug and stop task are in this case and are
|
||||
* required to complete the hotplug process.
|
||||
|
@ -7653,7 +7668,7 @@ static void balance_push(struct rq *rq)
|
|||
* histerical raisins.
|
||||
*/
|
||||
if (rq->idle == push_task ||
|
||||
((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) ||
|
||||
kthread_is_per_cpu(push_task) ||
|
||||
is_migration_disabled(push_task)) {
|
||||
|
||||
/*
|
||||
|
@ -7698,7 +7713,6 @@ static void balance_push_set(int cpu, bool on)
|
|||
struct rq_flags rf;
|
||||
|
||||
rq_lock_irqsave(rq, &rf);
|
||||
rq->balance_push = on;
|
||||
if (on) {
|
||||
WARN_ON_ONCE(rq->balance_callback);
|
||||
rq->balance_callback = &balance_push_callback;
|
||||
|
@ -7823,8 +7837,8 @@ int sched_cpu_activate(unsigned int cpu)
|
|||
struct rq_flags rf;
|
||||
|
||||
/*
|
||||
* Make sure that when the hotplug state machine does a roll-back
|
||||
* we clear balance_push. Ideally that would happen earlier...
|
||||
* Clear the balance_push callback and prepare to schedule
|
||||
* regular tasks.
|
||||
*/
|
||||
balance_push_set(cpu, false);
|
||||
|
||||
|
@ -8009,12 +8023,6 @@ int sched_cpu_dying(unsigned int cpu)
|
|||
}
|
||||
rq_unlock_irqrestore(rq, &rf);
|
||||
|
||||
/*
|
||||
* Now that the CPU is offline, make sure we're welcome
|
||||
* to new tasks once we come back up.
|
||||
*/
|
||||
balance_push_set(cpu, false);
|
||||
|
||||
calc_load_migrate(rq);
|
||||
update_max_interval();
|
||||
hrtick_clear(rq);
|
||||
|
@ -8199,7 +8207,7 @@ void __init sched_init(void)
|
|||
rq->sd = NULL;
|
||||
rq->rd = NULL;
|
||||
rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
|
||||
rq->balance_callback = NULL;
|
||||
rq->balance_callback = &balance_push_callback;
|
||||
rq->active_balance = 0;
|
||||
rq->next_balance = jiffies;
|
||||
rq->push_cpu = 0;
|
||||
|
@ -8246,6 +8254,7 @@ void __init sched_init(void)
|
|||
|
||||
#ifdef CONFIG_SMP
|
||||
idle_thread_set_boot_cpu();
|
||||
balance_push_set(smp_processor_id(), false);
|
||||
#endif
|
||||
init_sched_fair_class();
|
||||
|
||||
|
@ -8970,7 +8979,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
|
|||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* Likewise, bound things on the otherside by preventing insane quota
|
||||
* Likewise, bound things on the other side by preventing insane quota
|
||||
* periods. This also allows us to normalize in computing quota
|
||||
* feasibility.
|
||||
*/
|
||||
|
|
|
@ -104,7 +104,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
|
|||
|
||||
/*
|
||||
* We allow index == CPUACCT_STAT_NSTATS here to read
|
||||
* the sum of suages.
|
||||
* the sum of usages.
|
||||
*/
|
||||
BUG_ON(index > CPUACCT_STAT_NSTATS);
|
||||
|
||||
|
|
|
@ -466,7 +466,7 @@ static void sugov_work(struct kthread_work *work)
|
|||
|
||||
/*
|
||||
* Hold sg_policy->update_lock shortly to handle the case where:
|
||||
* incase sg_policy->next_freq is read here, and then updated by
|
||||
* in case sg_policy->next_freq is read here, and then updated by
|
||||
* sugov_deferred_update() just before work_in_progress is set to false
|
||||
* here, we may miss queueing the new update.
|
||||
*
|
||||
|
|
|
@ -77,7 +77,7 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
|
|||
* When looking at the vector, we need to read the counter,
|
||||
* do a memory barrier, then read the mask.
|
||||
*
|
||||
* Note: This is still all racey, but we can deal with it.
|
||||
* Note: This is still all racy, but we can deal with it.
|
||||
* Ideally, we only want to look at masks that are set.
|
||||
*
|
||||
 * If a mask is not set, then the only thing wrong is that we

@@ -186,7 +186,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
 * The cost of this trade-off is not entirely clear and will probably
 * be good for some workloads and bad for others.
 *
- * The main idea here is that if some CPUs were overcommitted, we try
+ * The main idea here is that if some CPUs were over-committed, we try
 * to spread which is what the scheduler traditionally did. Sys admins
 * must do proper RT planning to avoid overloading the system if they
 * really care.

@@ -563,7 +563,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,

 /*
 * If either stime or utime are 0, assume all runtime is userspace.
- * Once a task gets some ticks, the monotonicy code at 'update:'
+ * Once a task gets some ticks, the monotonicity code at 'update:'
 * will ensure things converge to the observed ratio.
 */
 if (stime == 0) {

@@ -245,7 +245,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
 p->dl.dl_non_contending = 0;
 /*
 * If the timer handler is currently running and the
- * timer cannot be cancelled, inactive_task_timer()
+ * timer cannot be canceled, inactive_task_timer()
 * will see that dl_not_contending is not set, and
 * will not touch the rq's active utilization,
 * so we are still safe.

@@ -267,7 +267,7 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
 * fires.
 *
 * If the task wakes up again before the inactive timer fires,
- * the timer is cancelled, whereas if the task wakes up after the
+ * the timer is canceled, whereas if the task wakes up after the
 * inactive timer fired (and running_bw has been decreased) the
 * task's utilization has to be added to running_bw again.
 * A flag in the deadline scheduling entity (dl_non_contending)

@@ -385,7 +385,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags)
 dl_se->dl_non_contending = 0;
 /*
 * If the timer handler is currently running and the
- * timer cannot be cancelled, inactive_task_timer()
+ * timer cannot be canceled, inactive_task_timer()
 * will see that dl_not_contending is not set, and
 * will not touch the rq's active utilization,
 * so we are still safe.

@@ -1206,7 +1206,7 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
 * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
 * multiplied by 2^BW_SHIFT, the result has to be shifted right by
 * BW_SHIFT.
- * Since rq->dl.bw_ratio contains 1 / Umax multipled by 2^RATIO_SHIFT,
+ * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT,
 * dl_bw is multiped by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
 * Since delta is a 64 bit variable, to have an overflow its value
 * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.

@@ -1737,7 +1737,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
 p->dl.dl_non_contending = 0;
 /*
 * If the timer handler is currently running and the
- * timer cannot be cancelled, inactive_task_timer()
+ * timer cannot be canceled, inactive_task_timer()
 * will see that dl_not_contending is not set, and
 * will not touch the rq's active utilization,
 * so we are still safe.

@@ -2745,7 +2745,7 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)

 /*
 * Default limits for DL period; on the top end we guard against small util
- * tasks still getting rediculous long effective runtimes, on the bottom end we
+ * tasks still getting ridiculously long effective runtimes, on the bottom end we
 * guard against timer DoS.
 */
 unsigned int sysctl_sched_dl_period_max = 1 << 22; /* ~4 seconds */
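The bandwidth comments in the hunk above rely on fixed-point arithmetic: utilizations are stored shifted left by BW_SHIFT and the 1/Umax ratio by RATIO_SHIFT, which is where the 2^(64 - 20 - 8) overflow bound comes from. Below is a minimal user-space sketch of that scaling, assuming BW_SHIFT = 20 and RATIO_SHIFT = 8 as implied by that bound; the helper names and the 50% / 95% sample figures are invented for illustration, and this is not the kernel's reclaiming code.

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT	20	/* utilizations are stored as (u << 20) */
#define RATIO_SHIFT	8	/* bw_ratio is (1 / Umax) << 8 */

/* Encode a utilization given as a percentage (illustrative helper). */
static uint64_t to_bw(unsigned int percent)
{
	return ((uint64_t)percent << BW_SHIFT) / 100;
}

/* Scale a runtime delta by a fixed-point utilization and ratio. */
static uint64_t scale_delta(uint64_t delta_ns, uint64_t dl_bw, uint64_t bw_ratio)
{
	/* shift right by BW_SHIFT after the first multiply, by RATIO_SHIFT after the second */
	return ((delta_ns * dl_bw >> BW_SHIFT) * bw_ratio) >> RATIO_SHIFT;
}

int main(void)
{
	uint64_t delta = 2000000;			/* 2ms of consumed runtime */
	uint64_t dl_bw = to_bw(50);			/* hypothetical task utilization: 50% */
	uint64_t ratio = (100ULL << RATIO_SHIFT) / 95;	/* 1/Umax for a hypothetical Umax of 95% */

	printf("scaled delta: %llu ns\n",
	       (unsigned long long)scale_delta(delta, dl_bw, ratio));
	return 0;
}

Keeping delta below 2^(64 - 20 - 8) ns, roughly a minute, keeps the 64-bit intermediate products from overflowing, which is exactly the point the comment makes.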
@ -8,8 +8,6 @@
|
|||
*/
|
||||
#include "sched.h"
|
||||
|
||||
static DEFINE_SPINLOCK(sched_debug_lock);
|
||||
|
||||
/*
|
||||
* This allows printing both to /proc/sched_debug and
|
||||
* to the console
|
||||
|
@ -169,15 +167,169 @@ static const struct file_operations sched_feat_fops = {
|
|||
.release = single_release,
|
||||
};
|
||||
|
||||
__read_mostly bool sched_debug_enabled;
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf,
|
||||
size_t cnt, loff_t *ppos)
|
||||
{
|
||||
char buf[16];
|
||||
|
||||
if (cnt > 15)
|
||||
cnt = 15;
|
||||
|
||||
if (copy_from_user(&buf, ubuf, cnt))
|
||||
return -EFAULT;
|
||||
|
||||
if (kstrtouint(buf, 10, &sysctl_sched_tunable_scaling))
|
||||
return -EINVAL;
|
||||
|
||||
if (sched_update_scaling())
|
||||
return -EINVAL;
|
||||
|
||||
*ppos += cnt;
|
||||
return cnt;
|
||||
}
|
||||
|
||||
static int sched_scaling_show(struct seq_file *m, void *v)
|
||||
{
|
||||
seq_printf(m, "%d\n", sysctl_sched_tunable_scaling);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sched_scaling_open(struct inode *inode, struct file *filp)
|
||||
{
|
||||
return single_open(filp, sched_scaling_show, NULL);
|
||||
}
|
||||
|
||||
static const struct file_operations sched_scaling_fops = {
|
||||
.open = sched_scaling_open,
|
||||
.write = sched_scaling_write,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = single_release,
|
||||
};
|
||||
|
||||
#endif /* SMP */
|
||||
|
||||
#ifdef CONFIG_PREEMPT_DYNAMIC
|
||||
|
||||
static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf,
|
||||
size_t cnt, loff_t *ppos)
|
||||
{
|
||||
char buf[16];
|
||||
int mode;
|
||||
|
||||
if (cnt > 15)
|
||||
cnt = 15;
|
||||
|
||||
if (copy_from_user(&buf, ubuf, cnt))
|
||||
return -EFAULT;
|
||||
|
||||
buf[cnt] = 0;
|
||||
mode = sched_dynamic_mode(strstrip(buf));
|
||||
if (mode < 0)
|
||||
return mode;
|
||||
|
||||
sched_dynamic_update(mode);
|
||||
|
||||
*ppos += cnt;
|
||||
|
||||
return cnt;
|
||||
}
|
||||
|
||||
static int sched_dynamic_show(struct seq_file *m, void *v)
|
||||
{
|
||||
static const char * preempt_modes[] = {
|
||||
"none", "voluntary", "full"
|
||||
};
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) {
|
||||
if (preempt_dynamic_mode == i)
|
||||
seq_puts(m, "(");
|
||||
seq_puts(m, preempt_modes[i]);
|
||||
if (preempt_dynamic_mode == i)
|
||||
seq_puts(m, ")");
|
||||
|
||||
seq_puts(m, " ");
|
||||
}
|
||||
|
||||
seq_puts(m, "\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sched_dynamic_open(struct inode *inode, struct file *filp)
|
||||
{
|
||||
return single_open(filp, sched_dynamic_show, NULL);
|
||||
}
|
||||
|
||||
static const struct file_operations sched_dynamic_fops = {
|
||||
.open = sched_dynamic_open,
|
||||
.write = sched_dynamic_write,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = single_release,
|
||||
};
|
||||
|
||||
#endif /* CONFIG_PREEMPT_DYNAMIC */
|
||||
|
||||
__read_mostly bool sched_debug_verbose;
|
||||
|
||||
static const struct seq_operations sched_debug_sops;
|
||||
|
||||
static int sched_debug_open(struct inode *inode, struct file *filp)
|
||||
{
|
||||
return seq_open(filp, &sched_debug_sops);
|
||||
}
|
||||
|
||||
static const struct file_operations sched_debug_fops = {
|
||||
.open = sched_debug_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = seq_release,
|
||||
};
|
||||
|
||||
static struct dentry *debugfs_sched;
|
||||
|
||||
static __init int sched_init_debug(void)
|
||||
{
|
||||
debugfs_create_file("sched_features", 0644, NULL, NULL,
|
||||
&sched_feat_fops);
|
||||
struct dentry __maybe_unused *numa;
|
||||
|
||||
debugfs_create_bool("sched_debug", 0644, NULL,
|
||||
&sched_debug_enabled);
|
||||
debugfs_sched = debugfs_create_dir("sched", NULL);
|
||||
|
||||
debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
|
||||
debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose);
|
||||
#ifdef CONFIG_PREEMPT_DYNAMIC
|
||||
debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
|
||||
#endif
|
||||
|
||||
debugfs_create_u32("latency_ns", 0644, debugfs_sched, &sysctl_sched_latency);
|
||||
debugfs_create_u32("min_granularity_ns", 0644, debugfs_sched, &sysctl_sched_min_granularity);
|
||||
debugfs_create_u32("wakeup_granularity_ns", 0644, debugfs_sched, &sysctl_sched_wakeup_granularity);
|
||||
|
||||
debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms);
|
||||
debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once);
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops);
|
||||
debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost);
|
||||
debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate);
|
||||
|
||||
mutex_lock(&sched_domains_mutex);
|
||||
update_sched_domain_debugfs();
|
||||
mutex_unlock(&sched_domains_mutex);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
numa = debugfs_create_dir("numa_balancing", debugfs_sched);
|
||||
|
||||
debugfs_create_u32("scan_delay_ms", 0644, numa, &sysctl_numa_balancing_scan_delay);
|
||||
debugfs_create_u32("scan_period_min_ms", 0644, numa, &sysctl_numa_balancing_scan_period_min);
|
||||
debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
|
||||
debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
|
||||
#endif
|
||||
|
||||
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -185,229 +337,88 @@ late_initcall(sched_init_debug);
|
|||
|
||||
#ifdef CONFIG_SMP
|
||||
|
||||
#ifdef CONFIG_SYSCTL
|
||||
static cpumask_var_t sd_sysctl_cpus;
|
||||
static struct dentry *sd_dentry;
|
||||
|
||||
static struct ctl_table sd_ctl_dir[] = {
|
||||
{
|
||||
.procname = "sched_domain",
|
||||
.mode = 0555,
|
||||
},
|
||||
{}
|
||||
};
|
||||
|
||||
static struct ctl_table sd_ctl_root[] = {
|
||||
{
|
||||
.procname = "kernel",
|
||||
.mode = 0555,
|
||||
.child = sd_ctl_dir,
|
||||
},
|
||||
{}
|
||||
};
|
||||
|
||||
static struct ctl_table *sd_alloc_ctl_entry(int n)
|
||||
static int sd_flags_show(struct seq_file *m, void *v)
|
||||
{
|
||||
struct ctl_table *entry =
|
||||
kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
|
||||
|
||||
return entry;
|
||||
}
|
||||
|
||||
static void sd_free_ctl_entry(struct ctl_table **tablep)
|
||||
{
|
||||
struct ctl_table *entry;
|
||||
|
||||
/*
|
||||
* In the intermediate directories, both the child directory and
|
||||
* procname are dynamically allocated and could fail but the mode
|
||||
* will always be set. In the lowest directory the names are
|
||||
* static strings and all have proc handlers.
|
||||
*/
|
||||
for (entry = *tablep; entry->mode; entry++) {
|
||||
if (entry->child)
|
||||
sd_free_ctl_entry(&entry->child);
|
||||
if (entry->proc_handler == NULL)
|
||||
kfree(entry->procname);
|
||||
}
|
||||
|
||||
kfree(*tablep);
|
||||
*tablep = NULL;
|
||||
}
|
||||
|
||||
static void
|
||||
set_table_entry(struct ctl_table *entry,
|
||||
const char *procname, void *data, int maxlen,
|
||||
umode_t mode, proc_handler *proc_handler)
|
||||
{
|
||||
entry->procname = procname;
|
||||
entry->data = data;
|
||||
entry->maxlen = maxlen;
|
||||
entry->mode = mode;
|
||||
entry->proc_handler = proc_handler;
|
||||
}
|
||||
|
||||
static int sd_ctl_doflags(struct ctl_table *table, int write,
|
||||
void *buffer, size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
unsigned long flags = *(unsigned long *)table->data;
|
||||
size_t data_size = 0;
|
||||
size_t len = 0;
|
||||
char *tmp, *buf;
|
||||
unsigned long flags = *(unsigned int *)m->private;
|
||||
int idx;
|
||||
|
||||
if (write)
|
||||
return 0;
|
||||
|
||||
for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
|
||||
char *name = sd_flag_debug[idx].name;
|
||||
|
||||
/* Name plus whitespace */
|
||||
data_size += strlen(name) + 1;
|
||||
seq_puts(m, sd_flag_debug[idx].name);
|
||||
seq_puts(m, " ");
|
||||
}
|
||||
|
||||
if (*ppos > data_size) {
|
||||
*lenp = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
buf = kcalloc(data_size + 1, sizeof(*buf), GFP_KERNEL);
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
|
||||
for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
|
||||
char *name = sd_flag_debug[idx].name;
|
||||
|
||||
len += snprintf(buf + len, strlen(name) + 2, "%s ", name);
|
||||
}
|
||||
|
||||
tmp = buf + *ppos;
|
||||
len -= *ppos;
|
||||
|
||||
if (len > *lenp)
|
||||
len = *lenp;
|
||||
if (len)
|
||||
memcpy(buffer, tmp, len);
|
||||
if (len < *lenp) {
|
||||
((char *)buffer)[len] = '\n';
|
||||
len++;
|
||||
}
|
||||
|
||||
*lenp = len;
|
||||
*ppos += len;
|
||||
|
||||
kfree(buf);
|
||||
seq_puts(m, "\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct ctl_table *
|
||||
sd_alloc_ctl_domain_table(struct sched_domain *sd)
|
||||
static int sd_flags_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
struct ctl_table *table = sd_alloc_ctl_entry(9);
|
||||
|
||||
if (table == NULL)
|
||||
return NULL;
|
||||
|
||||
set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax);
|
||||
set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax);
|
||||
set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
|
||||
set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
|
||||
set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
|
||||
set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, sd_ctl_doflags);
|
||||
set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
|
||||
set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
|
||||
/* &table[8] is terminator */
|
||||
|
||||
return table;
|
||||
return single_open(file, sd_flags_show, inode->i_private);
|
||||
}
|
||||
|
||||
static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
|
||||
static const struct file_operations sd_flags_fops = {
|
||||
.open = sd_flags_open,
|
||||
.read = seq_read,
|
||||
.llseek = seq_lseek,
|
||||
.release = single_release,
|
||||
};
|
||||
|
||||
static void register_sd(struct sched_domain *sd, struct dentry *parent)
|
||||
{
|
||||
struct ctl_table *entry, *table;
|
||||
struct sched_domain *sd;
|
||||
int domain_num = 0, i;
|
||||
char buf[32];
|
||||
#define SDM(type, mode, member) \
|
||||
debugfs_create_##type(#member, mode, parent, &sd->member)
|
||||
|
||||
for_each_domain(cpu, sd)
|
||||
domain_num++;
|
||||
entry = table = sd_alloc_ctl_entry(domain_num + 1);
|
||||
if (table == NULL)
|
||||
return NULL;
|
||||
SDM(ulong, 0644, min_interval);
|
||||
SDM(ulong, 0644, max_interval);
|
||||
SDM(u64, 0644, max_newidle_lb_cost);
|
||||
SDM(u32, 0644, busy_factor);
|
||||
SDM(u32, 0644, imbalance_pct);
|
||||
SDM(u32, 0644, cache_nice_tries);
|
||||
SDM(str, 0444, name);
|
||||
|
||||
i = 0;
|
||||
for_each_domain(cpu, sd) {
|
||||
snprintf(buf, 32, "domain%d", i);
|
||||
entry->procname = kstrdup(buf, GFP_KERNEL);
|
||||
entry->mode = 0555;
|
||||
entry->child = sd_alloc_ctl_domain_table(sd);
|
||||
entry++;
|
||||
i++;
|
||||
}
|
||||
return table;
|
||||
#undef SDM
|
||||
|
||||
debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops);
|
||||
}
|
||||
|
||||
static cpumask_var_t sd_sysctl_cpus;
|
||||
static struct ctl_table_header *sd_sysctl_header;
|
||||
|
||||
void register_sched_domain_sysctl(void)
|
||||
void update_sched_domain_debugfs(void)
|
||||
{
|
||||
static struct ctl_table *cpu_entries;
|
||||
static struct ctl_table **cpu_idx;
|
||||
static bool init_done = false;
|
||||
char buf[32];
|
||||
int i;
|
||||
|
||||
if (!cpu_entries) {
|
||||
cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1);
|
||||
if (!cpu_entries)
|
||||
return;
|
||||
|
||||
WARN_ON(sd_ctl_dir[0].child);
|
||||
sd_ctl_dir[0].child = cpu_entries;
|
||||
}
|
||||
|
||||
if (!cpu_idx) {
|
||||
struct ctl_table *e = cpu_entries;
|
||||
|
||||
cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL);
|
||||
if (!cpu_idx)
|
||||
return;
|
||||
|
||||
/* deal with sparse possible map */
|
||||
for_each_possible_cpu(i) {
|
||||
cpu_idx[i] = e;
|
||||
e++;
|
||||
}
|
||||
}
|
||||
int cpu, i;
|
||||
|
||||
if (!cpumask_available(sd_sysctl_cpus)) {
|
||||
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
|
||||
return;
|
||||
}
|
||||
|
||||
if (!init_done) {
|
||||
init_done = true;
|
||||
/* init to possible to not have holes in @cpu_entries */
|
||||
cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
|
||||
}
|
||||
|
||||
for_each_cpu(i, sd_sysctl_cpus) {
|
||||
struct ctl_table *e = cpu_idx[i];
|
||||
if (!sd_dentry)
|
||||
sd_dentry = debugfs_create_dir("domains", debugfs_sched);
|
||||
|
||||
if (e->child)
|
||||
sd_free_ctl_entry(&e->child);
|
||||
for_each_cpu(cpu, sd_sysctl_cpus) {
|
||||
struct sched_domain *sd;
|
||||
struct dentry *d_cpu;
|
||||
char buf[32];
|
||||
|
||||
if (!e->procname) {
|
||||
snprintf(buf, 32, "cpu%d", i);
|
||||
e->procname = kstrdup(buf, GFP_KERNEL);
|
||||
snprintf(buf, sizeof(buf), "cpu%d", cpu);
|
||||
debugfs_remove(debugfs_lookup(buf, sd_dentry));
|
||||
d_cpu = debugfs_create_dir(buf, sd_dentry);
|
||||
|
||||
i = 0;
|
||||
for_each_domain(cpu, sd) {
|
||||
struct dentry *d_sd;
|
||||
|
||||
snprintf(buf, sizeof(buf), "domain%d", i);
|
||||
d_sd = debugfs_create_dir(buf, d_cpu);
|
||||
|
||||
register_sd(sd, d_sd);
|
||||
i++;
|
||||
}
|
||||
e->mode = 0555;
|
||||
e->child = sd_alloc_ctl_cpu_table(i);
|
||||
|
||||
__cpumask_clear_cpu(i, sd_sysctl_cpus);
|
||||
__cpumask_clear_cpu(cpu, sd_sysctl_cpus);
|
||||
}
|
||||
|
||||
WARN_ON(sd_sysctl_header);
|
||||
sd_sysctl_header = register_sysctl_table(sd_ctl_root);
|
||||
}
|
||||
|
||||
void dirty_sched_domain_sysctl(int cpu)
|
||||
|
@ -416,13 +427,6 @@ void dirty_sched_domain_sysctl(int cpu)
|
|||
__cpumask_set_cpu(cpu, sd_sysctl_cpus);
|
||||
}
|
||||
|
||||
/* may be called multiple times per register */
|
||||
void unregister_sched_domain_sysctl(void)
|
||||
{
|
||||
unregister_sysctl_table(sd_sysctl_header);
|
||||
sd_sysctl_header = NULL;
|
||||
}
|
||||
#endif /* CONFIG_SYSCTL */
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
|
@ -470,16 +474,37 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
|
|||
#endif
|
||||
|
||||
#ifdef CONFIG_CGROUP_SCHED
|
||||
static DEFINE_SPINLOCK(sched_debug_lock);
|
||||
static char group_path[PATH_MAX];
|
||||
|
||||
static char *task_group_path(struct task_group *tg)
|
||||
static void task_group_path(struct task_group *tg, char *path, int plen)
|
||||
{
|
||||
if (autogroup_path(tg, group_path, PATH_MAX))
|
||||
return group_path;
|
||||
if (autogroup_path(tg, path, plen))
|
||||
return;
|
||||
|
||||
cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
|
||||
cgroup_path(tg->css.cgroup, path, plen);
|
||||
}
|
||||
|
||||
return group_path;
|
||||
/*
|
||||
* Only 1 SEQ_printf_task_group_path() caller can use the full length
|
||||
* group_path[] for cgroup path. Other simultaneous callers will have
|
||||
* to use a shorter stack buffer. A "..." suffix is appended at the end
|
||||
* of the stack buffer so that it will show up in case the output length
|
||||
* matches the given buffer size to indicate possible path name truncation.
|
||||
*/
|
||||
#define SEQ_printf_task_group_path(m, tg, fmt...) \
|
||||
{ \
|
||||
if (spin_trylock(&sched_debug_lock)) { \
|
||||
task_group_path(tg, group_path, sizeof(group_path)); \
|
||||
SEQ_printf(m, fmt, group_path); \
|
||||
spin_unlock(&sched_debug_lock); \
|
||||
} else { \
|
||||
char buf[128]; \
|
||||
char *bufend = buf + sizeof(buf) - 3; \
|
||||
task_group_path(tg, buf, bufend - buf); \
|
||||
strcpy(bufend - 1, "..."); \
|
||||
SEQ_printf(m, fmt, buf); \
|
||||
} \
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -506,7 +531,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
|
|||
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
|
||||
#endif
|
||||
#ifdef CONFIG_CGROUP_SCHED
|
||||
SEQ_printf(m, " %s", task_group_path(task_group(p)));
|
||||
SEQ_printf_task_group_path(m, task_group(p), " %s")
|
||||
#endif
|
||||
|
||||
SEQ_printf(m, "\n");
|
||||
|
@ -543,7 +568,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
|
|||
|
||||
#ifdef CONFIG_FAIR_GROUP_SCHED
|
||||
SEQ_printf(m, "\n");
|
||||
SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
|
||||
SEQ_printf_task_group_path(m, cfs_rq->tg, "cfs_rq[%d]:%s\n", cpu);
|
||||
#else
|
||||
SEQ_printf(m, "\n");
|
||||
SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
|
||||
|
@ -614,7 +639,7 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
|
|||
{
|
||||
#ifdef CONFIG_RT_GROUP_SCHED
|
||||
SEQ_printf(m, "\n");
|
||||
SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
|
||||
SEQ_printf_task_group_path(m, rt_rq->tg, "rt_rq[%d]:%s\n", cpu);
|
||||
#else
|
||||
SEQ_printf(m, "\n");
|
||||
SEQ_printf(m, "rt_rq[%d]:\n", cpu);
|
||||
|
@ -666,7 +691,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
|
|||
static void print_cpu(struct seq_file *m, int cpu)
|
||||
{
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
unsigned long flags;
|
||||
|
||||
#ifdef CONFIG_X86
|
||||
{
|
||||
|
@ -717,13 +741,11 @@ do { \
|
|||
}
|
||||
#undef P
|
||||
|
||||
spin_lock_irqsave(&sched_debug_lock, flags);
|
||||
print_cfs_stats(m, cpu);
|
||||
print_rt_stats(m, cpu);
|
||||
print_dl_stats(m, cpu);
|
||||
|
||||
print_rq(m, rq, cpu);
|
||||
spin_unlock_irqrestore(&sched_debug_lock, flags);
|
||||
SEQ_printf(m, "\n");
|
||||
}
|
||||
|
||||
|
@ -815,7 +837,7 @@ void sysrq_sched_debug_show(void)
|
|||
}
|
||||
|
||||
/*
|
||||
* This itererator needs some explanation.
|
||||
* This iterator needs some explanation.
|
||||
* It returns 1 for the header position.
|
||||
* This means 2 is CPU 0.
|
||||
* In a hotplugged system some CPUs, including CPU 0, may be missing so we have
|
||||
|
@ -860,15 +882,6 @@ static const struct seq_operations sched_debug_sops = {
|
|||
.show = sched_debug_show,
|
||||
};
|
||||
|
||||
static int __init init_sched_debug_procfs(void)
|
||||
{
|
||||
if (!proc_create_seq("sched_debug", 0444, NULL, &sched_debug_sops))
|
||||
return -ENOMEM;
|
||||
return 0;
|
||||
}
|
||||
|
||||
__initcall(init_sched_debug_procfs);
|
||||
|
||||
#define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F))
|
||||
#define __P(F) __PS(#F, F)
|
||||
#define P(F) __PS(#F, p->F)
|
||||
|
@ -1033,3 +1046,13 @@ void proc_sched_set_task(struct task_struct *p)
|
|||
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
|
||||
#endif
|
||||
}
|
||||
|
||||
void resched_latency_warn(int cpu, u64 latency)
|
||||
{
|
||||
static DEFINE_RATELIMIT_STATE(latency_check_ratelimit, 60 * 60 * HZ, 1);
|
||||
|
||||
WARN(__ratelimit(&latency_check_ratelimit),
|
||||
"sched: CPU %d need_resched set for > %llu ns (%d ticks) "
|
||||
"without schedule\n",
|
||||
cpu, latency, cpu_rq(cpu)->ticks_without_resched);
|
||||
}
|
||||
|
|
|
@@ -49,7 +49,7 @@ static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
 *
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
 */
- enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
+ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;

 /*
 * Minimal preemption granularity for CPU-bound tasks:

@@ -113,6 +113,13 @@ int __weak arch_asym_cpu_priority(int cpu)
 */
 #define fits_capacity(cap, max) ((cap) * 1280 < (max) * 1024)

+ /*
+ * The margin used when comparing CPU capacities.
+ * is 'cap1' noticeably greater than 'cap2'
+ *
+ * (default: ~5%)
+ */
+ #define capacity_greater(cap1, cap2) ((cap1) * 1024 > (cap2) * 1078)
 #endif

 #ifdef CONFIG_CFS_BANDWIDTH
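The two macros in the hunk above encode their margins as integer ratios: fits_capacity() demands roughly 25% headroom on top of the utilization (1280/1024), and capacity_greater() only reports true when one capacity exceeds the other by about 5% (1078/1024). A small stand-alone program that copies the macros verbatim; the sample capacity and utilization values are invented for illustration.

#include <stdio.h>

#define fits_capacity(cap, max)	((cap) * 1280 < (max) * 1024)
#define capacity_greater(cap1, cap2)	((cap1) * 1024 > (cap2) * 1078)

int main(void)
{
	unsigned long util = 800, big = 1024, little = 446;

	/* 800 * 1.25 = 1000 still fits below 1024, but not below 446 */
	printf("util %lu fits in %lu: %d\n", util, big, fits_capacity(util, big));
	printf("util %lu fits in %lu: %d\n", util, little, fits_capacity(util, little));

	/* a ~2.5% difference (1050 vs 1024) stays under the ~5% margin */
	printf("%lu noticeably greater than %lu: %d\n", big, little, capacity_greater(big, little));
	printf("%lu noticeably greater than %lu: %d\n", 1050UL, big, capacity_greater(1050UL, big));
	return 0;
}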
@@ -229,22 +236,25 @@ static void __update_inv_weight(struct load_weight *lw)
 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
 {
 u64 fact = scale_load_down(weight);
+ u32 fact_hi = (u32)(fact >> 32);
 int shift = WMULT_SHIFT;
+ int fs;

 __update_inv_weight(lw);

- if (unlikely(fact >> 32)) {
- while (fact >> 32) {
- fact >>= 1;
- shift--;
- }
+ if (unlikely(fact_hi)) {
+ fs = fls(fact_hi);
+ shift -= fs;
+ fact >>= fs;
 }

 fact = mul_u32_u32(fact, lw->inv_weight);

- while (fact >> 32) {
- fact >>= 1;
- shift--;
+ fact_hi = (u32)(fact >> 32);
+ if (fact_hi) {
+ fs = fls(fact_hi);
+ shift -= fs;
+ fact >>= fs;
 }

 return mul_u64_u32_shr(delta_exec, fact, shift);
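The hunk above replaces the old shift-one-bit-at-a-time normalization loops with a single shift by fls() of the upper 32 bits. Below is a quick user-space check that the two forms produce the same fact/shift pair; the fls() helper mimics the kernel's semantics via __builtin_clz() and the sample values are arbitrary.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* fls(): index of the most significant set bit, 1-based, 0 for x == 0 */
static int fls(uint32_t x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

/* old style: drop one bit at a time until the value fits in 32 bits */
static void normalize_old(uint64_t *fact, int *shift)
{
	while (*fact >> 32) {
		*fact >>= 1;
		(*shift)--;
	}
}

/* new style: compute the whole shift at once from the high word */
static void normalize_new(uint64_t *fact, int *shift)
{
	uint32_t fact_hi = (uint32_t)(*fact >> 32);

	if (fact_hi) {
		int fs = fls(fact_hi);

		*shift -= fs;
		*fact >>= fs;
	}
}

int main(void)
{
	uint64_t samples[] = { 1, 0x1ffffffffULL, 0xdeadbeefcafeULL };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		uint64_t f1 = samples[i], f2 = samples[i];
		int s1 = 32, s2 = 32;

		normalize_old(&f1, &s1);
		normalize_new(&f2, &s2);
		assert(f1 == f2 && s1 == s2);
		printf("0x%llx -> fact=0x%llx shift=%d\n",
		       (unsigned long long)samples[i], (unsigned long long)f1, s1);
	}
	return 0;
}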
@ -624,15 +634,10 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
|
|||
* Scheduling class statistics methods:
|
||||
*/
|
||||
|
||||
int sched_proc_update_handler(struct ctl_table *table, int write,
|
||||
void *buffer, size_t *lenp, loff_t *ppos)
|
||||
int sched_update_scaling(void)
|
||||
{
|
||||
int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
|
||||
unsigned int factor = get_update_sysctl_factor();
|
||||
|
||||
if (ret || !write)
|
||||
return ret;
|
||||
|
||||
sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
|
||||
sysctl_sched_min_granularity);
|
||||
|
||||
|
@ -682,7 +687,13 @@ static u64 __sched_period(unsigned long nr_running)
|
|||
*/
|
||||
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
{
|
||||
u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
|
||||
unsigned int nr_running = cfs_rq->nr_running;
|
||||
u64 slice;
|
||||
|
||||
if (sched_feat(ALT_PERIOD))
|
||||
nr_running = rq_of(cfs_rq)->cfs.h_nr_running;
|
||||
|
||||
slice = __sched_period(nr_running + !se->on_rq);
|
||||
|
||||
for_each_sched_entity(se) {
|
||||
struct load_weight *load;
|
||||
|
@ -699,6 +710,10 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|||
}
|
||||
slice = __calc_delta(slice, se->load.weight, load);
|
||||
}
|
||||
|
||||
if (sched_feat(BASE_SLICE))
|
||||
slice = max(slice, (u64)sysctl_sched_min_granularity);
|
||||
|
||||
return slice;
|
||||
}
|
||||
|
||||
|
@ -1122,7 +1137,7 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
|
|||
return rss / nr_scan_pages;
|
||||
}
|
||||
|
||||
/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
|
||||
/* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
|
||||
#define MAX_SCAN_WINDOW 2560
|
||||
|
||||
static unsigned int task_scan_min(struct task_struct *p)
|
||||
|
@ -2574,7 +2589,7 @@ no_join:
|
|||
}
|
||||
|
||||
/*
|
||||
* Get rid of NUMA staticstics associated with a task (either current or dead).
|
||||
* Get rid of NUMA statistics associated with a task (either current or dead).
|
||||
* If @final is set, the task is dead and has reached refcount zero, so we can
|
||||
* safely free all relevant data structures. Otherwise, there might be
|
||||
* concurrent reads from places like load balancing and procfs, and we should
|
||||
|
@@ -3941,13 +3956,15 @@ static inline void util_est_dequeue(struct cfs_rq *cfs_rq,
 trace_sched_util_est_cfs_tp(cfs_rq);
 }

+ #define UTIL_EST_MARGIN (SCHED_CAPACITY_SCALE / 100)
+
 /*
 * Check if a (signed) value is within a specified (unsigned) margin,
 * based on the observation that:
 *
 * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
 *
- * NOTE: this only works when value + maring < INT_MAX.
+ * NOTE: this only works when value + margin < INT_MAX.
 */
 static inline bool within_margin(int value, int margin)
 {

@@ -3958,7 +3975,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 struct task_struct *p,
 bool task_sleep)
 {
- long last_ewma_diff;
+ long last_ewma_diff, last_enqueued_diff;
 struct util_est ue;

 if (!sched_feat(UTIL_EST))

@@ -3979,6 +3996,8 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 if (ue.enqueued & UTIL_AVG_UNCHANGED)
 return;

+ last_enqueued_diff = ue.enqueued;
+
 /*
 * Reset EWMA on utilization increases, the moving average is used only
 * to smooth utilization decreases.

@@ -3992,12 +4011,17 @@ static inline void util_est_update(struct cfs_rq *cfs_rq,
 }

 /*
- * Skip update of task's estimated utilization when its EWMA is
+ * Skip update of task's estimated utilization when its members are
 * already ~1% close to its last activation value.
 */
 last_ewma_diff = ue.enqueued - ue.ewma;
- if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
+ last_enqueued_diff -= ue.enqueued;
+ if (within_margin(last_ewma_diff, UTIL_EST_MARGIN)) {
+ if (!within_margin(last_enqueued_diff, UTIL_EST_MARGIN))
+ goto done;
+
 return;
+ }

 /*
 * To avoid overestimation of actual task utilization, skip updates if

@@ -4244,7 +4268,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 /*
 * When bandwidth control is enabled, cfs might have been removed
 * because of a parent been throttled but cfs->nr_running > 1. Try to
- * add it unconditionnally.
+ * add it unconditionally.
 */
 if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
 list_add_leaf_cfs_rq(cfs_rq);
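The comment block above documents a branch-free absolute-value comparison: abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1). Below is a user-space sketch that checks this identity against abs(); within_margin() here is written directly from that identity, the loop is just test scaffolding, and the margin value is a stand-in (UTIL_EST_MARGIN itself is SCHED_CAPACITY_SCALE / 100, i.e. about 1% of 1024).

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

static inline bool within_margin(int value, int margin)
{
	return (unsigned int)(value + margin - 1) < (unsigned int)(2 * margin - 1);
}

int main(void)
{
	int margin = 10;	/* stand-in for UTIL_EST_MARGIN (1024 / 100 == 10) */

	for (int x = -50; x <= 50; x++)
		assert(within_margin(x, margin) == (abs(x) < margin));

	printf("within_margin() agrees with abs() for margin=%d\n", margin);
	return 0;
}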
@@ -5299,7 +5323,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 * bits doesn't do much.
 */

- /* cpu online calback */
+ /* cpu online callback */
 static void __maybe_unused update_runtime_enabled(struct rq *rq)
 {
 struct task_group *tg;
@ -6098,6 +6122,24 @@ static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpu
|
|||
return -1;
|
||||
}
|
||||
|
||||
/*
|
||||
* Scan the local SMT mask for idle CPUs.
|
||||
*/
|
||||
static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
for_each_cpu(cpu, cpu_smt_mask(target)) {
|
||||
if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
|
||||
!cpumask_test_cpu(cpu, sched_domain_span(sd)))
|
||||
continue;
|
||||
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
|
||||
return cpu;
|
||||
}
|
||||
|
||||
return -1;
|
||||
}
|
||||
|
||||
#else /* CONFIG_SCHED_SMT */
|
||||
|
||||
static inline void set_idle_cores(int cpu, int val)
|
||||
|
@ -6114,6 +6156,11 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
|
|||
return __select_idle_cpu(core);
|
||||
}
|
||||
|
||||
static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
|
||||
{
|
||||
return -1;
|
||||
}
|
||||
|
||||
#endif /* CONFIG_SCHED_SMT */
|
||||
|
||||
/*
|
||||
|
@ -6121,11 +6168,10 @@ static inline int select_idle_core(struct task_struct *p, int core, struct cpuma
|
|||
* comparing the average scan cost (tracked in sd->avg_scan_cost) against the
|
||||
* average idle time for this rq (as found in rq->avg_idle).
|
||||
*/
|
||||
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
|
||||
static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool has_idle_core, int target)
|
||||
{
|
||||
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
|
||||
int i, cpu, idle_cpu = -1, nr = INT_MAX;
|
||||
bool smt = test_idle_cores(target, false);
|
||||
int this = smp_processor_id();
|
||||
struct sched_domain *this_sd;
|
||||
u64 time;
|
||||
|
@ -6136,7 +6182,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
|
|||
|
||||
cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
|
||||
|
||||
if (sched_feat(SIS_PROP) && !smt) {
|
||||
if (sched_feat(SIS_PROP) && !has_idle_core) {
|
||||
u64 avg_cost, avg_idle, span_avg;
|
||||
|
||||
/*
|
||||
|
@ -6156,7 +6202,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
|
|||
}
|
||||
|
||||
for_each_cpu_wrap(cpu, cpus, target) {
|
||||
if (smt) {
|
||||
if (has_idle_core) {
|
||||
i = select_idle_core(p, cpu, cpus, &idle_cpu);
|
||||
if ((unsigned int)i < nr_cpumask_bits)
|
||||
return i;
|
||||
|
@ -6170,10 +6216,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
|
|||
}
|
||||
}
|
||||
|
||||
if (smt)
|
||||
if (has_idle_core)
|
||||
set_idle_cores(this, false);
|
||||
|
||||
if (sched_feat(SIS_PROP) && !smt) {
|
||||
if (sched_feat(SIS_PROP) && !has_idle_core) {
|
||||
time = cpu_clock(this) - time;
|
||||
update_avg(&this_sd->avg_scan_cost, time);
|
||||
}
|
||||
|
@ -6228,6 +6274,7 @@ static inline bool asym_fits_capacity(int task_util, int cpu)
|
|||
*/
|
||||
static int select_idle_sibling(struct task_struct *p, int prev, int target)
|
||||
{
|
||||
bool has_idle_core = false;
|
||||
struct sched_domain *sd;
|
||||
unsigned long task_util;
|
||||
int i, recent_used_cpu;
|
||||
|
@ -6307,7 +6354,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
|
|||
if (!sd)
|
||||
return target;
|
||||
|
||||
i = select_idle_cpu(p, sd, target);
|
||||
if (sched_smt_active()) {
|
||||
has_idle_core = test_idle_cores(target, false);
|
||||
|
||||
if (!has_idle_core && cpus_share_cache(prev, target)) {
|
||||
i = select_idle_smt(p, sd, prev);
|
||||
if ((unsigned int)i < nr_cpumask_bits)
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
||||
i = select_idle_cpu(p, sd, has_idle_core, target);
|
||||
if ((unsigned)i < nr_cpumask_bits)
|
||||
return i;
|
||||
|
||||
|
@ -6471,7 +6528,7 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
|
|||
* util_avg should already be correct.
|
||||
*/
|
||||
if (task_cpu(p) == cpu && dst_cpu != cpu)
|
||||
sub_positive(&util, task_util(p));
|
||||
lsub_positive(&util, task_util(p));
|
||||
else if (task_cpu(p) != cpu && dst_cpu == cpu)
|
||||
util += task_util(p);
|
||||
|
||||
|
@ -6518,8 +6575,24 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
|
|||
* its pd list and will not be accounted by compute_energy().
|
||||
*/
|
||||
for_each_cpu_and(cpu, pd_mask, cpu_online_mask) {
|
||||
unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu);
|
||||
struct task_struct *tsk = cpu == dst_cpu ? p : NULL;
|
||||
unsigned long util_freq = cpu_util_next(cpu, p, dst_cpu);
|
||||
unsigned long cpu_util, util_running = util_freq;
|
||||
struct task_struct *tsk = NULL;
|
||||
|
||||
/*
|
||||
* When @p is placed on @cpu:
|
||||
*
|
||||
* util_running = max(cpu_util, cpu_util_est) +
|
||||
* max(task_util, _task_util_est)
|
||||
*
|
||||
* while cpu_util_next is: max(cpu_util + task_util,
|
||||
* cpu_util_est + _task_util_est)
|
||||
*/
|
||||
if (cpu == dst_cpu) {
|
||||
tsk = p;
|
||||
util_running =
|
||||
cpu_util_next(cpu, p, -1) + task_util_est(p);
|
||||
}
|
||||
|
||||
/*
|
||||
* Busy time computation: utilization clamping is not
|
||||
|
@ -6527,7 +6600,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
|
|||
* is already enough to scale the EM reported power
|
||||
* consumption at the (eventually clamped) cpu_capacity.
|
||||
*/
|
||||
sum_util += effective_cpu_util(cpu, util_cfs, cpu_cap,
|
||||
sum_util += effective_cpu_util(cpu, util_running, cpu_cap,
|
||||
ENERGY_UTIL, NULL);
|
||||
|
||||
/*
|
||||
|
@ -6537,7 +6610,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
|
|||
* NOTE: in case RT tasks are running, by default the
|
||||
* FREQUENCY_UTIL's utilization can be max OPP.
|
||||
*/
|
||||
cpu_util = effective_cpu_util(cpu, util_cfs, cpu_cap,
|
||||
cpu_util = effective_cpu_util(cpu, util_freq, cpu_cap,
|
||||
FREQUENCY_UTIL, tsk);
|
||||
max_util = max(max_util, cpu_util);
|
||||
}
|
||||
|
@ -6935,7 +7008,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
|
|||
|
||||
/*
|
||||
* This is possible from callers such as attach_tasks(), in which we
|
||||
* unconditionally check_prempt_curr() after an enqueue (which may have
|
||||
* unconditionally check_preempt_curr() after an enqueue (which may have
|
||||
* lead to a throttle). This both saves work and prevents false
|
||||
* next-buddy nomination below.
|
||||
*/
|
||||
|
@ -7392,8 +7465,7 @@ enum migration_type {
|
|||
#define LBF_NEED_BREAK 0x02
|
||||
#define LBF_DST_PINNED 0x04
|
||||
#define LBF_SOME_PINNED 0x08
|
||||
#define LBF_NOHZ_STATS 0x10
|
||||
#define LBF_NOHZ_AGAIN 0x20
|
||||
#define LBF_ACTIVE_LB 0x10
|
||||
|
||||
struct lb_env {
|
||||
struct sched_domain *sd;
|
||||
|
@ -7539,6 +7611,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
|
|||
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
|
||||
return 0;
|
||||
|
||||
/* Disregard pcpu kthreads; they are where they need to be. */
|
||||
if (kthread_is_per_cpu(p))
|
||||
return 0;
|
||||
|
||||
if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) {
|
||||
int cpu;
|
||||
|
||||
|
@ -7551,10 +7627,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
|
|||
* our sched_group. We may want to revisit it if we couldn't
|
||||
* meet load balance goals by pulling other tasks on src_cpu.
|
||||
*
|
||||
* Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
|
||||
* already computed one in current iteration.
|
||||
* Avoid computing new_dst_cpu
|
||||
* - for NEWLY_IDLE
|
||||
* - if we have already computed one in current iteration
|
||||
* - if it's an active balance
|
||||
*/
|
||||
if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
|
||||
if (env->idle == CPU_NEWLY_IDLE ||
|
||||
env->flags & (LBF_DST_PINNED | LBF_ACTIVE_LB))
|
||||
return 0;
|
||||
|
||||
/* Prevent to re-select dst_cpu via env's CPUs: */
|
||||
|
@ -7569,7 +7648,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* Record that we found atleast one task that could run on dst_cpu */
|
||||
/* Record that we found at least one task that could run on dst_cpu */
|
||||
env->flags &= ~LBF_ALL_PINNED;
|
||||
|
||||
if (task_running(env->src_rq, p)) {
|
||||
|
@ -7579,10 +7658,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
|
|||
|
||||
/*
|
||||
* Aggressive migration if:
|
||||
* 1) destination numa is preferred
|
||||
* 2) task is cache cold, or
|
||||
* 3) too many balance attempts have failed.
|
||||
* 1) active balance
|
||||
* 2) destination numa is preferred
|
||||
* 3) task is cache cold, or
|
||||
* 4) too many balance attempts have failed.
|
||||
*/
|
||||
if (env->flags & LBF_ACTIVE_LB)
|
||||
return 1;
|
||||
|
||||
tsk_cache_hot = migrate_degrades_locality(p, env);
|
||||
if (tsk_cache_hot == -1)
|
||||
tsk_cache_hot = task_hot(p, env);
|
||||
|
@ -7659,6 +7742,15 @@ static int detach_tasks(struct lb_env *env)
|
|||
|
||||
lockdep_assert_held(&env->src_rq->lock);
|
||||
|
||||
/*
|
||||
* Source run queue has been emptied by another CPU, clear
|
||||
* LBF_ALL_PINNED flag as we will not test any task.
|
||||
*/
|
||||
if (env->src_rq->nr_running <= 1) {
|
||||
env->flags &= ~LBF_ALL_PINNED;
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (env->imbalance <= 0)
|
||||
return 0;
|
||||
|
||||
|
@ -7708,8 +7800,7 @@ static int detach_tasks(struct lb_env *env)
|
|||
* scheduler fails to find a good waiting task to
|
||||
* migrate.
|
||||
*/
|
||||
|
||||
if ((load >> env->sd->nr_balance_failed) > env->imbalance)
|
||||
if (shr_bound(load, env->sd->nr_balance_failed) > env->imbalance)
|
||||
goto next;
|
||||
|
||||
env->imbalance -= load;
|
||||
|
@ -7854,16 +7945,20 @@ static inline bool others_have_blocked(struct rq *rq)
|
|||
return false;
|
||||
}
|
||||
|
||||
static inline void update_blocked_load_tick(struct rq *rq)
|
||||
{
|
||||
WRITE_ONCE(rq->last_blocked_load_update_tick, jiffies);
|
||||
}
|
||||
|
||||
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked)
|
||||
{
|
||||
rq->last_blocked_load_update_tick = jiffies;
|
||||
|
||||
if (!has_blocked)
|
||||
rq->has_blocked_load = 0;
|
||||
}
|
||||
#else
|
||||
static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) { return false; }
|
||||
static inline bool others_have_blocked(struct rq *rq) { return false; }
|
||||
static inline void update_blocked_load_tick(struct rq *rq) {}
|
||||
static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {}
|
||||
#endif
|
||||
|
||||
|
@ -8024,6 +8119,7 @@ static void update_blocked_averages(int cpu)
|
|||
struct rq_flags rf;
|
||||
|
||||
rq_lock_irqsave(rq, &rf);
|
||||
update_blocked_load_tick(rq);
|
||||
update_rq_clock(rq);
|
||||
|
||||
decayed |= __update_blocked_others(rq, &done);
|
||||
|
@ -8311,26 +8407,6 @@ group_is_overloaded(unsigned int imbalance_pct, struct sg_lb_stats *sgs)
|
|||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
|
||||
* per-CPU capacity than sched_group ref.
|
||||
*/
|
||||
static inline bool
|
||||
group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
|
||||
{
|
||||
return fits_capacity(sg->sgc->min_capacity, ref->sgc->min_capacity);
|
||||
}
|
||||
|
||||
/*
|
||||
* group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
|
||||
* per-CPU capacity_orig than sched_group ref.
|
||||
*/
|
||||
static inline bool
|
||||
group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
|
||||
{
|
||||
return fits_capacity(sg->sgc->max_capacity, ref->sgc->max_capacity);
|
||||
}
|
||||
|
||||
static inline enum
|
||||
group_type group_classify(unsigned int imbalance_pct,
|
||||
struct sched_group *group,
|
||||
|
@ -8354,28 +8430,6 @@ group_type group_classify(unsigned int imbalance_pct,
|
|||
return group_has_spare;
|
||||
}
|
||||
|
||||
static bool update_nohz_stats(struct rq *rq, bool force)
|
||||
{
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
unsigned int cpu = rq->cpu;
|
||||
|
||||
if (!rq->has_blocked_load)
|
||||
return false;
|
||||
|
||||
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
|
||||
return false;
|
||||
|
||||
if (!force && !time_after(jiffies, rq->last_blocked_load_update_tick))
|
||||
return true;
|
||||
|
||||
update_blocked_averages(cpu);
|
||||
|
||||
return rq->has_blocked_load;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
|
||||
* @env: The load balancing environment.
|
||||
|
@ -8397,9 +8451,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
|
|||
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
|
||||
struct rq *rq = cpu_rq(i);
|
||||
|
||||
if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
|
||||
env->flags |= LBF_NOHZ_AGAIN;
|
||||
|
||||
sgs->group_load += cpu_load(rq);
|
||||
sgs->group_util += cpu_util(i);
|
||||
sgs->group_runnable += cpu_runnable(rq);
|
||||
|
@ -8489,7 +8540,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
|
|||
* internally or be covered by avg_load imbalance (eventually).
|
||||
*/
|
||||
if (sgs->group_type == group_misfit_task &&
|
||||
(!group_smaller_max_cpu_capacity(sg, sds->local) ||
|
||||
(!capacity_greater(capacity_of(env->dst_cpu), sg->sgc->max_capacity) ||
|
||||
sds->local_stat.group_type != group_has_spare))
|
||||
return false;
|
||||
|
||||
|
@ -8573,7 +8624,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
|
|||
*/
|
||||
if ((env->sd->flags & SD_ASYM_CPUCAPACITY) &&
|
||||
(sgs->group_type <= group_fully_busy) &&
|
||||
(group_smaller_min_cpu_capacity(sds->local, sg)))
|
||||
(capacity_greater(sg->sgc->min_capacity, capacity_of(env->dst_cpu))))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
|
@ -8940,11 +8991,6 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
|
|||
struct sg_lb_stats tmp_sgs;
|
||||
int sg_status = 0;
|
||||
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
|
||||
env->flags |= LBF_NOHZ_STATS;
|
||||
#endif
|
||||
|
||||
do {
|
||||
struct sg_lb_stats *sgs = &tmp_sgs;
|
||||
int local_group;
|
||||
|
@ -8981,14 +9027,6 @@ next_group:
|
|||
/* Tag domain that child domain prefers tasks go to siblings first */
|
||||
sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
|
||||
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
if ((env->flags & LBF_NOHZ_AGAIN) &&
|
||||
cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd))) {
|
||||
|
||||
WRITE_ONCE(nohz.next_blocked,
|
||||
jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD));
|
||||
}
|
||||
#endif
|
||||
|
||||
if (env->sd->flags & SD_NUMA)
|
||||
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
|
||||
|
@ -9386,7 +9424,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
|
|||
* average load.
|
||||
*/
|
||||
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
|
||||
capacity_of(env->dst_cpu) < capacity &&
|
||||
!capacity_greater(capacity_of(env->dst_cpu), capacity) &&
|
||||
nr_running == 1)
|
||||
continue;
|
||||
|
||||
|
@ -9676,7 +9714,7 @@ more_balance:
|
|||
* load to given_cpu. In rare situations, this may cause
|
||||
* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
|
||||
* _independently_ and at _same_ time to move some load to
|
||||
* given_cpu) causing exceess load to be moved to given_cpu.
|
||||
* given_cpu) causing excess load to be moved to given_cpu.
|
||||
* This however should not happen so much in practice and
|
||||
* moreover subsequent load balance cycles should correct the
|
||||
* excess load moved.
|
||||
|
@ -9776,9 +9814,6 @@ more_balance:
|
|||
active_load_balance_cpu_stop, busiest,
|
||||
&busiest->active_balance_work);
|
||||
}
|
||||
|
||||
/* We've kicked active balancing, force task migration. */
|
||||
sd->nr_balance_failed = sd->cache_nice_tries+1;
|
||||
}
|
||||
} else {
|
||||
sd->nr_balance_failed = 0;
|
||||
|
@ -9820,7 +9855,7 @@ out_one_pinned:
|
|||
/*
|
||||
* newidle_balance() disregards balance intervals, so we could
|
||||
* repeatedly reach this code, which would lead to balance_interval
|
||||
* skyrocketting in a short amount of time. Skip the balance_interval
|
||||
* skyrocketing in a short amount of time. Skip the balance_interval
|
||||
* increase logic to avoid that.
|
||||
*/
|
||||
if (env.idle == CPU_NEWLY_IDLE)
|
||||
|
@ -9928,13 +9963,7 @@ static int active_load_balance_cpu_stop(void *data)
|
|||
.src_cpu = busiest_rq->cpu,
|
||||
.src_rq = busiest_rq,
|
||||
.idle = CPU_IDLE,
|
||||
/*
|
||||
* can_migrate_task() doesn't need to compute new_dst_cpu
|
||||
* for active balancing. Since we have CPU_IDLE, but no
|
||||
* @dst_grpmask we need to make that test go away with lying
|
||||
* about DST_PINNED.
|
||||
*/
|
||||
.flags = LBF_DST_PINNED,
|
||||
.flags = LBF_ACTIVE_LB,
|
||||
};
|
||||
|
||||
schedstat_inc(sd->alb_count);
|
||||
|
@ -10061,22 +10090,9 @@ out:
|
|||
* When the cpu is attached to null domain for ex, it will not be
|
||||
* updated.
|
||||
*/
|
||||
if (likely(update_next_balance)) {
|
||||
if (likely(update_next_balance))
|
||||
rq->next_balance = next_balance;
|
||||
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
/*
|
||||
* If this CPU has been elected to perform the nohz idle
|
||||
* balance. Other idle CPUs have already rebalanced with
|
||||
* nohz_idle_balance() and nohz.next_balance has been
|
||||
* updated accordingly. This CPU is now running the idle load
|
||||
* balance for itself and we need to update the
|
||||
* nohz.next_balance accordingly.
|
||||
*/
|
||||
if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
|
||||
nohz.next_balance = rq->next_balance;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
static inline int on_null_domain(struct rq *rq)
|
||||
|
@ -10368,14 +10384,30 @@ out:
|
|||
WRITE_ONCE(nohz.has_blocked, 1);
|
||||
}
|
||||
|
||||
static bool update_nohz_stats(struct rq *rq)
|
||||
{
|
||||
unsigned int cpu = rq->cpu;
|
||||
|
||||
if (!rq->has_blocked_load)
|
||||
return false;
|
||||
|
||||
if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
|
||||
return false;
|
||||
|
||||
if (!time_after(jiffies, READ_ONCE(rq->last_blocked_load_update_tick)))
|
||||
return true;
|
||||
|
||||
update_blocked_averages(cpu);
|
||||
|
||||
return rq->has_blocked_load;
|
||||
}
|
||||
|
||||
/*
|
||||
* Internal function that runs load balance for all idle cpus. The load balance
|
||||
* can be a simple update of blocked load or a complete load balance with
|
||||
* tasks movement depending of flags.
|
||||
* The function returns false if the loop has stopped before running
|
||||
* through all idle CPUs.
|
||||
*/
|
||||
static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
|
||||
static void _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
|
||||
enum cpu_idle_type idle)
|
||||
{
|
||||
/* Earliest time when we have to do rebalance again */
|
||||
|
@ -10385,7 +10417,6 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
|
|||
int update_next_balance = 0;
|
||||
int this_cpu = this_rq->cpu;
|
||||
int balance_cpu;
|
||||
int ret = false;
|
||||
struct rq *rq;
|
||||
|
||||
SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);
|
||||
|
@ -10406,8 +10437,12 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
|
|||
*/
|
||||
smp_mb();
|
||||
|
||||
for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
|
||||
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
|
||||
/*
|
||||
* Start with the next CPU after this_cpu so we will end with this_cpu and let a
|
||||
* chance for other idle cpu to pull load.
|
||||
*/
|
||||
for_each_cpu_wrap(balance_cpu, nohz.idle_cpus_mask, this_cpu+1) {
|
||||
if (!idle_cpu(balance_cpu))
|
||||
continue;
|
||||
|
||||
/*
|
||||
|
@ -10422,7 +10457,7 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
|
|||
|
||||
rq = cpu_rq(balance_cpu);
|
||||
|
||||
has_blocked_load |= update_nohz_stats(rq, true);
|
||||
has_blocked_load |= update_nohz_stats(rq);
|
||||
|
||||
/*
|
||||
* If time for next balance is due,
|
||||
|
@ -10453,27 +10488,13 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
|
|||
if (likely(update_next_balance))
|
||||
nohz.next_balance = next_balance;
|
||||
|
||||
/* Newly idle CPU doesn't need an update */
|
||||
if (idle != CPU_NEWLY_IDLE) {
|
||||
update_blocked_averages(this_cpu);
|
||||
has_blocked_load |= this_rq->has_blocked_load;
|
||||
}
|
||||
|
||||
if (flags & NOHZ_BALANCE_KICK)
|
||||
rebalance_domains(this_rq, CPU_IDLE);
|
||||
|
||||
WRITE_ONCE(nohz.next_blocked,
|
||||
now + msecs_to_jiffies(LOAD_AVG_PERIOD));
|
||||
|
||||
/* The full idle balance loop has been done */
|
||||
ret = true;
|
||||
|
||||
abort:
|
||||
/* There is still blocked load, enable periodic update */
|
||||
if (has_blocked_load)
|
||||
WRITE_ONCE(nohz.has_blocked, 1);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -10497,6 +10518,24 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
|
|||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if we need to run the ILB for updating blocked load before entering
|
||||
* idle state.
|
||||
*/
|
||||
void nohz_run_idle_balance(int cpu)
|
||||
{
|
||||
unsigned int flags;
|
||||
|
||||
flags = atomic_fetch_andnot(NOHZ_NEWILB_KICK, nohz_flags(cpu));
|
||||
|
||||
/*
|
||||
* Update the blocked load only if no SCHED_SOFTIRQ is about to happen
|
||||
* (ie NOHZ_STATS_KICK set) and will do the same.
|
||||
*/
|
||||
if ((flags == NOHZ_NEWILB_KICK) && !need_resched())
|
||||
_nohz_idle_balance(cpu_rq(cpu), NOHZ_STATS_KICK, CPU_IDLE);
|
||||
}
|
||||
|
||||
static void nohz_newidle_balance(struct rq *this_rq)
|
||||
{
|
||||
int this_cpu = this_rq->cpu;
|
||||
|
@ -10517,16 +10556,11 @@ static void nohz_newidle_balance(struct rq *this_rq)
|
|||
time_before(jiffies, READ_ONCE(nohz.next_blocked)))
|
||||
return;
|
||||
|
||||
raw_spin_unlock(&this_rq->lock);
|
||||
/*
|
||||
* This CPU is going to be idle and blocked load of idle CPUs
|
||||
* need to be updated. Run the ilb locally as it is a good
|
||||
* candidate for ilb instead of waking up another idle CPU.
|
||||
* Kick an normal ilb if we failed to do the update.
|
||||
* Set the need to trigger ILB in order to update blocked load
|
||||
* before entering idle state.
|
||||
*/
|
||||
if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE))
|
||||
kick_ilb(NOHZ_STATS_KICK);
|
||||
raw_spin_lock(&this_rq->lock);
|
||||
atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu));
|
||||
}
|
||||
|
||||
#else /* !CONFIG_NO_HZ_COMMON */
|
||||
|
@ -10587,8 +10621,6 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
|
|||
update_next_balance(sd, &next_balance);
|
||||
rcu_read_unlock();
|
||||
|
||||
nohz_newidle_balance(this_rq);
|
||||
|
||||
goto out;
|
||||
}
|
||||
|
||||
|
@ -10635,7 +10667,6 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
|
|||
if (curr_cost > this_rq->max_idle_balance_cost)
|
||||
this_rq->max_idle_balance_cost = curr_cost;
|
||||
|
||||
out:
|
||||
/*
|
||||
* While browsing the domains, we released the rq lock, a task could
|
||||
* have been enqueued in the meantime. Since we're not going idle,
|
||||
|
@ -10644,16 +10675,19 @@ out:
|
|||
if (this_rq->cfs.h_nr_running && !pulled_task)
|
||||
pulled_task = 1;
|
||||
|
||||
/* Move the next balance forward */
|
||||
if (time_after(this_rq->next_balance, next_balance))
|
||||
this_rq->next_balance = next_balance;
|
||||
|
||||
/* Is there a task of a high priority class? */
|
||||
if (this_rq->nr_running != this_rq->cfs.h_nr_running)
|
||||
pulled_task = -1;
|
||||
|
||||
out:
|
||||
/* Move the next balance forward */
|
||||
if (time_after(this_rq->next_balance, next_balance))
|
||||
this_rq->next_balance = next_balance;
|
||||
|
||||
if (pulled_task)
|
||||
this_rq->idle_stamp = 0;
|
||||
else
|
||||
nohz_newidle_balance(this_rq);
|
||||
|
||||
rq_repin_lock(this_rq, rf);
|
||||
|
||||
|
|
|
@ -27,7 +27,7 @@ SCHED_FEAT(NEXT_BUDDY, false)
|
|||
SCHED_FEAT(LAST_BUDDY, true)
|
||||
|
||||
/*
|
||||
* Consider buddies to be cache hot, decreases the likelyness of a
|
||||
* Consider buddies to be cache hot, decreases the likeliness of a
|
||||
* cache buddy being migrated away, increases cache locality.
|
||||
*/
|
||||
SCHED_FEAT(CACHE_HOT_BUDDY, true)
|
||||
|
@ -90,3 +90,8 @@ SCHED_FEAT(WA_BIAS, true)
|
|||
*/
|
||||
SCHED_FEAT(UTIL_EST, true)
|
||||
SCHED_FEAT(UTIL_EST_FASTUP, true)
|
||||
|
||||
SCHED_FEAT(LATENCY_WARN, false)
|
||||
|
||||
SCHED_FEAT(ALT_PERIOD, true)
|
||||
SCHED_FEAT(BASE_SLICE, true)
|
||||
|
|
|
@ -163,7 +163,7 @@ static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
|
|||
*
|
||||
* NOTE: no locks or semaphores should be used here
|
||||
*
|
||||
* On archs that support TIF_POLLING_NRFLAG, is called with polling
|
||||
* On architectures that support TIF_POLLING_NRFLAG, is called with polling
|
||||
* set, and it returns with polling set. If it ever stops polling, it
|
||||
* must clear the polling bit.
|
||||
*/
|
||||
|
@ -199,7 +199,7 @@ static void cpuidle_idle_call(void)
|
|||
* Suspend-to-idle ("s2idle") is a system state in which all user space
|
||||
* has been frozen, all I/O devices have been suspended and the only
|
||||
* activity happens here and in interrupts (if any). In that case bypass
|
||||
* the cpuidle governor and go stratight for the deepest idle state
|
||||
* the cpuidle governor and go straight for the deepest idle state
|
||||
* available. Possibly also suspend the local tick and the entire
|
||||
* timekeeping to prevent timer interrupts from kicking us out of idle
|
||||
* until a proper wakeup interrupt happens.
|
||||
|
@ -261,6 +261,12 @@ exit_idle:
|
|||
static void do_idle(void)
|
||||
{
|
||||
int cpu = smp_processor_id();
|
||||
|
||||
/*
|
||||
* Check if we need to update blocked load
|
||||
*/
|
||||
nohz_run_idle_balance(cpu);
|
||||
|
||||
/*
|
||||
* If the arch has a polling bit, we maintain an invariant:
|
||||
*
|
||||
|
|
|
@ -189,7 +189,7 @@ calc_load_n(unsigned long load, unsigned long exp,
|
|||
* w:0 1 1 0 0 1 1 0 0
|
||||
*
|
||||
* This ensures we'll fold the old NO_HZ contribution in this window while
|
||||
* accumlating the new one.
|
||||
* accumulating the new one.
|
||||
*
|
||||
* - When we wake up from NO_HZ during the window, we push up our
|
||||
* contribution, since we effectively move our sample point to a known
|
||||
|
|
|
@ -133,7 +133,7 @@ accumulate_sum(u64 delta, struct sched_avg *sa,
|
|||
* runnable = running = 0;
|
||||
*
|
||||
* clause from ___update_load_sum(); this results in
|
||||
* the below usage of @contrib to dissapear entirely,
|
||||
* the below usage of @contrib to disappear entirely,
|
||||
* so no point in calculating it.
|
||||
*/
|
||||
contrib = __accumulate_pelt_segments(periods,
|
||||
|
|
|
@ -130,7 +130,7 @@ static inline void update_idle_rq_clock_pelt(struct rq *rq)
|
|||
* Reflecting stolen time makes sense only if the idle
|
||||
* phase would be present at max capacity. As soon as the
|
||||
* utilization of a rq has reached the maximum value, it is
|
||||
* considered as an always runnig rq without idle time to
|
||||
* considered as an always running rq without idle time to
|
||||
* steal. This potential idle time is considered as lost in
|
||||
* this case. We keep track of this lost idle time compare to
|
||||
* rq's clock_task.
|
||||
|
|
|
@ -34,7 +34,10 @@
|
|||
* delayed on that resource such that nobody is advancing and the CPU
|
||||
* goes idle. This leaves both workload and CPU unproductive.
|
||||
*
|
||||
* (Naturally, the FULL state doesn't exist for the CPU resource.)
|
||||
* Naturally, the FULL state doesn't exist for the CPU resource at the
|
||||
* system level, but exist at the cgroup level, means all non-idle tasks
|
||||
* in a cgroup are delayed on the CPU resource which used by others outside
|
||||
* of the cgroup or throttled by the cgroup cpu.max configuration.
|
||||
*
|
||||
* SOME = nr_delayed_tasks != 0
|
||||
* FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
|
||||
|
@ -59,7 +62,7 @@
|
|||
* states, we would have to conclude a CPU SOME pressure number of
|
||||
* 100%, since *somebody* is waiting on a runqueue at all
|
||||
* times. However, that is clearly not the amount of contention the
|
||||
* workload is experiencing: only one out of 256 possible exceution
|
||||
* workload is experiencing: only one out of 256 possible execution
|
||||
* threads will be contended at any given time, or about 0.4%.
|
||||
*
|
||||
* Conversely, consider a scenario of 4 tasks and 4 CPUs where at any
|
||||
|
@ -73,7 +76,7 @@
|
|||
* we have to base our calculation on the number of non-idle tasks in
|
||||
* conjunction with the number of available CPUs, which is the number
|
||||
* of potential execution threads. SOME becomes then the proportion of
|
||||
* delayed tasks to possibe threads, and FULL is the share of possible
|
||||
* delayed tasks to possible threads, and FULL is the share of possible
|
||||
* threads that are unproductive due to delays:
|
||||
*
|
||||
* threads = min(nr_nonidle_tasks, nr_cpus)
|
||||
|
@@ -216,15 +219,17 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 {
	switch (state) {
	case PSI_IO_SOME:
-		return tasks[NR_IOWAIT];
+		return unlikely(tasks[NR_IOWAIT]);
	case PSI_IO_FULL:
-		return tasks[NR_IOWAIT] && !tasks[NR_RUNNING];
+		return unlikely(tasks[NR_IOWAIT] && !tasks[NR_RUNNING]);
	case PSI_MEM_SOME:
-		return tasks[NR_MEMSTALL];
+		return unlikely(tasks[NR_MEMSTALL]);
	case PSI_MEM_FULL:
-		return tasks[NR_MEMSTALL] && !tasks[NR_RUNNING];
+		return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
	case PSI_CPU_SOME:
-		return tasks[NR_RUNNING] > tasks[NR_ONCPU];
+		return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+	case PSI_CPU_FULL:
+		return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
	case PSI_NONIDLE:
		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
			tasks[NR_RUNNING];
@@ -441,7 +446,7 @@ static void psi_avgs_work(struct work_struct *work)
	mutex_unlock(&group->avgs_lock);
 }
 
-/* Trigger tracking window manupulations */
+/* Trigger tracking window manipulations */
 static void window_reset(struct psi_window *win, u64 now, u64 value,
			 u64 prev_growth)
 {
@@ -639,13 +644,10 @@ static void poll_timer_fn(struct timer_list *t)
	wake_up_interruptible(&group->poll_wait);
 }
 
-static void record_times(struct psi_group_cpu *groupc, int cpu,
-			 bool memstall_tick)
+static void record_times(struct psi_group_cpu *groupc, u64 now)
 {
	u32 delta;
-	u64 now;
 
-	now = cpu_clock(cpu);
	delta = now - groupc->state_start;
	groupc->state_start = now;
 
@@ -659,34 +661,20 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
		groupc->times[PSI_MEM_SOME] += delta;
		if (groupc->state_mask & (1 << PSI_MEM_FULL))
			groupc->times[PSI_MEM_FULL] += delta;
-		else if (memstall_tick) {
-			u32 sample;
-			/*
-			 * Since we care about lost potential, a
-			 * memstall is FULL when there are no other
-			 * working tasks, but also when the CPU is
-			 * actively reclaiming and nothing productive
-			 * could run even if it were runnable.
-			 *
-			 * When the timer tick sees a reclaiming CPU,
-			 * regardless of runnable tasks, sample a FULL
-			 * tick (or less if it hasn't been a full tick
-			 * since the last state change).
-			 */
-			sample = min(delta, (u32)jiffies_to_nsecs(1));
-			groupc->times[PSI_MEM_FULL] += sample;
-		}
	}
 
-	if (groupc->state_mask & (1 << PSI_CPU_SOME))
+	if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
		groupc->times[PSI_CPU_SOME] += delta;
+		if (groupc->state_mask & (1 << PSI_CPU_FULL))
+			groupc->times[PSI_CPU_FULL] += delta;
+	}
 
	if (groupc->state_mask & (1 << PSI_NONIDLE))
		groupc->times[PSI_NONIDLE] += delta;
 }
 
 static void psi_group_change(struct psi_group *group, int cpu,
-			     unsigned int clear, unsigned int set,
+			     unsigned int clear, unsigned int set, u64 now,
			     bool wake_clock)
 {
	struct psi_group_cpu *groupc;
@@ -706,19 +694,20 @@ static void psi_group_change(struct psi_group *group, int cpu,
	 */
	write_seqcount_begin(&groupc->seq);
 
-	record_times(groupc, cpu, false);
+	record_times(groupc, now);
 
	for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
		if (!(m & (1 << t)))
			continue;
-		if (groupc->tasks[t] == 0 && !psi_bug) {
+		if (groupc->tasks[t]) {
+			groupc->tasks[t]--;
+		} else if (!psi_bug) {
			printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
					cpu, t, groupc->tasks[0],
					groupc->tasks[1], groupc->tasks[2],
					groupc->tasks[3], clear, set);
			psi_bug = 1;
		}
-		groupc->tasks[t]--;
	}
 
	for (t = 0; set; set &= ~(1 << t), t++)
@@ -730,6 +719,18 @@ static void psi_group_change(struct psi_group *group, int cpu,
		if (test_state(groupc->tasks, s))
			state_mask |= (1 << s);
	}
 
+	/*
+	 * Since we care about lost potential, a memstall is FULL
+	 * when there are no other working tasks, but also when
+	 * the CPU is actively reclaiming and nothing productive
+	 * could run even if it were runnable. So when the current
+	 * task in a cgroup is in_memstall, the corresponding groupc
+	 * on that cpu is in PSI_MEM_FULL state.
+	 */
+	if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+		state_mask |= (1 << PSI_MEM_FULL);
+
	groupc->state_mask = state_mask;
 
	write_seqcount_end(&groupc->seq);
@@ -786,12 +787,14 @@ void psi_task_change(struct task_struct *task, int clear, int set)
	struct psi_group *group;
	bool wake_clock = true;
	void *iter = NULL;
+	u64 now;
 
	if (!task->pid)
		return;
 
	psi_flags_change(task, clear, set);
 
+	now = cpu_clock(cpu);
	/*
	 * Periodic aggregation shuts off if there is a period of no
	 * task changes, so we wake it back up if necessary. However,
@@ -804,7 +807,7 @@ void psi_task_change(struct task_struct *task, int clear, int set)
		wake_clock = false;
 
	while ((group = iterate_groups(task, &iter)))
-		psi_group_change(group, cpu, clear, set, wake_clock);
+		psi_group_change(group, cpu, clear, set, now, wake_clock);
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -813,56 +816,61 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
	struct psi_group *group, *common = NULL;
	int cpu = task_cpu(prev);
	void *iter;
+	u64 now = cpu_clock(cpu);
 
	if (next->pid) {
+		bool identical_state;
+
		psi_flags_change(next, 0, TSK_ONCPU);
		/*
-		 * When moving state between tasks, the group that
-		 * contains them both does not change: we can stop
-		 * updating the tree once we reach the first common
-		 * ancestor. Iterate @next's ancestors until we
-		 * encounter @prev's state.
+		 * When switching between tasks that have an identical
+		 * runtime state, the cgroup that contains both tasks
+		 * does not change: we can stop updating the tree once
+		 * we reach the first common ancestor. Iterate @next's
+		 * ancestors only until we encounter @prev's ONCPU.
		 */
+		identical_state = prev->psi_flags == next->psi_flags;
		iter = NULL;
		while ((group = iterate_groups(next, &iter))) {
-			if (per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+			if (identical_state &&
+			    per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
				common = group;
				break;
			}
 
-			psi_group_change(group, cpu, 0, TSK_ONCPU, true);
+			psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
		}
	}
 
-	/*
-	 * If this is a voluntary sleep, dequeue will have taken care
-	 * of the outgoing TSK_ONCPU alongside TSK_RUNNING already. We
-	 * only need to deal with it during preemption.
-	 */
-	if (sleep)
-		return;
-
	if (prev->pid) {
-		psi_flags_change(prev, TSK_ONCPU, 0);
+		int clear = TSK_ONCPU, set = 0;
+
+		/*
+		 * When we're going to sleep, psi_dequeue() lets us handle
+		 * TSK_RUNNING and TSK_IOWAIT here, where we can combine it
+		 * with TSK_ONCPU and save walking common ancestors twice.
+		 */
+		if (sleep) {
+			clear |= TSK_RUNNING;
+			if (prev->in_iowait)
+				set |= TSK_IOWAIT;
+		}
+
+		psi_flags_change(prev, clear, set);
 
		iter = NULL;
		while ((group = iterate_groups(prev, &iter)) && group != common)
-			psi_group_change(group, cpu, TSK_ONCPU, 0, true);
-	}
-}
+			psi_group_change(group, cpu, clear, set, now, true);
 
-void psi_memstall_tick(struct task_struct *task, int cpu)
-{
-	struct psi_group *group;
-	void *iter = NULL;
-
-	while ((group = iterate_groups(task, &iter))) {
-		struct psi_group_cpu *groupc;
-
-		groupc = per_cpu_ptr(group->pcpu, cpu);
-		write_seqcount_begin(&groupc->seq);
-		record_times(groupc, cpu, true);
-		write_seqcount_end(&groupc->seq);
+		/*
+		 * TSK_ONCPU is handled up to the common ancestor. If we're tasked
+		 * with dequeuing too, finish that for the rest of the hierarchy.
+		 */
+		if (sleep) {
+			clear &= ~TSK_ONCPU;
+			for (; group; group = iterate_groups(prev, &iter))
+				psi_group_change(group, cpu, clear, set, now, true);
+		}
	}
 }
@@ -1018,7 +1026,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
	group->avg_next_update = update_averages(group, now);
	mutex_unlock(&group->avgs_lock);
 
-	for (full = 0; full < 2 - (res == PSI_CPU); full++) {
+	for (full = 0; full < 2; full++) {
		unsigned long avg[3];
		u64 total;
		int w;
@@ -1054,19 +1062,27 @@ static int psi_cpu_show(struct seq_file *m, void *v)
	return psi_show(m, &psi_system, PSI_CPU);
 }
 
+static int psi_open(struct file *file, int (*psi_show)(struct seq_file *, void *))
+{
+	if (file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	return single_open(file, psi_show, NULL);
+}
+
 static int psi_io_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, psi_io_show, NULL);
+	return psi_open(file, psi_io_show);
 }
 
 static int psi_memory_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, psi_memory_show, NULL);
+	return psi_open(file, psi_memory_show);
 }
 
 static int psi_cpu_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, psi_cpu_show, NULL);
+	return psi_open(file, psi_cpu_show);
 }
 
 struct psi_trigger *psi_trigger_create(struct psi_group *group,
@@ -1346,9 +1362,9 @@ static int __init psi_proc_init(void)
 {
	if (psi_enable) {
		proc_mkdir("pressure", NULL);
-		proc_create("pressure/io", 0, NULL, &psi_io_proc_ops);
-		proc_create("pressure/memory", 0, NULL, &psi_memory_proc_ops);
-		proc_create("pressure/cpu", 0, NULL, &psi_cpu_proc_ops);
+		proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
+		proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
+		proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
	}
	return 0;
 }
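
With psi_open() rejecting writers that lack CAP_SYS_RESOURCE and the pressure files now created 0666, unprivileged readers keep working while trigger registration stays privileged. A rough userspace sketch of that flow (the trigger string format and the POLLPRI notification follow the PSI documentation rather than this diff; error handling is trimmed):

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* O_RDWR now requires CAP_SYS_RESOURCE; O_RDONLY does not */
	int fd = open("/proc/pressure/io", O_RDWR | O_NONBLOCK);

	if (fd < 0) {
		perror("open /proc/pressure/io");
		return 1;
	}

	/* wake up if IO SOME stall time exceeds 150ms within a 1s window */
	const char trig[] = "some 150000 1000000";

	if (write(fd, trig, strlen(trig) + 1) < 0) {
		perror("write trigger");
		return 1;
	}

	struct pollfd pfd = { .fd = fd, .events = POLLPRI };

	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLPRI))
		puts("io pressure threshold crossed");

	close(fd);
	return 0;
}
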
@@ -700,7 +700,7 @@ static void do_balance_runtime(struct rt_rq *rt_rq)
		/*
		 * Either all rqs have inf runtime and there's nothing to steal
		 * or __disable_runtime() below sets a specific rq to inf to
-		 * indicate its been disabled and disalow stealing.
+		 * indicate its been disabled and disallow stealing.
		 */
		if (iter->rt_runtime == RUNTIME_INF)
			goto next;
@@ -1998,7 +1998,7 @@ static void push_rt_tasks(struct rq *rq)
 *
 * Each root domain has its own irq work function that can iterate over
 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
- * tassk must be checked if there's one or many CPUs that are lowering
+ * task must be checked if there's one or many CPUs that are lowering
 * their priority, there's a single irq work iterator that will try to
 * push off RT tasks that are waiting to run.
 *
@@ -2216,7 +2216,7 @@ static void pull_rt_task(struct rq *this_rq)
			/*
			 * There's a chance that p is higher in priority
			 * than what's currently running on its CPU.
-			 * This is just that p is wakeing up and hasn't
+			 * This is just that p is waking up and hasn't
			 * had a chance to schedule. We only pull
			 * p if it is lower in priority than the
			 * current task on the run queue
@@ -36,6 +36,7 @@
 #include <uapi/linux/sched/types.h>
 
 #include <linux/binfmts.h>
+#include <linux/bitops.h>
 #include <linux/blkdev.h>
 #include <linux/compat.h>
 #include <linux/context_tracking.h>
@@ -57,6 +58,7 @@
 #include <linux/prefetch.h>
 #include <linux/profile.h>
 #include <linux/psi.h>
+#include <linux/ratelimit.h>
 #include <linux/rcupdate_wait.h>
 #include <linux/security.h>
 #include <linux/stop_machine.h>
@@ -204,6 +206,13 @@ static inline void update_avg(u64 *avg, u64 sample)
	*avg += diff / 8;
 }
 
+/*
+ * Shifting a value by an exponent greater *or equal* to the size of said value
+ * is UB; cap at size-1.
+ */
+#define shr_bound(val, shift) \
+	(val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1))
+
 /*
  * !! For sched_setattr_nocheck() (kernel) only !!
  *
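
The shr_bound() macro added above exists because shifting a value by a count greater than or equal to its width is undefined behaviour, so the shift count is clamped to width - 1. A standalone illustration of the same clamp, using plain fixed-width types instead of the kernel's min_t()/BITS_PER_TYPE() helpers:

#include <stdint.h>
#include <stdio.h>

/* clamp the shift count to 31 == BITS_PER_TYPE(uint32_t) - 1 */
static uint32_t shr_bound32(uint32_t val, unsigned int shift)
{
	unsigned int capped = shift < 31 ? shift : 31;

	return val >> capped;
}

int main(void)
{
	printf("%#x\n", shr_bound32(0x80000000u, 4));	/* 0x8000000 */
	printf("%#x\n", shr_bound32(0x80000000u, 40));	/* clamped to >> 31, prints 0x1 */
	return 0;
}
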
@@ -963,6 +972,11 @@ struct rq {
 
	atomic_t		nr_iowait;
 
+#ifdef CONFIG_SCHED_DEBUG
+	u64			last_seen_need_resched_ns;
+	int			ticks_without_resched;
+#endif
+
 #ifdef CONFIG_MEMBARRIER
	int membarrier_state;
 #endif
@@ -975,7 +989,6 @@ struct rq {
	unsigned long		cpu_capacity_orig;
 
	struct callback_head	*balance_callback;
-	unsigned char		balance_push;
 
	unsigned char		nohz_idle_balance;
	unsigned char		idle_balance;
@@ -1147,7 +1160,7 @@ static inline u64 __rq_clock_broken(struct rq *rq)
 *
 *	if (rq-clock_update_flags >= RQCF_UPDATED)
 *
- * to check if %RQCF_UPADTED is set. It'll never be shifted more than
+ * to check if %RQCF_UPDATED is set. It'll never be shifted more than
 * one position though, because the next rq_unpin_lock() will shift it
 * back.
 */
@@ -1206,7 +1219,7 @@ static inline void rq_clock_skip_update(struct rq *rq)
 
 /*
 * See rt task throttling, which is the only time a skip
- * request is cancelled.
+ * request is canceled.
 */
 static inline void rq_clock_cancel_skipupdate(struct rq *rq)
 {
@@ -1545,22 +1558,20 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
 
 extern int group_balance_cpu(struct sched_group *sg);
 
-#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
-void register_sched_domain_sysctl(void);
+#ifdef CONFIG_SCHED_DEBUG
+void update_sched_domain_debugfs(void);
 void dirty_sched_domain_sysctl(int cpu);
-void unregister_sched_domain_sysctl(void);
 #else
-static inline void register_sched_domain_sysctl(void)
+static inline void update_sched_domain_debugfs(void)
 {
 }
 static inline void dirty_sched_domain_sysctl(int cpu)
 {
 }
-static inline void unregister_sched_domain_sysctl(void)
-{
-}
 #endif
 
+extern int sched_update_scaling(void);
+
 extern void flush_smp_call_function_from_idle(void);
 
 #else /* !CONFIG_SMP: */
@@ -1853,7 +1864,7 @@ struct sched_class {
 
	/*
	 * The switched_from() call is allowed to drop rq->lock, therefore we
-	 * cannot assume the switched_from/switched_to pair is serliazed by
+	 * cannot assume the switched_from/switched_to pair is serialized by
	 * rq->lock. They are however serialized by p->pi_lock.
	 */
	void (*switched_from)(struct rq *this_rq, struct task_struct *task);
@@ -2358,7 +2369,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
 extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
 
 #ifdef CONFIG_SCHED_DEBUG
-extern bool sched_debug_enabled;
+extern bool sched_debug_verbose;
 
 extern void print_cfs_stats(struct seq_file *m, int cpu);
 extern void print_rt_stats(struct seq_file *m, int cpu);
@@ -2366,6 +2377,8 @@ extern void print_dl_stats(struct seq_file *m, int cpu);
 extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
 extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+
+extern void resched_latency_warn(int cpu, u64 latency);
 #ifdef CONFIG_NUMA_BALANCING
 extern void
 show_numa_stats(struct task_struct *p, struct seq_file *m);
@@ -2373,6 +2386,8 @@ extern void
 print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
		 unsigned long tpf, unsigned long gsf, unsigned long gpf);
 #endif /* CONFIG_NUMA_BALANCING */
+#else
+static inline void resched_latency_warn(int cpu, u64 latency) {}
 #endif /* CONFIG_SCHED_DEBUG */
 
 extern void init_cfs_rq(struct cfs_rq *cfs_rq);
@@ -2385,9 +2400,11 @@ extern void cfs_bandwidth_usage_dec(void);
 #ifdef CONFIG_NO_HZ_COMMON
 #define NOHZ_BALANCE_KICK_BIT	0
 #define NOHZ_STATS_KICK_BIT	1
+#define NOHZ_NEWILB_KICK_BIT	2
 
 #define NOHZ_BALANCE_KICK	BIT(NOHZ_BALANCE_KICK_BIT)
 #define NOHZ_STATS_KICK		BIT(NOHZ_STATS_KICK_BIT)
+#define NOHZ_NEWILB_KICK	BIT(NOHZ_NEWILB_KICK_BIT)
 
 #define NOHZ_KICK_MASK	(NOHZ_BALANCE_KICK | NOHZ_STATS_KICK)
 
@@ -2398,6 +2415,11 @@ extern void nohz_balance_exit_idle(struct rq *rq);
 static inline void nohz_balance_exit_idle(struct rq *rq) { }
 #endif
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void nohz_run_idle_balance(int cpu);
+#else
+static inline void nohz_run_idle_balance(int cpu) { }
+#endif
 
 #ifdef CONFIG_SMP
 static inline
@@ -2437,7 +2459,7 @@ DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
 
 /*
 * Returns the irqtime minus the softirq time computed by ksoftirqd.
- * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * Otherwise ksoftirqd's sum_exec_runtime is subtracted its own runtime
 * and never move forward.
 */
 static inline u64 irq_time_read(int cpu)
@@ -2718,5 +2740,12 @@ static inline bool is_per_cpu_kthread(struct task_struct *p)
 }
 #endif
 
-void swake_up_all_locked(struct swait_queue_head *q);
-void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+extern void swake_up_all_locked(struct swait_queue_head *q);
+extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait);
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+extern int preempt_dynamic_mode;
+extern int sched_dynamic_mode(const char *str);
+extern void sched_dynamic_update(int mode);
+#endif
@@ -74,7 +74,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
 }
 
 /*
- * This itererator needs some explanation.
+ * This iterator needs some explanation.
 * It returns 1 for the header position.
 * This means 2 is cpu 0.
 * In a hotplugged system some CPUs, including cpu 0, may be missing so we have
@@ -84,28 +84,24 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
 
 static inline void psi_dequeue(struct task_struct *p, bool sleep)
 {
-	int clear = TSK_RUNNING, set = 0;
+	int clear = TSK_RUNNING;
 
	if (static_branch_likely(&psi_disabled))
		return;
 
-	if (!sleep) {
-		if (p->in_memstall)
-			clear |= TSK_MEMSTALL;
-	} else {
-		/*
-		 * When a task sleeps, schedule() dequeues it before
-		 * switching to the next one. Merge the clearing of
-		 * TSK_RUNNING and TSK_ONCPU to save an unnecessary
-		 * psi_task_change() call in psi_sched_switch().
-		 */
-		clear |= TSK_ONCPU;
+	/*
+	 * A voluntary sleep is a dequeue followed by a task switch. To
+	 * avoid walking all ancestors twice, psi_task_switch() handles
+	 * TSK_RUNNING and TSK_IOWAIT for us when it moves TSK_ONCPU.
+	 * Do nothing here.
+	 */
+	if (sleep)
+		return;
 
-		if (p->in_iowait)
-			set |= TSK_IOWAIT;
-	}
+	if (p->in_memstall)
+		clear |= TSK_MEMSTALL;
 
-	psi_task_change(p, clear, set);
+	psi_task_change(p, clear, 0);
 }
 
 static inline void psi_ttwu_dequeue(struct task_struct *p)
@@ -144,14 +140,6 @@ static inline void psi_sched_switch(struct task_struct *prev,
	psi_task_switch(prev, next, sleep);
 }
 
-static inline void psi_task_tick(struct rq *rq)
-{
-	if (static_branch_likely(&psi_disabled))
-		return;
-
-	if (unlikely(rq->curr->in_memstall))
-		psi_memstall_tick(rq->curr, cpu_of(rq));
-}
 #else /* CONFIG_PSI */
 static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
 static inline void psi_dequeue(struct task_struct *p, bool sleep) {}
@@ -159,7 +147,6 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
				    struct task_struct *next,
				    bool sleep) {}
-static inline void psi_task_tick(struct rq *rq) {}
 #endif /* CONFIG_PSI */
 
 #ifdef CONFIG_SCHED_INFO
@@ -14,15 +14,15 @@ static cpumask_var_t sched_domains_tmpmask2;
 
 static int __init sched_debug_setup(char *str)
 {
-	sched_debug_enabled = true;
+	sched_debug_verbose = true;
 
	return 0;
 }
-early_param("sched_debug", sched_debug_setup);
+early_param("sched_verbose", sched_debug_setup);
 
 static inline bool sched_debug(void)
 {
-	return sched_debug_enabled;
+	return sched_debug_verbose;
 }
 
 #define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
@@ -131,7 +131,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 {
	int level = 0;
 
-	if (!sched_debug_enabled)
+	if (!sched_debug_verbose)
		return;
 
	if (!sd) {
@@ -152,7 +152,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 }
 #else /* !CONFIG_SCHED_DEBUG */
 
-# define sched_debug_enabled 0
+# define sched_debug_verbose 0
 # define sched_domain_debug(sd, cpu) do { } while (0)
 static inline bool sched_debug(void)
 {
@@ -723,35 +723,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
	for (tmp = sd; tmp; tmp = tmp->parent)
		numa_distance += !!(tmp->flags & SD_NUMA);
 
-	/*
-	 * FIXME: Diameter >=3 is misrepresented.
-	 *
-	 * Smallest diameter=3 topology is:
-	 *
-	 *   node   0   1   2   3
-	 *     0:  10  20  30  40
-	 *     1:  20  10  20  30
-	 *     2:  30  20  10  20
-	 *     3:  40  30  20  10
-	 *
-	 *   0 --- 1 --- 2 --- 3
-	 *
-	 * NUMA-3       0-3             N/A             N/A             0-3
-	 *  groups:     {0-2},{1-3}                                     {1-3},{0-2}
-	 *
-	 * NUMA-2       0-2             0-3             0-3             1-3
-	 *  groups:     {0-1},{1-3}     {0-2},{2-3}     {1-3},{0-1}     {2-3},{0-2}
-	 *
-	 * NUMA-1       0-1             0-2             1-3             2-3
-	 *  groups:     {0},{1}         {1},{2},{0}     {2},{3},{1}     {3},{2}
-	 *
-	 * NUMA-0       0               1               2               3
-	 *
-	 * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
-	 * group span isn't a subset of the domain span.
-	 */
-	WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n");
-
	sched_domain_debug(sd, cpu);
 
	rq_attach_root(rq, rd);
@@ -963,7 +934,7 @@ static void init_overlap_sched_group(struct sched_domain *sd,
	int cpu;
 
	build_balance_mask(sd, sg, mask);
-	cpu = cpumask_first_and(sched_group_span(sg), mask);
+	cpu = cpumask_first(mask);
 
	sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
	if (atomic_inc_return(&sg->sgc->ref) == 1)
@@ -982,6 +953,31 @@ static void init_overlap_sched_group(struct sched_domain *sd,
	sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
 }
 
+static struct sched_domain *
+find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling)
+{
+	/*
+	 * The proper descendant would be the one whose child won't span out
+	 * of sd
+	 */
+	while (sibling->child &&
+	       !cpumask_subset(sched_domain_span(sibling->child),
+			       sched_domain_span(sd)))
+		sibling = sibling->child;
+
+	/*
+	 * As we are referencing sgc across different topology level, we need
+	 * to go down to skip those sched_domains which don't contribute to
+	 * scheduling because they will be degenerated in cpu_attach_domain
+	 */
+	while (sibling->child &&
+	       cpumask_equal(sched_domain_span(sibling->child),
+			     sched_domain_span(sibling)))
+		sibling = sibling->child;
+
+	return sibling;
+}
+
 static int
 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
 {
@@ -1015,6 +1011,41 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
		if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
			continue;
 
+		/*
+		 * Usually we build sched_group by sibling's child sched_domain
+		 * But for machines whose NUMA diameter are 3 or above, we move
+		 * to build sched_group by sibling's proper descendant's child
+		 * domain because sibling's child sched_domain will span out of
+		 * the sched_domain being built as below.
+		 *
+		 * Smallest diameter=3 topology is:
+		 *
+		 *   node   0   1   2   3
+		 *     0:  10  20  30  40
+		 *     1:  20  10  20  30
+		 *     2:  30  20  10  20
+		 *     3:  40  30  20  10
+		 *
+		 *   0 --- 1 --- 2 --- 3
+		 *
+		 * NUMA-3       0-3             N/A             N/A             0-3
+		 *  groups:     {0-2},{1-3}                                     {1-3},{0-2}
+		 *
+		 * NUMA-2       0-2             0-3             0-3             1-3
+		 *  groups:     {0-1},{1-3}     {0-2},{2-3}     {1-3},{0-1}     {2-3},{0-2}
+		 *
+		 * NUMA-1       0-1             0-2             1-3             2-3
+		 *  groups:     {0},{1}         {1},{2},{0}     {2},{3},{1}     {3},{2}
+		 *
+		 * NUMA-0       0               1               2               3
+		 *
+		 * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the
+		 * group span isn't a subset of the domain span.
+		 */
+		if (sibling->child &&
+		    !cpumask_subset(sched_domain_span(sibling->child), span))
+			sibling = find_descended_sibling(sd, sibling);
+
		sg = build_group_from_child_sched_domain(sibling, cpu);
		if (!sg)
			goto fail;
@@ -1022,7 +1053,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
		sg_span = sched_group_span(sg);
		cpumask_or(covered, covered, sg_span);
 
-		init_overlap_sched_group(sd, sg);
+		init_overlap_sched_group(sibling, sg);
 
		if (!first)
			first = sg;
@@ -2110,7 +2141,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
	if (has_asym)
		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
 
-	if (rq && sched_debug_enabled) {
+	if (rq && sched_debug_verbose) {
		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
	}
@@ -2128,7 +2159,7 @@ static cpumask_var_t *doms_cur;
 /* Number of sched domains in 'doms_cur': */
 static int ndoms_cur;
 
-/* Attribues of custom domains in 'doms_cur' */
+/* Attributes of custom domains in 'doms_cur' */
 static struct sched_domain_attr *dattr_cur;
 
 /*
@@ -2192,7 +2223,6 @@ int sched_init_domains(const struct cpumask *cpu_map)
		doms_cur = &fallback_doms;
	cpumask_and(doms_cur[0], cpu_map, housekeeping_cpumask(HK_FLAG_DOMAIN));
	err = build_sched_domains(doms_cur[0], NULL);
-	register_sched_domain_sysctl();
 
	return err;
 }
@@ -2267,9 +2297,6 @@ void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[],
 
	lockdep_assert_held(&sched_domains_mutex);
 
-	/* Always unregister in case we don't destroy any domains: */
-	unregister_sched_domain_sysctl();
-
	/* Let the architecture update CPU core mappings: */
	new_topology = arch_update_cpu_topology();
 
@@ -2358,7 +2385,7 @@ match3:
	dattr_cur = dattr_new;
	ndoms_cur = ndoms_new;
 
-	register_sched_domain_sysctl();
+	update_sched_domain_debugfs();
 }
 
 /*
@@ -408,7 +408,8 @@ void task_join_group_stop(struct task_struct *task)
 *   appropriate lock must be held to stop the target task from exiting
 */
 static struct sigqueue *
-__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
+__sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
+		 int override_rlimit, const unsigned int sigqueue_flags)
 {
	struct sigqueue *q = NULL;
	struct user_struct *user;
@@ -430,7 +431,16 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
	rcu_read_unlock();
 
	if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
-		q = kmem_cache_alloc(sigqueue_cachep, flags);
+		/*
+		 * Preallocation does not hold sighand::siglock so it can't
+		 * use the cache. The lockless caching requires that only
+		 * one consumer and only one producer run at a time.
+		 */
+		q = READ_ONCE(t->sigqueue_cache);
+		if (!q || sigqueue_flags)
+			q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
+		else
+			WRITE_ONCE(t->sigqueue_cache, NULL);
	} else {
		print_dropped_signal(sig);
	}
@@ -440,20 +450,51 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
		free_uid(user);
	} else {
		INIT_LIST_HEAD(&q->list);
-		q->flags = 0;
+		q->flags = sigqueue_flags;
		q->user = user;
	}
 
	return q;
 }
 
+void exit_task_sigqueue_cache(struct task_struct *tsk)
+{
+	/* Race free because @tsk is mopped up */
+	struct sigqueue *q = tsk->sigqueue_cache;
+
+	if (q) {
+		tsk->sigqueue_cache = NULL;
+		/*
+		 * Hand it back to the cache as the task might
+		 * be self reaping which would leak the object.
+		 */
+		kmem_cache_free(sigqueue_cachep, q);
+	}
+}
+
+static void sigqueue_cache_or_free(struct sigqueue *q)
+{
+	/*
+	 * Cache one sigqueue per task. This pairs with the consumer side
+	 * in __sigqueue_alloc() and needs READ/WRITE_ONCE() to prevent the
+	 * compiler from store tearing and to tell KCSAN that the data race
+	 * is intentional when run without holding current->sighand->siglock,
+	 * which is fine as current obviously cannot run __sigqueue_free()
+	 * concurrently.
+	 */
+	if (!READ_ONCE(current->sigqueue_cache))
+		WRITE_ONCE(current->sigqueue_cache, q);
+	else
+		kmem_cache_free(sigqueue_cachep, q);
+}
+
 static void __sigqueue_free(struct sigqueue *q)
 {
	if (q->flags & SIGQUEUE_PREALLOC)
		return;
	if (atomic_dec_and_test(&q->user->sigpending))
		free_uid(q->user);
-	kmem_cache_free(sigqueue_cachep, q);
+	sigqueue_cache_or_free(q);
 }
 
 void flush_sigqueue(struct sigpending *queue)
@@ -1111,7 +1152,8 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc
	else
		override_rlimit = 0;
 
-	q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit);
+	q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit, 0);
+
	if (q) {
		list_add_tail(&q->list, &pending->list);
		switch ((unsigned long) info) {
@@ -1806,12 +1848,7 @@ EXPORT_SYMBOL(kill_pid);
 */
 struct sigqueue *sigqueue_alloc(void)
 {
-	struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
-
-	if (q)
-		q->flags |= SIGQUEUE_PREALLOC;
-
-	return q;
+	return __sigqueue_alloc(-1, current, GFP_KERNEL, 0, SIGQUEUE_PREALLOC);
 }
 
 void sigqueue_free(struct sigqueue *q)
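
The sigqueue changes above implement a single-slot, lockless per-task cache: the owning task is both the only producer (freeing in __sigqueue_free()) and the only consumer (allocating in __sigqueue_alloc()), which is why READ_ONCE()/WRITE_ONCE() suffice. A stripped-down userspace sketch of the same one-object cache pattern (names are illustrative, not kernel APIs):

#include <stdlib.h>

struct sigq {
	int sig;
};

/* one cache slot per "task"; only the owning thread ever touches it */
static _Thread_local struct sigq *sigq_cache;

static struct sigq *sigq_alloc(void)
{
	struct sigq *q = sigq_cache;	/* consumer side */

	if (q)
		sigq_cache = NULL;	/* take the cached object */
	else
		q = malloc(sizeof(*q));	/* cache miss: fall back to the allocator */
	return q;
}

static void sigq_free(struct sigq *q)
{
	if (!sigq_cache)
		sigq_cache = q;		/* producer side: keep one object around */
	else
		free(q);
}

int main(void)
{
	struct sigq *a = sigq_alloc();	/* miss: allocates */

	sigq_free(a);			/* cached for reuse */
	sigq_free(sigq_alloc());	/* hit: reuses the cached object */
	return 0;
}
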
@@ -409,6 +409,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
		work->fn = fn;
		work->arg = arg;
		work->done = done;
+		work->caller = _RET_IP_;
		if (cpu_stop_queue_work(cpu, work))
			queued = true;
	}
@@ -184,17 +184,6 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
 int sysctl_legacy_va_layout;
 #endif
 
-#ifdef CONFIG_SCHED_DEBUG
-static int min_sched_granularity_ns = 100000;		/* 100 usecs */
-static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
-static int min_wakeup_granularity_ns;			/* 0 usecs */
-static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
-#ifdef CONFIG_SMP
-static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
-static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif /* CONFIG_SMP */
-#endif /* CONFIG_SCHED_DEBUG */
-
 #ifdef CONFIG_COMPACTION
 static int min_extfrag_threshold;
 static int max_extfrag_threshold = 1000;
@@ -1659,58 +1648,6 @@ static struct ctl_table kern_table[] = {
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
-#ifdef CONFIG_SCHED_DEBUG
-	{
-		.procname = "sched_min_granularity_ns",
-		.data = &sysctl_sched_min_granularity,
-		.maxlen = sizeof(unsigned int),
-		.mode = 0644,
-		.proc_handler = sched_proc_update_handler,
-		.extra1 = &min_sched_granularity_ns,
-		.extra2 = &max_sched_granularity_ns,
-	},
-	{
-		.procname = "sched_latency_ns",
-		.data = &sysctl_sched_latency,
-		.maxlen = sizeof(unsigned int),
-		.mode = 0644,
-		.proc_handler = sched_proc_update_handler,
-		.extra1 = &min_sched_granularity_ns,
-		.extra2 = &max_sched_granularity_ns,
-	},
-	{
-		.procname = "sched_wakeup_granularity_ns",
-		.data = &sysctl_sched_wakeup_granularity,
-		.maxlen = sizeof(unsigned int),
-		.mode = 0644,
-		.proc_handler = sched_proc_update_handler,
-		.extra1 = &min_wakeup_granularity_ns,
-		.extra2 = &max_wakeup_granularity_ns,
-	},
-#ifdef CONFIG_SMP
-	{
-		.procname = "sched_tunable_scaling",
-		.data = &sysctl_sched_tunable_scaling,
-		.maxlen = sizeof(enum sched_tunable_scaling),
-		.mode = 0644,
-		.proc_handler = sched_proc_update_handler,
-		.extra1 = &min_sched_tunable_scaling,
-		.extra2 = &max_sched_tunable_scaling,
-	},
-	{
-		.procname = "sched_migration_cost_ns",
-		.data = &sysctl_sched_migration_cost,
-		.maxlen = sizeof(unsigned int),
-		.mode = 0644,
-		.proc_handler = proc_dointvec,
-	},
-	{
-		.procname = "sched_nr_migrate",
-		.data = &sysctl_sched_nr_migrate,
-		.maxlen = sizeof(unsigned int),
-		.mode = 0644,
-		.proc_handler = proc_dointvec,
-	},
 #ifdef CONFIG_SCHEDSTATS
	{
		.procname = "sched_schedstats",
@@ -1722,37 +1659,7 @@ static struct ctl_table kern_table[] = {
		.extra2 = SYSCTL_ONE,
	},
 #endif /* CONFIG_SCHEDSTATS */
-#endif /* CONFIG_SMP */
 #ifdef CONFIG_NUMA_BALANCING
-	{
-		.procname = "numa_balancing_scan_delay_ms",
-		.data = &sysctl_numa_balancing_scan_delay,
-		.maxlen = sizeof(unsigned int),
-		.mode = 0644,
-		.proc_handler = proc_dointvec,
-	},
-	{
-		.procname = "numa_balancing_scan_period_min_ms",
-		.data = &sysctl_numa_balancing_scan_period_min,
-		.maxlen = sizeof(unsigned int),
-		.mode = 0644,
-		.proc_handler = proc_dointvec,
-	},
-	{
-		.procname = "numa_balancing_scan_period_max_ms",
-		.data = &sysctl_numa_balancing_scan_period_max,
-		.maxlen = sizeof(unsigned int),
-		.mode = 0644,
-		.proc_handler = proc_dointvec,
-	},
-	{
-		.procname = "numa_balancing_scan_size_mb",
-		.data = &sysctl_numa_balancing_scan_size,
-		.maxlen = sizeof(unsigned int),
-		.mode = 0644,
-		.proc_handler = proc_dointvec_minmax,
-		.extra1 = SYSCTL_ONE,
-	},
	{
		.procname = "numa_balancing",
		.data = NULL, /* filled in by handler */
@@ -1763,7 +1670,6 @@ static struct ctl_table kern_table[] = {
		.extra2 = SYSCTL_ONE,
	},
 #endif /* CONFIG_NUMA_BALANCING */
-#endif /* CONFIG_SCHED_DEBUG */
	{
		.procname = "sched_rt_period_us",
		.data = &sysctl_sched_rt_period,
@@ -1710,7 +1710,6 @@ config LATENCYTOP
	select KALLSYMS_ALL
	select STACKTRACE
	select SCHEDSTATS
-	select SCHED_DEBUG
	help
	  Enable this option if you want to use the LatencyTOP tool
	  to find out which userspace is blocking on what kernel operations.
@@ -60,6 +60,7 @@
 #include <linux/prefetch.h>
 #include <linux/if_vlan.h>
 #include <linux/mpls.h>
+#include <linux/kcov.h>
 
 #include <net/protocol.h>
 #include <net/dst.h>
@@ -15,6 +15,7 @@
 #include <linux/if_arp.h>
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
+#include <linux/kcov.h>
 #include <net/mac80211.h>
 #include <net/ieee80211_radiotap.h>
 #include "ieee80211_i.h"
@@ -17,6 +17,7 @@
 #include <linux/etherdevice.h>
 #include <linux/rcupdate.h>
 #include <linux/export.h>
+#include <linux/kcov.h>
 #include <linux/bitops.h>
 #include <net/mac80211.h>
 #include <net/ieee80211_radiotap.h>