Merge branch 'zegao/scx3' into 'master' (merge request !150)

Add general in-kernel support for sched_ext (scx)
5aec0abf10 rue/scx: Kill user tasks in SCHED_EXT when scheduler is gone
a1752a5760 rue/scx: Add readonly sysctl knob kernel.cpu_qos for SCHED_BT compatibility
ed0889e48a rue/scx: Add /proc/bt_stat to maintain SCHED_BT compatibility
8c320a09af rue/scx: Add cpu.offline to maintain SCHED_BT compatibility
2b9d28baab rue/scx: Add cpu.scx to the cpu cgroup controller
576ee0803a rue/scx: Add /proc/scx_stat to do scx cputime accounting
67d151255e rue/scx: Fix lockdep warn on printk with rq lock held
ebf91df4dc rue/scx: Reorder scx_fork_rwsem, cpu_hotplug_lock and scx_cgroup_rwsem
frankjpliu 2024-08-23 11:40:38 +00:00
commit 897ad8fab4
7 changed files with 301 additions and 9 deletions
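The commits above add several userspace-visible interfaces: /proc/scx_stat and /proc/bt_stat for SCX cputime accounting, the read-only sysctl kernel.cpu_qos, and the per-cgroup cpu.scx / cpu.offline switches. The sketch below is one way userspace might exercise them; the cgroup path /sys/fs/cgroup/cpu/demo is an assumption (cgroup v1 "cpu" hierarchy), everything else follows the file names introduced by these commits.

/* Sketch only: the cgroup mount point and group name are assumptions. */
#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f;

	/* /proc/scx_stat mirrors /proc/stat with one extra trailing scx column. */
	f = fopen("/proc/scx_stat", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}

	/* Opt the cgroup's tasks into the ext class; writing 0 moves them back to fair. */
	f = fopen("/sys/fs/cgroup/cpu/demo/cpu.scx", "w");
	if (f) {
		fputs("1\n", f);
		fclose(f);
	}

	/* kernel.cpu_qos is read-only and reports whether an SCX scheduler is loaded. */
	f = fopen("/proc/sys/kernel/cpu_qos", "r");
	if (f) {
		if (fgets(line, sizeof(line), f))
			printf("cpu_qos: %s", line);
		fclose(f);
	}

	return 0;
}

With a scheduler attached, the cpu.scx write lands in scx_cpu_cgroup_switch() in the sched_ext diff below and kernel.cpu_qos reads back 1; with no scheduler loaded the write is rejected with -EPERM and the knob reads 0.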

View File

@@ -208,9 +208,110 @@ static const struct proc_ops stat_proc_ops = {
.proc_release = single_release,
};
#ifdef CONFIG_SCHED_CLASS_EXT
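/*
 * /proc/scx_stat (also exposed as /proc/bt_stat for SCHED_BT compatibility):
 * same columns as /proc/stat plus a trailing field with the CPUTIME_SCX time
 * accumulated by sched_ext tasks.
 */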
static int scx_show_stat(struct seq_file *p, void *v)
{
int i;
u64 user, nice, system, idle, iowait, irq, softirq, steal, scx;
u64 guest, guest_nice;
user = nice = system = idle = iowait =
irq = softirq = steal = scx = 0;
guest = guest_nice = 0;
for_each_possible_cpu(i) {
struct kernel_cpustat kcpustat;
u64 *cpustat = kcpustat.cpustat;
kcpustat_cpu_fetch(&kcpustat, i);
user += cpustat[CPUTIME_USER];
nice += cpustat[CPUTIME_NICE];
system += cpustat[CPUTIME_SYSTEM];
idle += get_idle_time(&kcpustat, i);
iowait += get_iowait_time(&kcpustat, i);
irq += cpustat[CPUTIME_IRQ];
softirq += cpustat[CPUTIME_SOFTIRQ];
steal += cpustat[CPUTIME_STEAL];
guest += cpustat[CPUTIME_GUEST];
guest_nice += cpustat[CPUTIME_GUEST_NICE];
scx += cpustat[CPUTIME_SCX];
}
seq_put_decimal_ull(p, "cpu ", nsec_to_clock_t(user));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(system));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(scx));
seq_putc(p, '\n');
for_each_online_cpu(i) {
struct kernel_cpustat kcpustat;
u64 *cpustat = kcpustat.cpustat;
kcpustat_cpu_fetch(&kcpustat, i);
/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
user = cpustat[CPUTIME_USER];
nice = cpustat[CPUTIME_NICE];
system = cpustat[CPUTIME_SYSTEM];
idle = get_idle_time(&kcpustat, i);
iowait = get_iowait_time(&kcpustat, i);
irq = cpustat[CPUTIME_IRQ];
softirq = cpustat[CPUTIME_SOFTIRQ];
steal = cpustat[CPUTIME_STEAL];
guest = cpustat[CPUTIME_GUEST];
guest_nice = cpustat[CPUTIME_GUEST_NICE];
scx = cpustat[CPUTIME_SCX];
seq_printf(p, "cpu%d", i);
seq_put_decimal_ull(p, " ", nsec_to_clock_t(user));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(nice));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(system));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(idle));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(iowait));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(irq));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(softirq));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(steal));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(guest_nice));
seq_put_decimal_ull(p, " ", nsec_to_clock_t(scx));
seq_putc(p, '\n');
}
seq_putc(p, '\n');
return 0;
}
static int scx_stat_open(struct inode *inode, struct file *file)
{
unsigned int size = 1024 + 128 * num_online_cpus();
return single_open_size(file, scx_show_stat, NULL, size);
}
static const struct proc_ops scx_stat_proc_ops = {
.proc_flags = PROC_ENTRY_PERMANENT,
.proc_open = scx_stat_open,
.proc_read_iter = seq_read_iter,
.proc_lseek = seq_lseek,
.proc_release = single_release,
};
#endif
static int __init proc_stat_init(void)
{
proc_create("stat", 0, NULL, &stat_proc_ops);
#ifdef CONFIG_SCHED_CLASS_EXT
proc_create("scx_stat", 0, NULL, &scx_stat_proc_ops);
proc_create("bt_stat", 0, NULL, &scx_stat_proc_ops);
#endif
return 0;
}
fs_initcall(proc_stat_init);

View File

@@ -30,6 +30,9 @@ enum cpu_usage_stat {
CPUTIME_GUEST_NICE,
#ifdef CONFIG_SCHED_CORE
CPUTIME_FORCEIDLE,
#endif
#ifdef CONFIG_SCHED_CLASS_EXT
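/* time spent running tasks of the ext sched class */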
CPUTIME_SCX,
#endif
NR_STATS,
};

View File

@@ -4883,6 +4883,14 @@ int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
}
#endif
rseq_migrate(p);
#ifdef CONFIG_EXT_GROUP_SCHED
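/* Newly forked tasks start in the class implied by their cgroup's cpu.scx setting. */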
if (scx_enabled()) {
if (p->sched_class != &ext_sched_class && p->sched_task_group->scx)
p->sched_class = &ext_sched_class;
else if (p->sched_class == &ext_sched_class && !p->sched_task_group->scx)
p->sched_class = &fair_sched_class;
}
#endif
/*
* We're setting the CPU for the first time, we don't migrate,
* so use __set_task_cpu().
@@ -10635,6 +10643,9 @@ void sched_move_task(struct task_struct *tsk)
struct task_group *group;
struct rq_flags rf;
struct rq *rq;
#ifdef CONFIG_EXT_GROUP_SCHED
const struct sched_class *prev_class;
#endif
rq = task_rq_lock(tsk, &rf);
/*
@@ -10656,8 +10667,20 @@ void sched_move_task(struct task_struct *tsk)
put_prev_task(rq, tsk);
sched_change_group(tsk, group);
#ifdef CONFIG_EXT_GROUP_SCHED
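/*
 * Moving to another cgroup may switch the task between the ext and fair
 * classes, depending on the destination group's cpu.scx setting.
 */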
prev_class = tsk->sched_class;
if (scx_enabled()) {
if (prev_class != &ext_sched_class && group->scx)
tsk->sched_class = &ext_sched_class;
else if (prev_class == &ext_sched_class && !group->scx)
tsk->sched_class = &fair_sched_class;
}
#endif
scx_move_task(tsk);
#ifdef CONFIG_EXT_GROUP_SCHED
check_class_changing(rq, tsk, prev_class);
#endif
if (queued)
enqueue_task(rq, tsk, queue_flags);
if (running) {
@@ -10669,7 +10692,9 @@ void sched_move_task(struct task_struct *tsk)
*/
resched_curr(rq);
}
#ifdef CONFIG_EXT_GROUP_SCHED
check_class_changed(rq, tsk, prev_class, tsk->prio);
#endif
unlock:
task_rq_unlock(rq, tsk, &rf);
}
@@ -11416,6 +11441,31 @@ static int cpu_quota_aware_write_u64(struct cgroup_subsys_state *css,
#endif
#ifdef CONFIG_EXT_GROUP_SCHED
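/*
 * cpu.scx (and the SCHED_BT-compatible alias cpu.offline) is a 0/1 knob;
 * flipping it hands the cgroup's tasks to the ext or fair sched class via
 * scx_cpu_cgroup_switch().
 */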
static u64 cpu_scx_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct task_group *tg = css_tg(css);
return tg->scx;
}
static int cpu_scx_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 val)
{
struct task_group *tg = css_tg(css);
if (val > 1)
return -ERANGE;
if (tg->scx == val)
return 0;
return scx_cpu_cgroup_switch(tg, val);
}
#endif
static struct cftype cpu_legacy_cftypes[] = {
#ifdef CONFIG_CGROUPFS
{
@@ -11487,6 +11537,20 @@ static struct cftype cpu_legacy_cftypes[] = {
.seq_show = cpu_uclamp_max_show,
.write = cpu_uclamp_max_write,
},
#endif
#ifdef CONFIG_EXT_GROUP_SCHED
{
.name = "scx",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = cpu_scx_read_u64,
.write_u64 = cpu_scx_write_u64,
},
{
.name = "offline",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = cpu_scx_read_u64,
.write_u64 = cpu_scx_write_u64,
},
#endif
{ } /* Terminate */
};
@@ -11715,6 +11779,20 @@ struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = {
.seq_show = cpu_uclamp_max_show,
.write = cpu_uclamp_max_write,
},
#endif
#ifdef CONFIG_EXT_GROUP_SCHED
[CPU_CFTYPE_SCX_SWITCH] = {
.name = "scx",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = cpu_scx_read_u64,
.write_u64 = cpu_scx_write_u64,
},
[CPU_CFTYPE_OFFLINE_SWITCH] = {
.name = "offline",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = cpu_scx_read_u64,
.write_u64 = cpu_scx_write_u64,
},
#endif
{ } /* terminate */
};

View File

@@ -114,6 +114,11 @@ static inline void task_group_account_field(struct task_struct *p, int index,
*/
__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
#ifdef CONFIG_SCHED_CLASS_EXT
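/* Also fold time from sched_ext tasks into the CPUTIME_SCX bucket. */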
if (p->sched_class == &ext_sched_class)
__this_cpu_add(kernel_cpustat.cpustat[CPUTIME_SCX], tmp);
#endif
cgroup_account_cputime_field(p, index, tmp);
#ifdef CONFIG_CGROUP_SLI

View File

@@ -200,6 +200,7 @@ struct scx_task_iter {
#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)])
static u32 sysctl_cpu_qos_enabled;
static u32 sysctl_scx_hung_exit = 1;
/* if the highest set bit is N, return a mask with bits [N+1, 31] set */
@@ -478,6 +479,17 @@ scx_task_iter_next_filtered_locked(struct scx_task_iter *iter)
return p;
}
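/*
 * Drop the rq lock held on the iterator's current task, if any; returns
 * true if a lock was released.
 */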
static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter)
{
if (iter->locked) {
task_rq_unlock(iter->rq, iter->locked, &iter->rf);
iter->locked = NULL;
return true;
} else {
return false;
}
}
static enum scx_ops_enable_state scx_ops_enable_state(void)
{
return atomic_read(&scx_ops_enable_state_var);
@@ -2159,6 +2171,8 @@ static bool check_rq_for_timeouts(struct rq *rq)
struct task_struct *p;
struct rq_flags rf;
bool timed_out = false;
bool log_wo_exit = false;
u32 dur_ms;
rq_lock_irqsave(rq, &rf);
list_for_each_entry(p, &rq->scx.watchdog_list, scx.watchdog_node) {
@@ -2166,7 +2180,7 @@ static bool check_rq_for_timeouts(struct rq *rq)
if (unlikely(time_after(jiffies,
last_runnable + scx_watchdog_timeout))) {
dur_ms = jiffies_to_msecs(jiffies - last_runnable);
if (sysctl_scx_hung_exit)
scx_ops_error_kind(SCX_EXIT_ERROR_STALL,
@@ -2175,11 +2189,7 @@
dur_ms / 1000,
dur_ms % 1000);
else
log_wo_exit = true;
timed_out = true;
break;
@@ -2187,6 +2197,13 @@
}
rq_unlock_irqrestore(rq, &rf);
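/* Log outside the rq lock; printk while holding the rq lock can trip lockdep. */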
if (log_wo_exit)
pr_warn_ratelimited(
"%s[%d] failed to run for %u.%03us\n",
p->comm, p->pid,
dur_ms / 1000,
dur_ms % 1000);
return timed_out;
}
@@ -2636,6 +2653,50 @@ static void scx_cgroup_unlock(void)
percpu_up_write(&scx_cgroup_rwsem);
}
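/*
 * Handler for the cpu.scx / cpu.offline cgroup knobs: record the new value
 * in @tg and requeue every task in the cgroup under the matching sched
 * class. Rejected with -EPERM while no scheduler is loaded or switch-all
 * is in effect.
 */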
int scx_cpu_cgroup_switch(struct task_group *tg, int val)
{
struct task_struct *p;
struct css_task_iter it;
int ret = 0;
percpu_down_write(&scx_fork_rwsem);
scx_cgroup_lock();
if (!scx_enabled() || READ_ONCE(scx_switching_all)) {
ret = -EPERM;
goto out;
}
tg->scx = val;
css_task_iter_start(&tg->css, 0, &it);
while ((p = css_task_iter_next(&it))) {
const struct sched_class *old_class = p->sched_class;
struct sched_enq_and_set_ctx ctx;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE,
&ctx);
if (old_class != &ext_sched_class && tg->scx)
p->sched_class = &ext_sched_class;
else if (old_class == &ext_sched_class && !tg->scx)
p->sched_class = &fair_sched_class;
check_class_changing(rq, p, old_class);
sched_enq_and_set_task(&ctx);
check_class_changed(rq, p, old_class, p->prio);
task_rq_unlock(rq, p, &rf);
}
css_task_iter_end(&it);
out:
percpu_up_write(&scx_fork_rwsem);
scx_cgroup_unlock();
return ret;
}
#else /* CONFIG_EXT_GROUP_SCHED */
static inline void scx_cgroup_lock(void) {}
@@ -2796,6 +2857,7 @@ static void scx_cgroup_exit(void)
if (!(tg->scx_flags & SCX_TG_INITED))
continue;
tg->scx_flags &= ~SCX_TG_INITED;
tg->scx = 0;
if (!scx_ops.cgroup_exit)
continue;
@@ -2848,6 +2910,7 @@ static int scx_cgroup_init(void)
return ret;
}
tg->scx_flags |= SCX_TG_INITED;
tg->scx = 0;
rcu_read_lock();
css_put(css);
@@ -2926,9 +2989,27 @@ static void scx_ops_fallback_enqueue(struct task_struct *p, u64 enq_flags)
static void scx_ops_fallback_dispatch(s32 cpu, struct task_struct *prev) {}
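/*
 * kernel.cpu_qos is read-only (writes return -EPERM) and reports whether a
 * sched_ext scheduler is currently enabled; kept for SCHED_BT compatibility.
 */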
static int sysctl_cpu_qos_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
if (write)
return -EPERM;
return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
static struct ctl_table_header *scx_sysctl_table_hdr;
static struct ctl_table scx_sysctl_table[] = {
{
.procname = "cpu_qos",
.data = &sysctl_cpu_qos_enabled,
.maxlen = sizeof(u32),
.mode = 0644,
.proc_handler = &sysctl_cpu_qos_handler,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{
.procname = "scx_hung_exit",
.data = &sysctl_scx_hung_exit,
@@ -2961,6 +3042,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
struct scx_dispatch_q *dsq;
const char *reason;
int i, cpu, kind;
bool last_scx_switch_all;
kind = atomic_read(&scx_exit_kind);
while (true) {
@@ -3073,11 +3155,12 @@ forward_progress_guaranteed:
mutex_lock(&scx_ops_enable_mutex);
static_branch_disable(&__scx_switched_all);
last_scx_switch_all = READ_ONCE(scx_switching_all);
WRITE_ONCE(scx_switching_all, false);
/* avoid racing against fork and cgroup changes */
percpu_down_write(&scx_fork_rwsem);
cpus_read_lock();
scx_cgroup_lock();
spin_lock_irq(&scx_tasks_lock);
@@ -3101,6 +3184,19 @@ forward_progress_guaranteed:
check_class_changed(task_rq(p), p, old_class, p->prio);
scx_ops_disable_task(p);
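/*
 * The scheduler was removed without switch-all: user tasks (not kthreads
 * or user workers) that had been running in the ext class via cgroup
 * opt-in are sent SIGKILL. Drop the iterator's rq lock and scx_tasks_lock
 * around the signal and the log message.
 */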
if (alive && !last_scx_switch_all &&
!(p->flags & (PF_KTHREAD | PF_USER_WORKER)) &&
old_class == &ext_sched_class &&
p->sched_class != old_class) {
scx_task_iter_rq_unlock(&sti);
spin_unlock_irq(&scx_tasks_lock);
do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
pr_err("scx: Unexpected scheduler unplug found, killed scx task %d (%s)\n",
task_pid_nr(p), p->comm);
spin_lock_irq(&scx_tasks_lock);
}
}
scx_task_iter_exit(&sti);
spin_unlock_irq(&scx_tasks_lock);
@@ -3115,11 +3211,13 @@ forward_progress_guaranteed:
static_branch_disable_cpuslocked(&scx_builtin_idle_enabled);
synchronize_rcu();
sysctl_cpu_qos_enabled = 0;
scx_cgroup_exit();
scx_cgroup_unlock();
cpus_read_unlock();
percpu_up_write(&scx_fork_rwsem);
if (ei->kind >= SCX_EXIT_ERROR) {
printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name);
@@ -3338,6 +3436,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops)
static_branch_enable_cpuslocked(&__scx_ops_enabled);
sysctl_cpu_qos_enabled = 1;
/*
* Enable ops for every task. Fork is excluded by scx_fork_rwsem
* preventing new tasks from being added. No need to exclude tasks

View File

@@ -254,6 +254,7 @@ void scx_move_task(struct task_struct *p);
void scx_cgroup_finish_attach(void);
void scx_cgroup_cancel_attach(struct cgroup_taskset *tset);
void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight);
int scx_cpu_cgroup_switch(struct task_group *tg, int val);
#else /* CONFIG_EXT_GROUP_SCHED */
static inline int scx_tg_online(struct task_group *tg) { return 0; }
static inline void scx_tg_offline(struct task_group *tg) {}
@@ -262,5 +263,6 @@ static inline void scx_move_task(struct task_struct *p) {}
static inline void scx_cgroup_finish_attach(void) {}
static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {}
static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {}
static inline int scx_cpu_cgroup_switch(struct task_group *tg, int val) { return 0; }
#endif /* CONFIG_EXT_GROUP_SCHED */
#endif /* CONFIG_CGROUP_SCHED */

View File

@@ -433,6 +433,7 @@ struct task_group {
#ifdef CONFIG_EXT_GROUP_SCHED
u32 scx_flags; /* SCX_TG_* */
u32 scx_weight;
u32 scx;
#endif
struct rcu_head rcu;
@@ -3666,6 +3667,8 @@ enum cpu_cftype_id {
CPU_CFTYPE_WEIGHT,
CPU_CFTYPE_WEIGHT_NICE,
CPU_CFTYPE_IDLE,
CPU_CFTYPE_SCX_SWITCH,
CPU_CFTYPE_OFFLINE_SWITCH,
#endif
#ifdef CONFIG_CFS_BANDWIDTH
CPU_CFTYPE_MAX,