OpenCloudOS-Kernel/kernel/sched/rqm.c

// SPDX-License-Identifier: GPL-2.0-only
/*
 * TencentOS Resources Quality Monitor
 *
 * Aims to evaluate the quality of service of system resources, including
 * CPU, memory, IO and network.
 *
 * Copyright (c) 2021 Tencent. All Rights reserved.
 * Author: Jiang Biao <benbjiang@tencent.com>
 */
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/rqm.h>
#include <linux/sched/loadavg.h>
#include "sched.h"
#define RQM_FREQ (2*HZ+1) /* 2 sec intervals */
#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */
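/*
 * Fixed-point background (following the loadavg helpers in
 * <linux/sched/loadavg.h>): FIXED_1 is 1 << 11 = 2048, so EXP_10s is the
 * 11-bit fixed-point value of exp(-2s/10s), i.e. 2048 / e^0.2 ~= 1677.
 * Each calc_load(old, EXP_10s, new) call blends the new sample into the
 * old score, giving roughly a 10-second exponentially decaying average,
 * just like the system load averages.
 */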
/* Per-CPU accounting of the exec runtime of cgroups running on this CPU */
struct rqm_cgroup_cpu_stat {
unsigned long long cgroup_exec_runtime;
unsigned long long cgroup_prio_exec_runtime[CGROUP_PRIORITY_MAX];
unsigned long long cgroup_prio_stalled_time;
unsigned long long cgroup_stalled_time;
};
DEFINE_PER_CPU(struct rqm_cgroup_cpu_stat, rqm_cgroup_cpu_stat);
/* Per-CPU accounting of the memory allocation information of cgroups on this CPU */
struct rqm_cgroup_mem_stat {
struct rqm_group_mem cgroup_mem_stat[CGROUP_PRIORITY_MAX][MAX_ORDER];
};
DEFINE_PER_CPU(struct rqm_cgroup_mem_stat, rqm_cgroup_mem_stat);
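/*
 * Two switches: rqm_enable is the boot-time switch set via the "rqm="
 * kernel parameter and is only consulted at initialization here, while
 * sysctl_rqm is the runtime on/off knob checked in the accounting hot
 * paths (registered as a sysctl outside this file).
 */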
unsigned int sysctl_rqm = 0;
static struct delayed_work system_rqm_work;
struct rqm_group rqm_system;
bool rqm_enable = true;
static int __init setup_rqm(char *str)
{
return kstrtobool(str, &rqm_enable) == 0;
}
__setup("rqm=", setup_rqm);
void account_rqm_cgroup_cpu_stat(struct cgroup *cgrp, u64 tmp)
{
u16 prio;
if (!sysctl_rqm)
return;
prio = cgrp->priority;
__this_cpu_add(rqm_cgroup_cpu_stat.cgroup_prio_exec_runtime[prio], tmp);
__this_cpu_add(rqm_cgroup_cpu_stat.cgroup_exec_runtime, tmp);
}
/* sched_rqm_dequeued() is called when a task is dequeued or is about to run */
void sched_rqm_dequeued(struct rq *rq, struct task_struct *t)
{
u16 prio;
long delta, origin_delta;
struct cgroup *cgrp;
struct cgroup_rstat_cpu *rstatc;
struct rqm_cgroup_cpu_stat *cpu_stat;
if (!sysctl_rqm)
return;
cgrp = task_dfl_cgroup(t);
if (!cgroup_parent(cgrp))
return;
	if (!t->sched_rqm.last_queued)
return;
rstatc = per_cpu_ptr(cgrp->rstat_cpu, cpu_of(rq));
origin_delta = delta = rq_clock(rq) - t->sched_rqm.last_queued;
	/*
	 * Delta is the time this task spent stalled on the runqueue because
	 * of other cgroups' tasks. However, other tasks of the same cgroup
	 * may have run on this CPU while this task was stalled, so subtract
	 * that runtime to get the stall time caused only by other cgroups'
	 * tasks.
	 */
delta -= (rstatc->bstat.cputime.sum_exec_runtime -
t->sched_rqm.last_cgroup_exec_runtime);
t->sched_rqm.last_queued = 0;
if (delta <= 0)
return;
cpu_stat = per_cpu_ptr(&rqm_cgroup_cpu_stat, cpu_of(rq));
rstatc->rqm_stat.cgroup_stalled_time += delta;
cpu_stat->cgroup_stalled_time += delta;
prio = cgrp->priority;
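	/*
	 * cgroup_prio_exec_runtime[prio] accumulates the runtime of every
	 * cgroup at this priority on this CPU, so subtracting its growth
	 * from the queued time leaves the stall attributable to cgroups of
	 * other priorities.
	 */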
delta = cpu_stat->cgroup_prio_exec_runtime[prio] -
t->sched_rqm.last_cgroup_prio_exec_runtime;
delta = origin_delta - delta;
if (delta <= 0)
return;
rstatc->rqm_stat.cgroup_prio_stalled_time += delta;
cpu_stat->cgroup_prio_stalled_time += delta;
}
/* Wrapper around sched_rqm_dequeued() */
static inline void sched_rqm_arrive(struct rq *rq, struct task_struct *t)
{
sched_rqm_dequeued(rq, t);
}
void sched_rqm_enqueued(struct rq *rq, struct task_struct *t)
{
u16 prio;
struct cgroup *cgrp;
struct cgroup_rstat_cpu *rstatc;
struct rqm_cgroup_cpu_stat *cpu_stat;
cgrp = task_dfl_cgroup(t);
if (!cgroup_parent(cgrp))
return;
prio = cgrp->priority;
rstatc = per_cpu_ptr(cgrp->rstat_cpu, cpu_of(rq));
t->sched_rqm.last_cgroup_exec_runtime = rstatc->bstat.cputime.sum_exec_runtime;
cpu_stat = per_cpu_ptr(&rqm_cgroup_cpu_stat, cpu_of(rq));
t->sched_rqm.last_cgroup_prio_exec_runtime =
cpu_stat->cgroup_prio_exec_runtime[prio];
t->sched_rqm.last_queued = rq_clock(rq);
}
/* Wrapper around sched_rqm_enqueued() */
static inline void sched_rqm_depart(struct rq *rq, struct task_struct *prev)
{
if (!sysctl_rqm)
return;
	/* prev was preempted but is still runnable; restart its stall accounting */
sched_rqm_enqueued(rq, prev);
}
void sched_rqm_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
if (!sysctl_rqm)
return;
if (prev != rq->idle && prev->state == TASK_RUNNING)
sched_rqm_depart(rq, prev);
if (next != rq->idle)
sched_rqm_arrive(rq, next);
}
#define MAX_LATENCY 100000UL
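/*
 * Memory score: the average per-page allocation latency over the last
 * interval is mapped linearly onto [1, 100] (100 when allocations see no
 * stall, 1 once the average reaches MAX_LATENCY), then smoothed with the
 * same 10s exponential average as the CPU scores.
 */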
void calc_system_rqm_mem_info(struct rqm_group_mem *rqm_mem_stat)
{
unsigned long long alloc_ticks, alloc_stall, time_avg, alloc_avg;
alloc_ticks = rqm_mem_stat->cgroup_mem_ticks - rqm_system.last_mem_stat.cgroup_mem_ticks;
alloc_stall = rqm_mem_stat->cgroup_mem_stall - rqm_system.last_mem_stat.cgroup_mem_stall;
	/* Some memory allocation has occurred during this interval */
if (alloc_ticks > 0)
alloc_avg = alloc_stall / alloc_ticks;
else
alloc_avg = 0;
	/* Clamp to the minimum score once alloc_avg reaches MAX_LATENCY */
if (alloc_avg >= MAX_LATENCY)
time_avg = 1;
else
time_avg = (MAX_LATENCY - alloc_avg) * 100 / MAX_LATENCY;
time_avg = time_avg * FIXED_1;
rqm_system.score[RQM_MEM] = calc_load(rqm_system.score[RQM_MEM], EXP_10s, time_avg);
rqm_system.last_mem_stat.cgroup_mem_ticks = rqm_mem_stat->cgroup_mem_ticks;
rqm_system.last_mem_stat.cgroup_mem_stall = rqm_mem_stat->cgroup_mem_stall;
}
void collect_all_mem_info(struct rqm_cgroup_mem_stat *mem_stat,
struct rqm_group_mem *temp_group_mem)
{
int order, priority;
for (priority = 0; priority < CGROUP_PRIORITY_MAX; priority++) {
for (order = 0; order < MAX_ORDER; order++) {
			/*
			 * Allocating multiple pages may consume more time than
			 * allocating a single page, so regard a higher-order
			 * allocation as multiple single-page allocations.
			 *
			 * TODO: this logic may be optimized later.
			 */
temp_group_mem->cgroup_mem_ticks +=
mem_stat->cgroup_mem_stat[priority][order].cgroup_mem_ticks
* (1 << order);
temp_group_mem->cgroup_mem_stall +=
mem_stat->cgroup_mem_stat[priority][order].cgroup_mem_stall;
}
}
}
/* Calculate the system's rqm scores periodically. */
static void system_rqm_avgs_work(struct work_struct *work)
{
int cpu;
unsigned long long stalled_time = 0, prio_stalled_time = 0, exec_runtime = 0;
long long delta_stalled_time, delta_prio_stalled_time, delta_exec_runtime;
unsigned long score;
struct rqm_cgroup_cpu_stat *cpu_stat;
struct rqm_cgroup_mem_stat *mem_stat;
struct rqm_group_mem rqm_mem_stat = {
.cgroup_mem_ticks = 0,
.cgroup_mem_stall = 0
};
if (!sysctl_rqm)
goto out;
for_each_online_cpu(cpu) {
cpu_stat = per_cpu_ptr(&rqm_cgroup_cpu_stat, cpu);
mem_stat = per_cpu_ptr(&rqm_cgroup_mem_stat, cpu);
exec_runtime += cpu_stat->cgroup_exec_runtime;
prio_stalled_time += cpu_stat->cgroup_prio_stalled_time;
stalled_time += cpu_stat->cgroup_stalled_time;
collect_all_mem_info(mem_stat, &rqm_mem_stat);
}
delta_exec_runtime = exec_runtime - rqm_system.last_exec_runtime;
delta_prio_stalled_time = prio_stalled_time - rqm_system.last_prio_stalled_time;
delta_stalled_time = stalled_time - rqm_system.last_stalled_time;
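	/*
	 * CPU scores for this interval: the share of useful execution time,
	 * score = 100 * exec / (exec + stalled), expressed in FIXED_1 units
	 * and fed into the 10s exponential average. With no execution in the
	 * interval, a full score of 100 is reported instead.
	 */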
if (delta_exec_runtime > 0) {
long long delta_denom;
delta_denom = max((delta_exec_runtime + delta_prio_stalled_time), (long long)1);
score = (delta_exec_runtime * 100 * FIXED_1) / delta_denom;
rqm_system.score[RQM_PRIO_CPU] = calc_load(rqm_system.score[RQM_PRIO_CPU],
EXP_10s, score);
delta_denom = max((delta_exec_runtime + delta_stalled_time), (long long)1);
score = (delta_exec_runtime * 100 * FIXED_1) / delta_denom;
rqm_system.score[RQM_CPU] = calc_load(rqm_system.score[RQM_CPU],
EXP_10s, score);
rqm_system.last_exec_runtime = exec_runtime;
rqm_system.last_prio_stalled_time = prio_stalled_time;
rqm_system.last_stalled_time = stalled_time;
} else {
rqm_system.score[RQM_PRIO_CPU] = calc_load(rqm_system.score[RQM_PRIO_CPU],
EXP_10s, 100 * FIXED_1);
rqm_system.score[RQM_CPU] = calc_load(rqm_system.score[RQM_CPU],
EXP_10s, 100 * FIXED_1);
}
calc_system_rqm_mem_info(&rqm_mem_stat);
out:
schedule_delayed_work(&system_rqm_work, RQM_FREQ);
}
/* Account memory allocation info for rqm. Called from __alloc_pages_nodemask() */
void account_rqm_cgroup_mem_stat(struct mem_cgroup *memcg, int order, int latency)
{
struct cgroup *cgrp;
struct cgroup_rstat_cpu *rstat;
u16 priority;
cgrp = memcg->css.cgroup;
if (!cgroup_parent(cgrp))
return;
	/* Update per-cgroup memory allocation info */
rstat = this_cpu_ptr(cgrp->rstat_cpu);
rstat->rqm_mem_stat[order].cgroup_mem_ticks++;
rstat->rqm_mem_stat[order].cgroup_mem_stall += latency;
	/* Update system-wide memory allocation info */
priority = cgrp->priority;
__this_cpu_add(rqm_cgroup_mem_stat.cgroup_mem_stat[priority][order].cgroup_mem_ticks, 1);
__this_cpu_add(rqm_cgroup_mem_stat.cgroup_mem_stat[priority][order].cgroup_mem_stall, latency);
}
void rqm_group_init(struct rqm_group *group)
{
int i;
if (!rqm_enable)
return;
for (i = 0; i < NR_RQM_TYPES; i++)
group->score[i] = 100 * FIXED_1;
}
void __init rqm_init(void)
{
rqm_group_init(&rqm_system);
}
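/*
 * Print a fixed-point score as "integer.fraction" using the loadavg
 * LOAD_INT()/LOAD_FRAC() helpers, so a full score reads as "100.00".
 */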
int rqm_show(struct seq_file *m, struct rqm_group *rqm, enum rqm_type type)
{
unsigned long score;
score = rqm->score[type];
seq_printf(m, "%lu.%02lu\n", LOAD_INT(score), LOAD_FRAC(score));
return 0;
}
static int rqm_io_show(struct seq_file *m, void *v)
{
return rqm_show(m, &rqm_system, RQM_IO);
}
static int rqm_mem_show(struct seq_file *m, void *v)
{
	return rqm_show(m, &rqm_system, RQM_MEM);
}
static int rqm_cpu_show(struct seq_file *m, void *v)
{
unsigned long score, prio_score;
score = rqm_system.score[RQM_CPU];
prio_score = rqm_system.score[RQM_PRIO_CPU];
seq_printf(m, "%lu.%02lu %lu.%02lu\n", LOAD_INT(score), LOAD_FRAC(score),
LOAD_INT(prio_score), LOAD_FRAC(prio_score));
return 0;
}
static int rqm_net_show(struct seq_file *m, void *v)
{
	/* TODO: network quality is not accounted yet; report a full score */
	seq_printf(m, "%lu.%02lu\n",
		   LOAD_INT((unsigned long)(100 * FIXED_1)),
		   LOAD_FRAC((unsigned long)(100 * FIXED_1)));
return 0;
}
static int rqm_io_open(struct inode *inode, struct file *file)
{
return single_open(file, rqm_io_show, NULL);
}
static int rqm_mem_open(struct inode *inode, struct file *file)
{
return single_open(file, rqm_mem_show, NULL);
}
static int rqm_cpu_open(struct inode *inode, struct file *file)
{
return single_open(file, rqm_cpu_show, NULL);
}
static int rqm_net_open(struct inode *inode, struct file *file)
{
return single_open(file, rqm_net_show, NULL);
}
static int rqm_fop_release(struct inode *inode, struct file *file)
{
return single_release(inode, file);
}
static const struct file_operations rqm_io_fops = {
.open = rqm_io_open,
.read = seq_read,
.llseek = seq_lseek,
.release = rqm_fop_release,
};
static const struct file_operations rqm_memory_fops = {
.open = rqm_mem_open,
.read = seq_read,
.llseek = seq_lseek,
.release = rqm_fop_release,
};
static const struct file_operations rqm_cpu_fops = {
.open = rqm_cpu_open,
.read = seq_read,
.llseek = seq_lseek,
.release = rqm_fop_release,
};
static const struct file_operations rqm_net_fops = {
.open = rqm_net_open,
.read = seq_read,
.llseek = seq_lseek,
.release = rqm_fop_release,
};
static int __init rqm_proc_init(void)
{
proc_mkdir("rue", NULL);
proc_mkdir("rue/quality", NULL);
proc_create("rue/quality/io", 0, NULL, &rqm_io_fops);
proc_create("rue/quality/memory", 0, NULL, &rqm_memory_fops);
proc_create("rue/quality/cpu", 0, NULL, &rqm_cpu_fops);
proc_create("rue/quality/net", 0, NULL, &rqm_net_fops);
INIT_DELAYED_WORK(&system_rqm_work, system_rqm_avgs_work);
schedule_delayed_work(&system_rqm_work, RQM_FREQ);
return 0;
}
module_init(rqm_proc_init);