// SPDX-License-Identifier: GPL-2.0-only
/*
 * TencentOS Resources Quality Monitor
 *
 * Aims to evaluate the Quality of Service of system resources, including
 * CPU, memory, IO and network.
 *
 * Copyright (c) 2021 Tencent. All Rights reserved.
 * Author: Jiang Biao <benbjiang@tencent.com>
 */

#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/rqm.h>
#include <linux/sched/loadavg.h>
#include "sched.h"

#define RQM_FREQ (2*HZ+1) /* 2 sec intervals */
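/*
 * EXP_10s is the fixed-point decay factor fed to calc_load():
 * FIXED_1 / exp(2s/10s) = 2048 / e^0.2 ~= 1677, i.e. samples taken every
 * RQM_FREQ (~2s) decay with a 10 second time constant.
 */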
#define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */

/* Account the exec information of cgroups running on this CPU */
struct rqm_cgroup_cpu_stat {
	unsigned long long cgroup_exec_runtime;
	unsigned long long cgroup_prio_exec_runtime[CGROUP_PRIORITY_MAX];
	unsigned long long cgroup_prio_stalled_time;
	unsigned long long cgroup_stalled_time;
};

DEFINE_PER_CPU(struct rqm_cgroup_cpu_stat, rqm_cgroup_cpu_stat);

/* Account the memory allocation information of cgroups on this CPU */
struct rqm_cgroup_mem_stat {
	struct rqm_group_mem cgroup_mem_stat[CGROUP_PRIORITY_MAX][MAX_ORDER];
};

DEFINE_PER_CPU(struct rqm_cgroup_mem_stat, rqm_cgroup_mem_stat);

unsigned int sysctl_rqm = 0;

static struct delayed_work system_rqm_work;
struct rqm_group rqm_system;

bool rqm_enable = true;
static int __init setup_rqm(char *str)
{
	return kstrtobool(str, &rqm_enable) == 0;
}
__setup("rqm=", setup_rqm);

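/*
 * Charge @tmp (a runtime delta) to @cgrp's priority bucket and to the
 * CPU-wide total on the local CPU. Expected to be called from the CPU time
 * accounting path with preemption disabled (__this_cpu_add() requires it).
 */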
void account_rqm_cgroup_cpu_stat(struct cgroup *cgrp, u64 tmp)
{
	u16 prio;

	if (!sysctl_rqm)
		return;

	prio = cgrp->priority;
	__this_cpu_add(rqm_cgroup_cpu_stat.cgroup_prio_exec_runtime[prio], tmp);
	__this_cpu_add(rqm_cgroup_cpu_stat.cgroup_exec_runtime, tmp);
}

/* sched_rqm_dequeued() is called when a task is dequeued or is about to run */
void sched_rqm_dequeued(struct rq *rq, struct task_struct *t)
{
	u16 prio;
	long delta, origin_delta;
	struct cgroup *cgrp;
	struct cgroup_rstat_cpu *rstatc;
	struct rqm_cgroup_cpu_stat *cpu_stat;

	if (!sysctl_rqm)
		return;

	cgrp = task_dfl_cgroup(t);
	if (!cgroup_parent(cgrp))
		return;

	if (!t->sched_rqm.last_queued)
		return;

	rstatc = per_cpu_ptr(cgrp->rstat_cpu, cpu_of(rq));
	origin_delta = delta = rq_clock(rq) - t->sched_rqm.last_queued;
	/*
	 * Delta is the time this task spent stalled behind tasks of other
	 * cgroups. Other tasks of the same cgroup may have run on this CPU
	 * while this task was stalled, so subtract that runtime to get the
	 * stall time caused only by other cgroups' tasks.
	 */
	delta -= (rstatc->bstat.cputime.sum_exec_runtime -
		  t->sched_rqm.last_cgroup_exec_runtime);

	t->sched_rqm.last_queued = 0;
	if (delta <= 0)
		return;

	cpu_stat = per_cpu_ptr(&rqm_cgroup_cpu_stat, cpu_of(rq));
	rstatc->rqm_stat.cgroup_stalled_time += delta;
	cpu_stat->cgroup_stalled_time += delta;

	prio = cgrp->priority;
	delta = cpu_stat->cgroup_prio_exec_runtime[prio] -
		t->sched_rqm.last_cgroup_prio_exec_runtime;
	delta = origin_delta - delta;
	if (delta <= 0)
		return;

	rstatc->rqm_stat.cgroup_prio_stalled_time += delta;
	cpu_stat->cgroup_prio_stalled_time += delta;
}

/* Wrapper around sched_rqm_dequeued() */
static inline void sched_rqm_arrive(struct rq *rq, struct task_struct *t)
{
	sched_rqm_dequeued(rq, t);
}

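/*
 * Record where the task starts waiting: snapshot the cgroup's exec runtime
 * and this CPU's per-priority runtime, and stamp last_queued with the rq
 * clock. sched_rqm_dequeued() consumes these snapshots to derive the stall
 * caused by other cgroups and by other priorities.
 */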
void sched_rqm_enqueued(struct rq *rq, struct task_struct *t)
{
	u16 prio;
	struct cgroup *cgrp;
	struct cgroup_rstat_cpu *rstatc;
	struct rqm_cgroup_cpu_stat *cpu_stat;

	cgrp = task_dfl_cgroup(t);
	if (!cgroup_parent(cgrp))
		return;

	prio = cgrp->priority;
	rstatc = per_cpu_ptr(cgrp->rstat_cpu, cpu_of(rq));
	t->sched_rqm.last_cgroup_exec_runtime = rstatc->bstat.cputime.sum_exec_runtime;
	cpu_stat = per_cpu_ptr(&rqm_cgroup_cpu_stat, cpu_of(rq));
	t->sched_rqm.last_cgroup_prio_exec_runtime =
		cpu_stat->cgroup_prio_exec_runtime[prio];
	t->sched_rqm.last_queued = rq_clock(rq);
}

/* Wrapper around sched_rqm_enqueued() */
static inline void sched_rqm_depart(struct rq *rq, struct task_struct *prev)
{
	if (!sysctl_rqm)
		return;

	/* Restart stall accounting for the departing task */
	sched_rqm_enqueued(rq, prev);
}

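/*
 * Context-switch hook: close the stall window of @next (it is about to run)
 * and open a new one for @prev if it is still runnable. The idle task is
 * never accounted.
 */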
void sched_rqm_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
	if (!sysctl_rqm)
		return;

	if (prev != rq->idle && prev->state == TASK_RUNNING)
		sched_rqm_depart(rq, prev);

	if (next != rq->idle)
		sched_rqm_arrive(rq, next);
}

#define MAX_LATENCY 100000UL

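/*
 * Fold the latest memory allocation latency into the RQM_MEM score: the
 * average stall per (single-page equivalent) allocation is mapped to a
 * 0-100 score, 100 meaning no allocation latency, and smoothed with the
 * same 10s decaying average as the CPU scores.
 */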
void calc_system_rqm_mem_info(struct rqm_group_mem *rqm_mem_stat)
{
	unsigned long long alloc_ticks, alloc_stall, time_avg, alloc_avg;

	alloc_ticks = rqm_mem_stat->cgroup_mem_ticks - rqm_system.last_mem_stat.cgroup_mem_ticks;
	alloc_stall = rqm_mem_stat->cgroup_mem_stall - rqm_system.last_mem_stat.cgroup_mem_stall;

	/* Some memory allocation has occurred */
	if (alloc_ticks > 0)
		alloc_avg = alloc_stall / alloc_ticks;
	else
		alloc_avg = 0;

	/* Clamp to a minimum score when alloc_avg >= MAX_LATENCY */
	if (alloc_avg >= MAX_LATENCY)
		time_avg = 1;
	else
		time_avg = (MAX_LATENCY - alloc_avg) * 100 / MAX_LATENCY;

	time_avg = time_avg * FIXED_1;
	rqm_system.score[RQM_MEM] = calc_load(rqm_system.score[RQM_MEM], EXP_10s, time_avg);
	rqm_system.last_mem_stat.cgroup_mem_ticks = rqm_mem_stat->cgroup_mem_ticks;
	rqm_system.last_mem_stat.cgroup_mem_stall = rqm_mem_stat->cgroup_mem_stall;
}

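/*
 * Accumulate one CPU's per-priority, per-order allocation counters into
 * @temp_group_mem. The tick count of an order-N allocation is weighted by
 * 2^N so it counts as that many single-page allocations.
 */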
void collect_all_mem_info(struct rqm_cgroup_mem_stat *mem_stat,
			  struct rqm_group_mem *temp_group_mem)
{
	int order, priority;

	for (priority = 0; priority < CGROUP_PRIORITY_MAX; priority++) {
		for (order = 0; order < MAX_ORDER; order++) {
			/*
			 * Allocating multiple pages may consume more time than
			 * allocating one page, so regard an order-N allocation
			 * as 2^N single-page allocations.
			 *
			 * TODO: this logic may be optimized later.
			 */
			temp_group_mem->cgroup_mem_ticks +=
				mem_stat->cgroup_mem_stat[priority][order].cgroup_mem_ticks
				* (1 << order);
			temp_group_mem->cgroup_mem_stall +=
				mem_stat->cgroup_mem_stat[priority][order].cgroup_mem_stall;
		}
	}
}

/* Calculate the system's rqm information periodically. */
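/*
 * Every RQM_FREQ the CPU scores are computed as
 *   score = delta_exec_runtime / (delta_exec_runtime + delta_stall) * 100
 * in FIXED_1 fixed point and folded into a 10s decaying average via
 * calc_load(). RQM_CPU uses the stall caused by other cgroups, while
 * RQM_PRIO_CPU only counts the stall caused by cgroups of other priorities.
 */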
static void system_rqm_avgs_work(struct work_struct *work)
{
	int cpu;
	unsigned long long stalled_time = 0, prio_stalled_time = 0, exec_runtime = 0;
	long long delta_stalled_time, delta_prio_stalled_time, delta_exec_runtime;
	unsigned long score;
	struct rqm_cgroup_cpu_stat *cpu_stat;
	struct rqm_cgroup_mem_stat *mem_stat;
	struct rqm_group_mem rqm_mem_stat = {
		.cgroup_mem_ticks = 0,
		.cgroup_mem_stall = 0
	};

	if (!sysctl_rqm)
		goto out;

	for_each_online_cpu(cpu) {
		cpu_stat = per_cpu_ptr(&rqm_cgroup_cpu_stat, cpu);
		mem_stat = per_cpu_ptr(&rqm_cgroup_mem_stat, cpu);
		exec_runtime += cpu_stat->cgroup_exec_runtime;
		prio_stalled_time += cpu_stat->cgroup_prio_stalled_time;
		stalled_time += cpu_stat->cgroup_stalled_time;

		collect_all_mem_info(mem_stat, &rqm_mem_stat);
	}

	delta_exec_runtime = exec_runtime - rqm_system.last_exec_runtime;
	delta_prio_stalled_time = prio_stalled_time - rqm_system.last_prio_stalled_time;
	delta_stalled_time = stalled_time - rqm_system.last_stalled_time;
	if (delta_exec_runtime > 0) {
		long long delta_denom;

		delta_denom = max((delta_exec_runtime + delta_prio_stalled_time), (long long)1);
		score = (delta_exec_runtime * 100 * FIXED_1) / delta_denom;
		rqm_system.score[RQM_PRIO_CPU] = calc_load(rqm_system.score[RQM_PRIO_CPU],
							   EXP_10s, score);

		delta_denom = max((delta_exec_runtime + delta_stalled_time), (long long)1);
		score = (delta_exec_runtime * 100 * FIXED_1) / delta_denom;
		rqm_system.score[RQM_CPU] = calc_load(rqm_system.score[RQM_CPU],
						      EXP_10s, score);

		rqm_system.last_exec_runtime = exec_runtime;
		rqm_system.last_prio_stalled_time = prio_stalled_time;
		rqm_system.last_stalled_time = stalled_time;
	} else {
		rqm_system.score[RQM_PRIO_CPU] = calc_load(rqm_system.score[RQM_PRIO_CPU],
							   EXP_10s, 100 * FIXED_1);
		rqm_system.score[RQM_CPU] = calc_load(rqm_system.score[RQM_CPU],
						      EXP_10s, 100 * FIXED_1);
	}

	calc_system_rqm_mem_info(&rqm_mem_stat);

out:
	schedule_delayed_work(&system_rqm_work, RQM_FREQ);
}

/* Account memory allocation info for rqm. Called in __alloc_pages_nodemask */
void account_rqm_cgroup_mem_stat(struct mem_cgroup *memcg, int order, int latency)
{
	struct cgroup *cgrp;
	struct cgroup_rstat_cpu *rstat;
	u16 priority;

	cgrp = memcg->css.cgroup;
	if (!cgroup_parent(cgrp))
		return;

	/* Update per-cgroup memory allocation info */
	rstat = this_cpu_ptr(cgrp->rstat_cpu);
	rstat->rqm_mem_stat[order].cgroup_mem_ticks++;
	rstat->rqm_mem_stat[order].cgroup_mem_stall += latency;

	/* Update system-wide memory allocation info */
	priority = cgrp->priority;
	__this_cpu_add(rqm_cgroup_mem_stat.cgroup_mem_stat[priority][order].cgroup_mem_ticks, 1);
	__this_cpu_add(rqm_cgroup_mem_stat.cgroup_mem_stat[priority][order].cgroup_mem_stall, latency);
}

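/* Reset all of @group's QoS scores to the best value (100.00). */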
void rqm_group_init(struct rqm_group *group)
{
	int i;

	if (!rqm_enable)
		return;

	for (i = 0; i < NR_RQM_TYPES; i++)
		group->score[i] = 100 * FIXED_1;
}

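/* Boot-time initialization of the system-wide rqm group. */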
void __init rqm_init(void)
{
	rqm_group_init(&rqm_system);
}

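/*
 * Print @rqm's score for @type as a fixed-point "INT.FRAC" value,
 * e.g. "100.00".
 */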
int rqm_show(struct seq_file *m, struct rqm_group *rqm, enum rqm_type type)
{
	unsigned long score;

	score = rqm->score[type];
	seq_printf(m, "%lu.%02lu\n", LOAD_INT(score), LOAD_FRAC(score));

	return 0;
}

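/* Show callbacks for the /proc/rue/quality/<resource> files */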
static int rqm_io_show(struct seq_file *m, void *v)
{
	return rqm_show(m, &rqm_system, RQM_IO);
}

static int rqm_mem_show(struct seq_file *m, void *v)
{
	unsigned long score;

	score = rqm_system.score[RQM_MEM];
	seq_printf(m, "%lu.%02lu\n", LOAD_INT(score), LOAD_FRAC(score));

	return 0;
}

static int rqm_cpu_show(struct seq_file *m, void *v)
{
	unsigned long score, prio_score;

	score = rqm_system.score[RQM_CPU];
	prio_score = rqm_system.score[RQM_PRIO_CPU];
	seq_printf(m, "%lu.%02lu %lu.%02lu\n", LOAD_INT(score), LOAD_FRAC(score),
		   LOAD_INT(prio_score), LOAD_FRAC(prio_score));

	return 0;
}

static int rqm_net_show(struct seq_file *m, void *v)
{
	/* TODO: network QoS is not evaluated yet; report a constant full score */
	seq_printf(m, "%lu.%02lu\n",
		   LOAD_INT((unsigned long)(100 * FIXED_1)),
		   LOAD_FRAC((unsigned long)(100 * FIXED_1)));
	return 0;
}

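/* single_open() wrappers for the per-resource proc files */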
static int rqm_io_open(struct inode *inode, struct file *file)
{
	return single_open(file, rqm_io_show, NULL);
}

static int rqm_mem_open(struct inode *inode, struct file *file)
{
	return single_open(file, rqm_mem_show, NULL);
}

static int rqm_cpu_open(struct inode *inode, struct file *file)
{
	return single_open(file, rqm_cpu_show, NULL);
}

static int rqm_net_open(struct inode *inode, struct file *file)
{
	return single_open(file, rqm_net_show, NULL);
}

static int rqm_fop_release(struct inode *inode, struct file *file)
{
	return single_release(inode, file);
}

static const struct file_operations rqm_io_fops = {
	.open = rqm_io_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = rqm_fop_release,
};

static const struct file_operations rqm_memory_fops = {
	.open = rqm_mem_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = rqm_fop_release,
};

static const struct file_operations rqm_cpu_fops = {
	.open = rqm_cpu_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = rqm_fop_release,
};

static const struct file_operations rqm_net_fops = {
	.open = rqm_net_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = rqm_fop_release,
};

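/*
 * Create /proc/rue/quality/{io,memory,cpu,net} and kick off the periodic
 * worker that refreshes the system-wide scores roughly every 2 seconds
 * (RQM_FREQ).
 */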
static int __init rqm_proc_init(void)
{
	proc_mkdir("rue", NULL);
	proc_mkdir("rue/quality", NULL);
	proc_create("rue/quality/io", 0, NULL, &rqm_io_fops);
	proc_create("rue/quality/memory", 0, NULL, &rqm_memory_fops);
	proc_create("rue/quality/cpu", 0, NULL, &rqm_cpu_fops);
	proc_create("rue/quality/net", 0, NULL, &rqm_net_fops);

	INIT_DELAYED_WORK(&system_rqm_work, system_rqm_avgs_work);
	schedule_delayed_work(&system_rqm_work, RQM_FREQ);

	return 0;
}
module_init(rqm_proc_init);