OpenCloudOS-Kernel/kernel/cgroup/sli.c

#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/cgroup.h>
#include <linux/module.h>
#include <linux/psi.h>
#include <linux/rqm.h>
#include <linux/memcontrol.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/sysctl.h>
#include <linux/stacktrace.h>
#include <asm/irq_regs.h>
#include "../sched/sched.h"
#include <linux/sli.h>
#include <linux/rculist.h>
#define MAX_STACK_TRACE_DEPTH 64
static DEFINE_STATIC_KEY_FALSE(sli_enabled);
static DEFINE_STATIC_KEY_FALSE(sli_monitor_enabled);
static struct sli_event_monitor default_sli_event_monitor;
static struct workqueue_struct *sli_workqueue;
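/*
 * Parsed form of a single write to the cgroup sli control interface: which
 * event (if any) is addressed, its threshold and count, and the optional
 * period / mbuf_enable settings.
 */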
struct sli_event_control {
int event_type;
int event_id;
int period;
int mbuf_enable;
unsigned long long count;
unsigned long long threshold;
KABI_RESERVE(1);
KABI_RESERVE(2);
};
static const char *schedlat_theshold_name[] = {
"schedlat_wait_threshold=",
"schedlat_block_threshold=",
"schedlat_ioblock_threshold=",
"schedlat_sleep_threshold=",
"schedlat_longsys_threshold=",
"schedlat_rundelay_threshold=",
"schedlat_irqtime_threshold="
};
static const char *memlat_threshold_name[] = {
"memlat_global_direct_reclaim_threshold=",
"memlat_memcg_direct_reclaim_threshold=",
"memlat_direct_compact_threshold=",
"memlat_global_direct_swapout_threshold=",
"memlat_memcg_direct_swapout_threshold=",
"memlat_direct_swapin_threshold="
};
static const char *longterm_threshold_name[] = {
"longterm_rundelay_threshold=",
"longterm_irqtime_threshold="
};
static const char *sanity_check_abbr[] = {
"schedlat_",
"memlat_",
"longterm_",
"period=",
"mbuf_enable="
};
static void sli_proactive_monitor_work(struct work_struct *work);
static unsigned long sli_get_longterm_statistics(struct cgroup *cgrp,
enum sli_longterm_event event_id);
/*
 * Convert ULLONG_MAX to zero when showing values to userspace, and convert
 * zero back to ULLONG_MAX when a value is written to the control interface.
 */
static inline u64 sli_convert_value(u64 value, bool control_show)
{
if (control_show && value == ULLONG_MAX)
return 0;
if (!control_show && value == 0)
value = ULLONG_MAX;
return value;
}
static void sli_event_monitor_init(struct sli_event_monitor *event_monitor, struct cgroup *cgrp)
{
INIT_LIST_HEAD_RCU(&event_monitor->event_head);
INIT_WORK(&event_monitor->sli_event_work, sli_proactive_monitor_work);
memset(&event_monitor->schedlat_threshold, 0xff, sizeof(event_monitor->schedlat_threshold));
memset(&event_monitor->schedlat_count, 0xff, sizeof(event_monitor->schedlat_count));
memset(&event_monitor->memlat_threshold, 0xff, sizeof(event_monitor->memlat_threshold));
memset(&event_monitor->memlat_count, 0xff, sizeof(event_monitor->memlat_count));
memset(&event_monitor->longterm_threshold, 0xff, sizeof(event_monitor->longterm_threshold));
event_monitor->last_update = jiffies;
event_monitor->cgrp = cgrp;
}
/* Inherit monitoring events from the parent cgroup or the global sli_event_monitor */
static int sli_event_inherit(struct cgroup *cgrp)
{
struct sli_event *event, *event_tmp;
struct sli_event_monitor *event_monitor;
struct sli_event_monitor *cgrp_event_monitor = cgrp->cgrp_event_monitor;
if (cgroup_parent(cgrp)->cgrp_event_monitor)
event_monitor = cgroup_parent(cgrp)->cgrp_event_monitor;
else
event_monitor = &default_sli_event_monitor;
rcu_read_lock();
list_for_each_entry_rcu(event, &event_monitor->event_head, event_node) {
struct sli_event *new_event;
new_event = kmalloc(sizeof(struct sli_event), GFP_ATOMIC);
if (!new_event)
goto failed;
/*
 * The event_type and event_id must not be observed before the sli_event
 * has been added to the list. The write order can be guaranteed with
 * smp_wmb(), and readers see the same order on mainstream architectures
 * (such as x86 and arm). But some architectures (such as DEC Alpha) may
 * break the data dependency, so READ_ONCE() is used here to preserve the
 * data dependency even on DEC Alpha.
 */
new_event->event_type = READ_ONCE(event->event_type);
new_event->event_id = READ_ONCE(event->event_id);
switch (new_event->event_type) {
case SLI_SCHED_EVENT:
cgrp_event_monitor->schedlat_threshold[new_event->event_id] =
READ_ONCE(event_monitor->schedlat_threshold[new_event->event_id]);
cgrp_event_monitor->schedlat_count[new_event->event_id] =
READ_ONCE(event_monitor->schedlat_count[new_event->event_id]);
break;
case SLI_MEM_EVENT:
cgrp_event_monitor->memlat_threshold[new_event->event_id] =
READ_ONCE(event_monitor->memlat_threshold[new_event->event_id]);
cgrp_event_monitor->memlat_count[new_event->event_id] =
READ_ONCE(event_monitor->memlat_count[new_event->event_id]);
break;
case SLI_LONGTERM_EVENT:
cgrp_event_monitor->longterm_threshold[new_event->event_id] =
READ_ONCE(event_monitor->longterm_threshold[new_event->event_id]);
atomic_long_set(
&cgrp_event_monitor->longterm_statistics[new_event->event_id],
sli_get_longterm_statistics(cgrp, new_event->event_id));
break;
default:
printk(KERN_ERR "%s: invalid sli_event type!\n", __func__);
goto failed;
}
list_add(&new_event->event_node, &cgrp_event_monitor->event_head);
}
rcu_read_unlock();
cgrp_event_monitor->period = READ_ONCE(event_monitor->period);
cgrp_event_monitor->mbuf_enable = READ_ONCE(event_monitor->mbuf_enable);
return 0;
failed:
rcu_read_unlock();
/* Free memory from the event list */
list_for_each_entry_safe(event, event_tmp,
&cgrp_event_monitor->event_head, event_node) {
list_del(&event->event_node);
kfree(event);
}
return -1;
}
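/*
 * Capture the task's kernel stack and record it, together with the trigger
 * reason and the measured duration, into the cgroup's mbuf.
 */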
static void store_task_stack(struct task_struct *task, char *reason,
u64 duration, unsigned int skipnr)
{
unsigned long *entries;
unsigned nr_entries = 0;
unsigned long flags;
int i;
struct cgroup *cgrp;
entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
GFP_ATOMIC);
if (!entries)
return;
nr_entries = stack_trace_save_tsk(task, entries, MAX_STACK_TRACE_DEPTH, skipnr);
cgrp = get_cgroup_from_task(task);
spin_lock_irqsave(&cgrp->cgrp_mbuf_lock, flags);
mbuf_print(cgrp, "record reason:%s comm:%s pid:%d duration=%lld\n",
reason, task->comm, task->pid, duration);
for (i = 0; i < nr_entries; i++)
mbuf_print(cgrp, "[<0>] %pB\n", (void *)entries[i]);
spin_unlock_irqrestore(&cgrp->cgrp_mbuf_lock, flags);
kfree(entries);
return;
}
static char *get_memlat_name(enum sli_memlat_stat_item sidx)
{
char *name = NULL;
switch (sidx) {
case MEM_LAT_GLOBAL_DIRECT_RECLAIM:
name = "memlat_global_direct_reclaim";
break;
case MEM_LAT_MEMCG_DIRECT_RECLAIM:
name = "memlat_memcg_direct_reclaim";
break;
case MEM_LAT_DIRECT_COMPACT:
name = "memlat_direct_compact";
break;
case MEM_LAT_GLOBAL_DIRECT_SWAPOUT:
name = "memlat_global_direct_swapout";
break;
case MEM_LAT_MEMCG_DIRECT_SWAPOUT:
name = "memlat_memcg_direct_swapout";
break;
case MEM_LAT_DIRECT_SWAPIN:
name = "memlat_direct_swapin";
break;
default:
break;
}
return name;
}
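/*
 * Map a latency to its histogram bucket. The right shift by 20 roughly
 * converts a nanosecond duration to milliseconds (2^20 ns ~= 1.05 ms), so
 * the buckets match the 0-1ms, 1-4ms, ... ranges printed by the
 * *_stat_show() helpers.
 */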
static enum sli_lat_count get_lat_count_idx(u64 duration)
{
enum sli_lat_count idx;
duration = duration >> 20;
if (duration < 1)
idx = LAT_0_1;
else if (duration < 4)
idx = LAT_1_4;
else if (duration < 8)
idx = LAT_4_8;
else if (duration < 16)
idx = LAT_8_16;
else if (duration < 32)
idx = LAT_16_32;
else if (duration < 64)
idx = LAT_32_64;
else if (duration < 128)
idx = LAT_64_128;
else
idx = LAT_128_INF;
return idx;
}
static char *get_schedlat_name(enum sli_schedlat_stat_item sidx)
{
char *name = NULL;
switch (sidx) {
case SCHEDLAT_WAIT:
name = "schedlat_wait";
break;
case SCHEDLAT_BLOCK:
name = "schedlat_block";
break;
case SCHEDLAT_IOBLOCK:
name = "schedlat_ioblock";
break;
case SCHEDLAT_SLEEP:
name = "schedlat_sleep";
break;
case SCHEDLAT_RUNDELAY:
name = "schedlat_rundelay";
break;
case SCHEDLAT_LONGSYS:
name = "schedlat_longsys";
break;
case SCHEDLAT_IRQTIME:
name = "schedlat_irqtime";
break;
default:
break;
}
return name;
}
static char *get_longterm_name(enum sli_longterm_event sidx)
{
char *name = NULL;
switch (sidx) {
case SLI_LONGTERM_RUNDELAY:
name = "longterm_rundelay";
break;
case SLI_LONGTERM_IRQTIME:
name = "longterm_irqtime";
break;
default:
break;
}
return name;
}
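/* Sum one latency histogram bucket of a memlat item across all possible CPUs. */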
static u64 sli_memlat_stat_gather(struct cgroup *cgrp,
enum sli_memlat_stat_item sidx,
enum sli_lat_count cidx)
{
u64 sum = 0;
int cpu;
for_each_possible_cpu(cpu)
sum += per_cpu_ptr(cgrp->sli_memlat_stat_percpu, cpu)->item[sidx][cidx];
return sum;
}
int sli_memlat_stat_show(struct seq_file *m, struct cgroup *cgrp)
{
enum sli_memlat_stat_item sidx;
if (!static_branch_likely(&sli_enabled)) {
seq_printf(m, "sli is not enabled, please echo 1 > /proc/sli/sli_enabled\n");
return 0;
}
if (!cgrp->sli_memlat_stat_percpu)
return 0;
for (sidx = MEM_LAT_GLOBAL_DIRECT_RECLAIM; sidx < MEM_LAT_STAT_NR; sidx++) {
seq_printf(m, "%s:\n", get_memlat_name(sidx));
seq_printf(m, "0-1ms: %llu\n", sli_memlat_stat_gather(cgrp, sidx, LAT_0_1));
seq_printf(m, "1-4ms: %llu\n", sli_memlat_stat_gather(cgrp, sidx, LAT_1_4));
seq_printf(m, "4-8ms: %llu\n", sli_memlat_stat_gather(cgrp, sidx, LAT_4_8));
seq_printf(m, "8-16ms: %llu\n", sli_memlat_stat_gather(cgrp, sidx, LAT_8_16));
seq_printf(m, "16-32ms: %llu\n", sli_memlat_stat_gather(cgrp, sidx, LAT_16_32));
seq_printf(m, "32-64ms: %llu\n", sli_memlat_stat_gather(cgrp, sidx, LAT_32_64));
seq_printf(m, "64-128ms: %llu\n", sli_memlat_stat_gather(cgrp, sidx, LAT_64_128));
seq_printf(m, ">=128ms: %llu\n", sli_memlat_stat_gather(cgrp, sidx, LAT_128_INF));
}
return 0;
}
int sli_memlat_max_show(struct seq_file *m, struct cgroup *cgrp)
{
enum sli_memlat_stat_item sidx;
if (!static_branch_likely(&sli_enabled)) {
seq_printf(m, "sli is not enabled, please echo 1 > /proc/sli/sli_enabled\n");
return 0;
}
if (!cgrp->sli_memlat_stat_percpu)
return 0;
for (sidx = MEM_LAT_GLOBAL_DIRECT_RECLAIM; sidx < MEM_LAT_STAT_NR; sidx++) {
int cpu;
unsigned long latency_sum = 0;
for_each_possible_cpu(cpu)
latency_sum += per_cpu_ptr(cgrp->sli_memlat_stat_percpu, cpu)->latency_max[sidx];
seq_printf(m, "%s: %lu\n", get_memlat_name(sidx), latency_sum);
}
return 0;
}
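/*
 * sli_memlat_stat_start()/sli_memlat_stat_end() bracket a memory-latency
 * event: start records a local_clock() timestamp (or 0 when sli is
 * disabled) and end charges the elapsed time to the current task's cgroup.
 */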
void sli_memlat_stat_start(u64 *start)
{
if (!static_branch_likely(&sli_enabled))
*start = 0;
else
*start = local_clock();
}
void sli_memlat_stat_end(enum sli_memlat_stat_item sidx, u64 start)
{
struct mem_cgroup *memcg;
struct cgroup *cgrp;
if (!static_branch_likely(&sli_enabled) || start == 0)
return;
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
if (!memcg || memcg == root_mem_cgroup)
goto out;
cgrp = memcg->css.cgroup;
if (cgrp && cgroup_parent(cgrp)) {
enum sli_lat_count cidx;
u64 duration;
duration = local_clock() - start;
cidx = get_lat_count_idx(duration);
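/*
 * The shift by 10 scales the nanosecond duration to roughly microseconds,
 * which is also the unit used by the threshold comparison and the
 * latency_max sum below.
 */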
duration = duration >> 10;
this_cpu_inc(cgrp->sli_memlat_stat_percpu->item[sidx][cidx]);
this_cpu_add(cgrp->sli_memlat_stat_percpu->latency_max[sidx], duration);
if (static_branch_unlikely(&sli_monitor_enabled)) {
struct sli_event_monitor *event_monitor = cgrp->cgrp_event_monitor;
if (duration < READ_ONCE(event_monitor->memlat_threshold[sidx]))
goto out;
atomic_long_inc(&event_monitor->memlat_statistics[sidx]);
if (event_monitor->mbuf_enable) {
char *lat_name;
lat_name = get_memlat_name(sidx);
store_task_stack(current, lat_name, duration, 0);
}
}
}
out:
rcu_read_unlock();
}
void sli_schedlat_stat(struct task_struct *task, enum sli_schedlat_stat_item sidx, u64 delta)
{
struct cgroup *cgrp = NULL;
if (!static_branch_likely(&sli_enabled) || !task)
return;
rcu_read_lock();
cgrp = get_cgroup_from_task(task);
if (cgrp && cgroup_parent(cgrp)) {
enum sli_lat_count cidx = get_lat_count_idx(delta);
delta = delta >> 10;
this_cpu_inc(cgrp->sli_schedlat_stat_percpu->item[sidx][cidx]);
this_cpu_add(cgrp->sli_schedlat_stat_percpu->latency_max[sidx], delta);
if (static_branch_unlikely(&sli_monitor_enabled)) {
struct sli_event_monitor *event_monitor = cgrp->cgrp_event_monitor;
if (delta < READ_ONCE(event_monitor->schedlat_threshold[sidx]))
goto out;
atomic_long_inc(&event_monitor->schedlat_statistics[sidx]);
if (event_monitor->mbuf_enable) {
char *lat_name;
lat_name = get_schedlat_name(sidx);
store_task_stack(task, lat_name, delta, 0);
}
}
}
out:
rcu_read_unlock();
}
void sli_schedlat_rundelay(struct task_struct *task, struct task_struct *prev, u64 delta)
{
enum sli_schedlat_stat_item sidx = SCHEDLAT_RUNDELAY;
struct cgroup *cgrp = NULL;
if (!static_branch_likely(&sli_enabled) || !task || !prev)
return;
rcu_read_lock();
cgrp = get_cgroup_from_task(task);
if (cgrp && cgroup_parent(cgrp)) {
enum sli_lat_count cidx = get_lat_count_idx(delta);
delta = delta >> 10;
this_cpu_inc(cgrp->sli_schedlat_stat_percpu->item[sidx][cidx]);
this_cpu_add(cgrp->sli_schedlat_stat_percpu->latency_max[sidx], delta);
if (static_branch_unlikely(&sli_monitor_enabled)) {
struct sli_event_monitor *event_monitor = cgrp->cgrp_event_monitor;
if (delta < READ_ONCE(event_monitor->schedlat_threshold[sidx]))
goto out;
atomic_long_inc(&event_monitor->schedlat_statistics[sidx]);
if (event_monitor->mbuf_enable) {
int i;
unsigned long *entries;
unsigned int nr_entries = 0;
unsigned long flags;
entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
GFP_ATOMIC);
if (!entries)
goto out;
nr_entries = stack_trace_save_tsk(prev, entries,
MAX_STACK_TRACE_DEPTH, 0);
spin_lock_irqsave(&cgrp->cgrp_mbuf_lock, flags);
mbuf_print(cgrp, "record reason:schedlat_rundelay next_comm:%s "
"next_pid:%d prev_comm:%s prev_pid:%d duration=%lld\n",
task->comm, task->pid, prev->comm, prev->pid, delta);
for (i = 0; i < nr_entries; i++)
mbuf_print(cgrp, "[<0>] %pB\n", (void *)entries[i]);
spin_unlock_irqrestore(&cgrp->cgrp_mbuf_lock, flags);
kfree(entries);
}
}
}
out:
rcu_read_unlock();
}
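/*
 * Detect tasks that keep running in the kernel without being scheduled out:
 * if the task's context-switch count and user time are unchanged since
 * kernel_exec_start was recorded, the elapsed rq clock time is accounted as
 * SCHEDLAT_LONGSYS latency.
 */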
void sli_check_longsys(struct task_struct *tsk)
{
long delta;
if (!static_branch_likely(&sli_enabled))
return;
if (tsk->sched_class != &fair_sched_class)
return;
/* Longsys accounting is performed only when TIF_NEED_RESCHED is set */
if (!test_tsk_need_resched(tsk))
return;
/* Kernel threads do not belong to any cgroup */
if (tsk->flags & PF_KTHREAD)
return;
if (!tsk->sched_info.kernel_exec_start ||
tsk->sched_info.task_switch != (tsk->nvcsw + tsk->nivcsw) ||
tsk->utime != tsk->sched_info.utime) {
tsk->sched_info.utime = tsk->utime;
tsk->sched_info.kernel_exec_start = rq_clock(task_rq(tsk));
tsk->sched_info.task_switch = tsk->nvcsw + tsk->nivcsw;
return;
}
delta = rq_clock(task_rq(tsk)) - tsk->sched_info.kernel_exec_start;
sli_schedlat_stat(tsk, SCHEDLAT_LONGSYS, delta);
}
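/*
 * Workqueue handler for proactive monitoring: for each registered event,
 * read and reset the statistics accumulated over the last period, compare
 * them against the configured count/threshold, and signal pollers of the
 * cgroup's sli monitor file about the events that fired.
 */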
static void sli_proactive_monitor_work(struct work_struct *work)
{
struct sli_event *event;
struct sli_notify_event *notify_event;
struct sli_event_monitor *event_monitor = container_of(work, struct sli_event_monitor,
sli_event_work);
notify_event = kzalloc(sizeof(struct sli_notify_event), GFP_KERNEL);
if (!notify_event)
return;
rcu_read_lock();
list_for_each_entry_rcu(event, &event_monitor->event_head, event_node) {
u64 statistics, last_statistics, threshold;
switch (event->event_type) {
case SLI_SCHED_EVENT:
statistics = (u64)atomic_long_read(
&event_monitor->schedlat_statistics[event->event_id]);
atomic_long_set(&event_monitor->schedlat_statistics[event->event_id], 0);
if (event_monitor->overrun) {
event_monitor->overrun = 0;
break;
}
if (statistics >= READ_ONCE(event_monitor->schedlat_count[event->event_id]))
sli_event_add(notify_event, event->event_type,
event->event_id, statistics);
break;
case SLI_MEM_EVENT:
statistics = (u64)atomic_long_read(
&event_monitor->memlat_statistics[event->event_id]);
atomic_long_set(&event_monitor->memlat_statistics[event->event_id], 0);
if (event_monitor->overrun) {
event_monitor->overrun = 0;
break;
}
if (statistics >= READ_ONCE(event_monitor->memlat_count[event->event_id]))
sli_event_add(notify_event, event->event_type,
event->event_id, statistics);
break;
case SLI_LONGTERM_EVENT:
statistics = sli_get_longterm_statistics(event_monitor->cgrp,
event->event_id);
last_statistics = atomic_long_read(
&event_monitor->longterm_statistics[event->event_id]);
atomic_long_set(&event_monitor->longterm_statistics[event->event_id],
statistics);
if (event_monitor->overrun) {
event_monitor->overrun = 0;
break;
}
threshold = READ_ONCE(event_monitor->longterm_threshold[event->event_id]);
/* Deal with time wrapping correctly */
if ((long)(statistics - last_statistics - threshold) >= 0)
sli_event_add(notify_event, event->event_type,
event->event_id, (int)(statistics - last_statistics));
break;
default:
break;
}
}
rcu_read_unlock();
/* Notify userspace that monitoring events have arrived */
sli_monitor_signal(event_monitor->cgrp, notify_event);
kfree(notify_event);
css_put(&event_monitor->cgrp->self);
}
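/*
 * Once the configured monitoring period for the task's cgroup has elapsed,
 * take a css reference and queue the proactive monitor work; the heavy
 * lifting is deliberately kept out of interrupt context.
 */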
void sli_update_tick(struct task_struct *tsk)
{
struct cgroup *cgrp;
if (!static_branch_likely(&sli_monitor_enabled))
return;
rcu_read_lock();
cgrp = get_cgroup_from_task(tsk);
if (cgrp && cgroup_parent(cgrp)) {
bool ret;
int period;
unsigned long long old_value, last_update;
period = cgrp->cgrp_event_monitor->period;
if (!period)
goto unlock;
retry:
last_update = READ_ONCE(cgrp->cgrp_event_monitor->last_update);
if (time_after((unsigned long)(period + last_update), jiffies))
goto unlock;
old_value = cmpxchg(&cgrp->cgrp_event_monitor->last_update,
last_update, jiffies);
if (old_value != last_update)
goto retry;
/*
 * The current jiffies value should be within period to 8 * period of the
 * last update; otherwise the interval is considered overrun and its
 * statistics are abandoned.
 */
if (time_before((unsigned long)((period << 3) + last_update), jiffies))
cgrp->cgrp_event_monitor->overrun = 1;
rcu_read_unlock();
ret = css_tryget(&cgrp->self);
if (!ret)
return;
/*
 * The sli trace work may have a lot of work to do and must send events
 * to polling tasks, so it is not done in interrupt context; instead it
 * is deferred to a workqueue.
 */
ret = queue_work(sli_workqueue, &cgrp->cgrp_event_monitor->sli_event_work);
/*
 * If the work was already queued and has not run yet, there is no need
 * to queue it again, so drop the css refcount taken above.
 */
if (!ret)
css_put(&cgrp->self);
return;
}
unlock:
rcu_read_unlock();
}
static u64 sli_schedlat_stat_gather(struct cgroup *cgrp,
enum sli_schedlat_stat_item sidx,
enum sli_lat_count cidx)
{
u64 sum = 0;
int cpu;
for_each_possible_cpu(cpu)
sum += per_cpu_ptr(cgrp->sli_schedlat_stat_percpu, cpu)->item[sidx][cidx];
return sum;
}
int sli_schedlat_max_show(struct seq_file *m, struct cgroup *cgrp)
{
enum sli_schedlat_stat_item sidx;
if (!static_branch_likely(&sli_enabled)) {
seq_printf(m, "sli is not enabled, please echo 1 > /proc/sli/sli_enabled\n");
return 0;
}
if (!cgrp->sli_schedlat_stat_percpu)
return 0;
for (sidx = SCHEDLAT_WAIT; sidx < SCHEDLAT_STAT_NR; sidx++) {
int cpu;
unsigned long latency_sum = 0;
for_each_possible_cpu(cpu)
latency_sum += per_cpu_ptr(cgrp->sli_schedlat_stat_percpu, cpu)->latency_max[sidx];
seq_printf(m, "%s: %lu\n", get_schedlat_name(sidx), latency_sum);
}
return 0;
}
int sli_schedlat_stat_show(struct seq_file *m, struct cgroup *cgrp)
{
enum sli_schedlat_stat_item sidx;
if (!static_branch_likely(&sli_enabled)) {
seq_printf(m, "sli is not enabled, please echo 1 > /proc/sli/sli_enabled\n");
return 0;
}
if (!cgrp->sli_schedlat_stat_percpu)
return 0;
for (sidx = SCHEDLAT_WAIT; sidx < SCHEDLAT_STAT_NR; sidx++) {
seq_printf(m, "%s:\n", get_schedlat_name(sidx));
seq_printf(m, "0-1ms: %llu\n", sli_schedlat_stat_gather(cgrp, sidx, LAT_0_1));
seq_printf(m, "1-4ms: %llu\n", sli_schedlat_stat_gather(cgrp, sidx, LAT_1_4));
seq_printf(m, "4-8ms: %llu\n", sli_schedlat_stat_gather(cgrp, sidx, LAT_4_8));
seq_printf(m, "8-16ms: %llu\n", sli_schedlat_stat_gather(cgrp, sidx, LAT_8_16));
seq_printf(m, "16-32ms: %llu\n", sli_schedlat_stat_gather(cgrp, sidx, LAT_16_32));
seq_printf(m, "32-64ms: %llu\n", sli_schedlat_stat_gather(cgrp, sidx, LAT_32_64));
seq_printf(m, "64-128ms: %llu\n", sli_schedlat_stat_gather(cgrp, sidx, LAT_64_128));
seq_printf(m, ">=128ms: %llu\n", sli_schedlat_stat_gather(cgrp, sidx, LAT_128_INF));
}
return 0;
}
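/*
 * Longterm statistics are read from the cumulative schedlat latency sums,
 * with event_id offset by SCHEDLAT_RUNDELAY (i.e. the rundelay and irqtime
 * sums).
 */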
static unsigned long sli_get_longterm_statistics(struct cgroup *cgrp,
enum sli_longterm_event event_id)
{
int cpu, index;
unsigned long latency_sum = 0;
index = SCHEDLAT_RUNDELAY + event_id;
for_each_possible_cpu(cpu)
latency_sum += READ_ONCE(per_cpu_ptr(cgrp->sli_schedlat_stat_percpu,
cpu)->latency_max[index]);
return latency_sum;
}
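/*
 * Parse "<threshold>[,| ]count=<count>", i.e. the part of a control write
 * that follows the event name, e.g. the tail of
 * "schedlat_wait_threshold=1000,count=5". A value of 0 is stored as
 * ULLONG_MAX, which sli_event_update() treats as "event disabled".
 */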
static inline int sli_parse_threshold(char *buf, struct sli_event_control *sec)
{
char *str;
int i, len, ret;
u64 value;
/* Replace the delimiter with '\0' */
len = strlen(buf);
for (i = 0; i < len; i++) {
if (buf[i] == ',' || buf[i] == ' ') {
buf[i] = '\0';
break;
}
}
if (i == len)
return -EINVAL;
/* Parse the threshold value */
ret = kstrtou64(buf, 0, &value);
if (ret)
return ret;
sec->threshold = sli_convert_value(value, false);
/* Move the pointer to the position after the delimiter */
buf += (i + 1);
len -= (i + 1);
/* Parse the count value (if it exists) */
str = strnstr(buf, "count=", len);
if (!str)
return -EINVAL;
str += strlen("count=");
ret = kstrtou64(str, 0, &value);
if (ret)
return ret;
sec->count = sli_convert_value(value, false);
return 0;
}
static int sli_parse_parameter(char *buf, int len, struct sli_event_control *sec, int index)
{
int i, min_len, ret = 0;
u64 value;
switch (index) {
case 0:
for (i = 0; i < ARRAY_SIZE(schedlat_theshold_name); i++) {
min_len = min(len, (int)strlen(schedlat_theshold_name[i]));
if (!strncmp(schedlat_theshold_name[i], buf, min_len))
break;
}
if (i == ARRAY_SIZE(schedlat_theshold_name))
return -EINVAL;
buf += min_len;
ret = sli_parse_threshold(buf, sec);
if (ret)
return ret;
sec->event_type = SLI_SCHED_EVENT;
sec->event_id = i;
break;
case 1:
for (i = 0; i < ARRAY_SIZE(memlat_threshold_name); i++) {
min_len = min(len, (int)strlen((const char *)memlat_threshold_name[i]));
if (!strncmp(memlat_threshold_name[i], buf, min_len))
break;
}
if (i == ARRAY_SIZE(memlat_threshold_name))
return -EINVAL;
buf += min_len;
ret = sli_parse_threshold(buf, sec);
if (ret)
return ret;
sec->event_type = SLI_MEM_EVENT;
sec->event_id = i;
break;
case 2:
for (i = 0; i < ARRAY_SIZE(longterm_threshold_name); i++) {
min_len = min(len, (int)strlen((const char *)longterm_threshold_name[i]));
if (!strncmp(longterm_threshold_name[i], buf, min_len))
break;
}
if (i == ARRAY_SIZE(longterm_threshold_name))
return -EINVAL;
buf += min_len;
ret = sli_parse_threshold(buf, sec);
if (ret)
return ret;
sec->event_type = SLI_LONGTERM_EVENT;
sec->event_id = i;
break;
case 3:
buf += strlen("period=");
ret = kstrtou64(buf, 0, &value);
if (ret)
return ret;
sec->period = usecs_to_jiffies(value);
break;
case 4:
buf += strlen("mbuf_enable=");
ret = kstrtou64(buf, 0, &value);
if (ret)
return ret;
sec->mbuf_enable = !!value;
break;
default:
return -EINVAL;
}
return 0;
}
static int sli_sanity_check(char *buf, struct sli_event_control *sec)
{
int i, len, min_len;
buf = strstrip(buf);
if (!buf)
return -EINVAL;
len = strlen(buf);
for (i = 0; i < ARRAY_SIZE(sanity_check_abbr); i++) {
min_len = min(len, (int)strlen(sanity_check_abbr[i]));
if (!strncmp(sanity_check_abbr[i], buf, min_len))
break;
}
/* The input string does not match any entry in the list */
if (i == ARRAY_SIZE(sanity_check_abbr))
return -EINVAL;
return sli_parse_parameter(buf, len, sec, i);
}
static int sli_event_update(struct sli_event_monitor *event_monitor,
struct sli_event_control *sec, u64 last_threshold)
{
struct sli_event *event;
/* Add the sli event */
if (last_threshold == ULLONG_MAX && sec->threshold != ULLONG_MAX) {
event = kmalloc(sizeof(struct sli_event), GFP_KERNEL);
if (!event)
return -ENOMEM;
event->event_type = sec->event_type;
event->event_id = sec->event_id;
/* event_type and event_id must be assigned before the entry is added to the list */
smp_wmb();
list_add_rcu(&event->event_node, &event_monitor->event_head);
} else if (last_threshold != ULLONG_MAX && sec->threshold == ULLONG_MAX) {
list_for_each_entry(event, &event_monitor->event_head, event_node) {
if (event->event_type != sec->event_type)
continue;
if (event->event_id != sec->event_id)
continue;
list_del_rcu(&event->event_node);
kfree_rcu(event, rcu);
break;
}
}
return 0;
}
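/*
 * Write handler for the cgroup sli control file. A single write sets one
 * of: a schedlat_/memlat_/longterm_ threshold together with a "count="
 * value, "period=" (in microseconds, converted to jiffies), or
 * "mbuf_enable=" (0/1). Example writes (the file name this handler is
 * bound to is defined elsewhere):
 *   period=1000000
 *   memlat_direct_swapin_threshold=2000,count=10
 */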
ssize_t cgroup_sli_control_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
int ret;
struct cgroup *cgrp;
struct sli_event_monitor *event_monitor;
struct sli_event_control sec = {.event_type = -1, .period = -1, .mbuf_enable = -1,};
cgrp = of_css(of)->cgroup;
if (cgroup_parent(cgrp))
event_monitor = cgrp->cgrp_event_monitor;
else
event_monitor = &default_sli_event_monitor;
inode_lock(file_inode(of->file));
ret = sli_sanity_check(buf, &sec);
if (ret)
goto out;
if (sec.period != -1) {
if (!!event_monitor->period == !!sec.period) {
WRITE_ONCE(event_monitor->period, sec.period);
goto out;
}
WRITE_ONCE(event_monitor->period, sec.period);
if (cgroup_parent(cgrp) || event_monitor->mbuf_enable)
goto out;
if (sec.period)
static_branch_enable(&sli_monitor_enabled);
else
static_branch_disable(&sli_monitor_enabled);
goto out;
}
if (sec.mbuf_enable != -1) {
if (sec.mbuf_enable == event_monitor->mbuf_enable)
goto out;
WRITE_ONCE(event_monitor->mbuf_enable, sec.mbuf_enable);
if (cgroup_parent(cgrp) || event_monitor->period)
goto out;
if (sec.mbuf_enable)
static_branch_enable(&sli_monitor_enabled);
else
static_branch_disable(&sli_monitor_enabled);
goto out;
}
if (sec.event_type != -1) {
unsigned long long last_threshold;
switch (sec.event_type) {
case SLI_SCHED_EVENT:
last_threshold = event_monitor->schedlat_threshold[sec.event_id];
WRITE_ONCE(event_monitor->schedlat_threshold[sec.event_id], sec.threshold);
WRITE_ONCE(event_monitor->schedlat_count[sec.event_id], sec.count);
smp_wmb();
atomic_long_set(&event_monitor->schedlat_statistics[sec.event_id], 0);
ret = sli_event_update(event_monitor, &sec, last_threshold);
break;
case SLI_MEM_EVENT:
last_threshold = event_monitor->memlat_threshold[sec.event_id];
WRITE_ONCE(event_monitor->memlat_threshold[sec.event_id], sec.threshold);
WRITE_ONCE(event_monitor->memlat_count[sec.event_id], sec.count);
smp_wmb();
atomic_long_set(&event_monitor->memlat_statistics[sec.event_id], 0);
ret = sli_event_update(event_monitor, &sec, last_threshold);
break;
case SLI_LONGTERM_EVENT:
last_threshold = event_monitor->longterm_threshold[sec.event_id];
WRITE_ONCE(event_monitor->longterm_threshold[sec.event_id], sec.threshold);
smp_wmb();
if (cgroup_parent(cgrp))
atomic_long_set(&event_monitor->longterm_statistics[sec.event_id],
sli_get_longterm_statistics(cgrp, sec.event_id));
ret = sli_event_update(event_monitor, &sec, last_threshold);
break;
default:
break;
}
}
out:
if (!ret)
ret = nbytes;
inode_unlock(file_inode(of->file));
return ret;
}
int cgroup_sli_control_show(struct seq_file *sf, void *v)
{
int i;
unsigned long long threshold, count;
struct cgroup *cgrp;
struct sli_event_monitor *event_monitor;
cgrp = seq_css(sf)->cgroup;
if (cgroup_parent(cgrp))
event_monitor = cgrp->cgrp_event_monitor;
else
event_monitor = &default_sli_event_monitor;
seq_printf(sf, "period: %d\n", event_monitor->period);
seq_printf(sf, "mbuf_enable: %d\n", event_monitor->mbuf_enable);
for (i = 0; i < SCHEDLAT_STAT_NR; i++) {
threshold = sli_convert_value(event_monitor->schedlat_threshold[i], true);
count = sli_convert_value(event_monitor->schedlat_count[i], true);
seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_schedlat_name(i),
threshold, count);
}
for (i = 0; i < MEM_LAT_STAT_NR; i++) {
threshold = sli_convert_value(event_monitor->memlat_threshold[i], true);
count = sli_convert_value(event_monitor->memlat_count[i], true);
seq_printf(sf, "%s: threshold: %llu, count: %llu\n", get_memlat_name(i),
threshold, count);
}
for (i = 0; i < SLI_LONGTERM_NR; i++) {
threshold = sli_convert_value(event_monitor->longterm_threshold[i], true);
seq_printf(sf, "%s: threshold: %llu\n", get_longterm_name(i), threshold);
}
return 0;
}
/* sli monitor functions */
struct sli_notify_ctx* sctx_alloc(void)
{
struct sli_notify_ctx *sctx;
sctx = kzalloc(sizeof(struct sli_notify_ctx), GFP_KERNEL);
if (sctx) {
/* Do init work */
init_waitqueue_head(&sctx->wqh);
spin_lock_init(&sctx->notify_lock);
}
return sctx;
}
void sctx_free(struct cgroup *cgrp)
{
if (cgrp->sctx) {
kfree(cgrp->sctx);
cgrp->sctx = NULL;
}
}
static int sli_monitor_exchange(struct sli_notify_event *tnotify_event,
struct sli_notify_event *snotify_event)
{
memcpy(tnotify_event->notify_vector, snotify_event->notify_vector,
sizeof(struct sli_notify_event));
memset(snotify_event->notify_vector, 0, sizeof(struct sli_notify_event));
return 0;
}
int sli_monitor_open(struct kernfs_open_file *of)
{
struct file *filp = of->file;
int ret = 0;
filp->f_mode &= FMODE_READ;
if (!(filp->f_mode & FMODE_READ))
ret = -EINVAL;
return ret;
}
static inline void notify_event_print(struct seq_file *seq,
struct sli_notify_event *notify_event,
u32 event_type, u32 levent_max)
{
int index;
u32 count;
for (index = 0; index < levent_max; index++) {
count = notify_event->notify_vector[event_type][index];
/*
* Only print when event count > 0, print format:
* event_type event_item event_count
*/
if (count > 0)
seq_printf(seq, "%u %u %u\n", event_type, index, count);
}
}
int sli_monitor_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct sli_notify_event notify_event;
unsigned long flags;
int i = 0;
if (cgrp && cgrp->sctx) {
spin_lock_irqsave(&cgrp->sctx->notify_lock, flags);
sli_monitor_exchange(&notify_event, &cgrp->sctx->notify_event);
spin_unlock_irqrestore(&cgrp->sctx->notify_lock, flags);
for (i = 0; i < SLI_EVENT_NR; i++)
notify_event_print(seq, &notify_event, i, SLI_ITEM_MAX);
}
return 0;
}
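/*
 * Return a non-NULL cookie only on the first call (*pos == 0) so that
 * ->show() runs exactly once per read; later calls return NULL to stop
 * the iteration.
 */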
void *sli_monitor_start(struct seq_file *s, loff_t *pos)
{
return NULL + !*pos;
}
/* seq_next function is necessary for seq_read */
void *sli_monitor_next(struct seq_file *s, void *v, loff_t *pos)
{
return NULL;
}
void sli_monitor_stop(struct seq_file *seq, void *v)
{
/* must reset index, so next read can begin from 0 */
if (!seq->count)
seq->index = 0;
}
static inline bool is_notify_active(struct sli_notify_event *ne)
{
int index_e;
int index_i;
/* Any sli event count > 0 will be active */
for (index_e = 0; index_e < SLI_EVENT_NR; index_e++) {
for (index_i = 0; index_i < SLI_ITEM_MAX; index_i++)
if (ne->notify_vector[index_e][index_i] > 0)
return true;
}
return false;
}
__poll_t sli_monitor_poll(struct kernfs_open_file *of,
poll_table *pt)
{
struct cgroup *cgrp = of->kn->parent->priv;
struct file *filp = of->file;
struct sli_notify_ctx *sctx;
__poll_t events = 0;
bool active;
unsigned long flags;
sctx = cgrp->sctx;
if (!sctx) {
pr_err("sli: cannot find sctx for cgroup [ %s ]\n", of->kn->name);
return -EINVAL;
}
poll_wait(filp, &sctx->wqh, pt);
/* Must hold notify_event lock */
spin_lock_irqsave(&cgrp->sctx->notify_lock, flags);
active = is_notify_active(&sctx->notify_event);
spin_unlock_irqrestore(&cgrp->sctx->notify_lock, flags);
if (active)
events |= EPOLLIN;
return events;
}
int sli_event_add(struct sli_notify_event *notify_event,
u32 event_type, u32 levent, u32 count)
{
int res = 0;
if (!notify_event) {
pr_err("sli: target notify_event is NULL\n");
res = -1;
goto end;
}
if (event_type >= SLI_EVENT_NR || levent >= SLI_ITEM_MAX) {
pr_err("sli: invalid sli event type [ %u ] or sli item [ %u ]\n",
event_type, levent);
res = -1;
goto end;
}
notify_event->notify_vector[event_type][levent] = count;
end:
return res;
}
EXPORT_SYMBOL(sli_event_add);
u32 sli_monitor_signal(struct cgroup *cgrp, struct sli_notify_event *notify_event)
{
unsigned long flags;
struct sli_notify_ctx *sctx;
if (!cgrp->sctx) {
pr_err("sli:can not find notify info for cgroup:[ %s ]\n", cgrp->kn->name);
return 0;
}
sctx = cgrp->sctx;
spin_lock_irqsave(&sctx->notify_lock, flags);
sli_monitor_exchange(&cgrp->sctx->notify_event, notify_event);
spin_unlock_irqrestore(&sctx->notify_lock, flags);
if (waitqueue_active(&sctx->wqh))
wake_up_poll(&sctx->wqh, EPOLLIN);
return 0;
}
EXPORT_SYMBOL_GPL(sli_monitor_signal);
static int sli_enabled_show(struct seq_file *m, void *v)
{
seq_printf(m, "%d\n", static_key_enabled(&sli_enabled));
return 0;
}
static int sli_enabled_open(struct inode *inode, struct file *file)
{
return single_open(file, sli_enabled_show, NULL);
}
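/*
 * /proc/sli/sli_enabled accepts a single '0' or '1', e.g.:
 *   echo 1 > /proc/sli/sli_enabled
 */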
static ssize_t sli_enabled_write(struct file *file, const char __user *ubuf,
size_t count, loff_t *ppos)
{
char val = -1;
int ret = count;
if (count < 1 || *ppos) {
ret = -EINVAL;
goto out;
}
if (copy_from_user(&val, ubuf, 1)) {
ret = -EFAULT;
goto out;
}
switch (val) {
case '0':
if (static_key_enabled(&sli_enabled))
static_branch_disable(&sli_enabled);
break;
case '1':
if (!static_key_enabled(&sli_enabled))
static_branch_enable(&sli_enabled);
break;
default:
ret = -EINVAL;
}
out:
return ret;
}
static const struct file_operations sli_enabled_fops = {
.open = sli_enabled_open,
.read = seq_read,
.write = sli_enabled_write,
.llseek = seq_lseek,
.release = single_release,
};
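/*
 * Allocate the per-cgroup sli state: the percpu memlat/schedlat histograms
 * and the event monitor, which then inherits its configuration from the
 * parent cgroup (or from the global default monitor).
 */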
int sli_cgroup_alloc(struct cgroup *cgroup)
{
if (!cgroup_need_sli(cgroup))
return 0;
spin_lock_init(&cgroup->cgrp_mbuf_lock);
cgroup->sli_memlat_stat_percpu = alloc_percpu(struct sli_memlat_stat);
if (!cgroup->sli_memlat_stat_percpu)
goto out;
cgroup->sli_schedlat_stat_percpu = alloc_percpu(struct sli_schedlat_stat);
if (!cgroup->sli_schedlat_stat_percpu)
goto free_memlat_percpu;
cgroup->cgrp_event_monitor = kzalloc(sizeof(struct sli_event_monitor), GFP_KERNEL);
if (!cgroup->cgrp_event_monitor)
goto free_schelat_percpu;
sli_event_monitor_init(cgroup->cgrp_event_monitor, cgroup);
if (sli_event_inherit(cgroup))
goto free_cgrp_event;
return 0;
free_cgrp_event:
kfree(cgroup->cgrp_event_monitor);
free_schelat_percpu:
free_percpu(cgroup->sli_schedlat_stat_percpu);
free_memlat_percpu:
free_percpu(cgroup->sli_memlat_stat_percpu);
out:
return -ENOMEM;
}
void sli_cgroup_free(struct cgroup *cgroup)
{
struct sli_event *event, *event_tmp;
/*
 * The cgroup's subsys pointers are cleared before sli_cgroup_free() is
 * called, so !cgroup->cgrp_event_monitor is used instead of
 * cgroup_need_sli() to check whether the cgroup's memory should be
 * freed here.
 */
if (!cgroup->cgrp_event_monitor)
return;
free_percpu(cgroup->sli_memlat_stat_percpu);
free_percpu(cgroup->sli_schedlat_stat_percpu);
/* Free memory from the event list */
list_for_each_entry_safe(event, event_tmp,
&cgroup->cgrp_event_monitor->event_head, event_node) {
list_del(&event->event_node);
kfree(event);
}
kfree(cgroup->cgrp_event_monitor);
}
static int __init sli_proc_init(void)
{
sli_event_monitor_init(&default_sli_event_monitor, NULL);
sli_workqueue = alloc_workqueue("events_unbound", WQ_UNBOUND, WQ_UNBOUND_MAX_ACTIVE);
if (!sli_workqueue) {
printk(KERN_ERR "Failed to create sli workqueue!\n");
return -1;
}
proc_mkdir("sli", NULL);
proc_create("sli/sli_enabled", 0, NULL, &sli_enabled_fops);
return 0;
}
late_initcall(sli_proc_init);