rue/mm: introduce per-cgroup memory allocation latency histogram tool
A new memory.latency_histogram control file is added under each memory cgroup directory. Reading this file prints a histogram of memory allocation latency at the memory cgroup level.

Signed-off-by: Jingxiang Zeng <linuszeng@tencent.com>
Signed-off-by: Honglin Li <honglinli@tencent.com>
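For reference, a minimal userspace sketch of how the interface is meant to be consumed. The cgroup v1 mount point /sys/fs/cgroup/memory and the cgroup name "test" are assumptions; adjust them for your setup.

/* Hedged sketch: switch both knobs on, then dump one cgroup's histogram.
 * The /proc and cgroup paths below are assumptions for a common v1 layout.
 */
#include <stdio.h>
#include <stdlib.h>

static void write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		exit(1);
	}
	fputs(val, f);
	fclose(f);
}

int main(void)
{
	char line[256];
	FILE *f;

	/* Samples are only recorded while both toggles are enabled. */
	write_str("/proc/sys/vm/memory_qos", "1");
	write_str("/proc/sys/vm/memcg_latency_histogram", "1");

	f = fopen("/sys/fs/cgroup/memory/test/memory.latency_histogram", "r");
	if (!f) {
		perror("memory.latency_histogram");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}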
include/linux/memcontrol.h
@@ -22,6 +22,8 @@
 #include <linux/writeback.h>
 #include <linux/page-flags.h>
 
+#define MEM_LATENCY_MAX_SLOTS 64
+
 struct mem_cgroup;
 struct obj_cgroup;
 struct page;
@@ -341,6 +343,8 @@ struct mem_cgroup {
 
 	CACHELINE_PADDING(_pad2_);
 
+	u64 __percpu *latency_histogram[MEM_LATENCY_MAX_SLOTS];
+
 	int reclaim_failed;
 	struct list_head prio_list;
 	struct list_head prio_list_async;
kernel/sysctl.c
@@ -2615,6 +2615,7 @@ static struct ctl_table kern_table[] = {
 	{ }
 };
 
+unsigned int vm_memcg_latency_histogram;
 unsigned long vm_pagecache_system_usage;
 
 static struct ctl_table vm_table[] = {
@@ -2925,6 +2926,15 @@ static struct ctl_table vm_table[] = {
 		.mode = 0444,
 		.proc_handler = proc_pagecache_system_usage,
 	},
+	{
+		.procname = "memcg_latency_histogram",
+		.data = &vm_memcg_latency_histogram,
+		.maxlen = sizeof(vm_memcg_latency_histogram),
+		.mode = 0644,
+		.proc_handler = &proc_dointvec_minmax,
+		.extra1 = SYSCTL_ZERO,
+		.extra2 = SYSCTL_ONE,
+	},
 	{
 		.procname = "memory_qos",
 		.data = &sysctl_vm_memory_qos,
mm/memcontrol.c
@@ -6322,11 +6322,52 @@ static ssize_t memory_async_distance_factor_write(struct kernfs_open_file *of,
 	return nbytes;
 }
 
+extern unsigned int vm_memcg_latency_histogram;
+
+static int mem_cgroup_lat_seq_show(struct seq_file *m, void *v)
+{
+	u64 sum_lat;
+	int i, cpu;
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+	if (!sysctl_vm_memory_qos) {
+		seq_puts(m, "vm.memory_qos is not enabled.\n");
+		return 0;
+	}
+
+	if (!vm_memcg_latency_histogram) {
+		seq_puts(m, "vm.memcg_latency_histogram is not enabled.\n");
+		return 0;
+	}
+
+	for (i = 0; i < MEM_LATENCY_MAX_SLOTS; i++) {
+		sum_lat = 0;
+
+		for_each_possible_cpu(cpu) {
+			sum_lat += *per_cpu_ptr(memcg->latency_histogram[i], cpu);
+			*per_cpu_ptr(memcg->latency_histogram[i], cpu) = 0;
+		}
+		if (i == 0)
+			seq_printf(m, "[%-20llu, %-20llu]ns : %llu.\n",
+				   (u64)0, (u64)1, sum_lat);
+		else
+			seq_printf(m, "[%-20llu, %-20llu]ns : %llu.\n",
+				   (u64)1 << (i - 1),
+				   (u64)1 << i, sum_lat);
+	}
+
+	return 0;
+}
+
 static int memory_oom_group_show(struct seq_file *m, void *v);
 static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
 				      char *buf, size_t nbytes, loff_t off);
 
 static struct cftype mem_cgroup_legacy_files[] = {
+	{
+		.name = "latency_histogram",
+		.seq_show = mem_cgroup_lat_seq_show,
+	},
 	{
 		.name = "usage_in_bytes",
 		.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
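Two properties of the show path above are easy to miss: reading the file zeroes every per-cpu counter, so consecutive reads report deltas rather than cumulative totals, and slot i covers the power-of-two latency band [2^(i-1), 2^i] ns (slot 0 is [0, 1] ns). A standalone sketch of the bucket bounds the seq_show handler prints:

/* Sketch: reproduce the bucket bounds printed by mem_cgroup_lat_seq_show().
 * 64 slots are enough to cover the full u64 nanosecond range.
 */
#include <stdio.h>
#include <stdint.h>

#define MEM_LATENCY_MAX_SLOTS 64

int main(void)
{
	for (int i = 0; i < MEM_LATENCY_MAX_SLOTS; i++) {
		uint64_t lo = i ? (uint64_t)1 << (i - 1) : 0;
		uint64_t hi = i ? (uint64_t)1 << i : 1;

		printf("[%-20llu, %-20llu]ns\n",
		       (unsigned long long)lo, (unsigned long long)hi);
	}
	return 0;
}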
@@ -6756,6 +6797,10 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
 	int node;
+	int i;
+
+	for (i = 0; i < MEM_LATENCY_MAX_SLOTS; i++)
+		free_percpu(memcg->latency_histogram[i]);
 
 	for_each_node(node)
 		free_mem_cgroup_per_node_info(memcg, node);
@@ -6853,6 +6898,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
 	struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
 	struct mem_cgroup *memcg, *old_memcg;
 	long error = -ENOMEM;
+	int index;
+
 	old_memcg = set_active_memcg(parent);
 	memcg = mem_cgroup_alloc(parent);
@@ -6862,6 +6909,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 
 	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
 	WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
+	for (index = 0; index < MEM_LATENCY_MAX_SLOTS; index++) {
+		memcg->latency_histogram[index] = alloc_percpu(u64);
+		if (!memcg->latency_histogram[index])
+			goto fail;
+	}
 	memcg->pagecache_reclaim_ratio = DEFAULT_PAGE_RECLAIM_RATIO;
 	memcg->pagecache_max_ratio = PAGECACHE_MAX_RATIO_MAX;
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
@@ -6924,6 +6976,10 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 	INIT_LIST_HEAD(&memcg->prio_list_async);
 
 	return &memcg->css;
+fail:
+	mem_cgroup_id_remove(memcg);
+	mem_cgroup_free(memcg);
+	return ERR_PTR(error);
 }
 
 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
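The css_alloc/css_free pair above follows an all-or-nothing pattern: allocate one per-cpu counter per slot, and tear everything down if any allocation fails. A plain-C model of that pattern (calloc stands in for alloc_percpu; this is an illustration, not the kernel code):

/* Sketch: per-slot allocation with unwinding on failure, modeled after
 * the mem_cgroup_css_alloc()/__mem_cgroup_free() pair.
 */
#include <stdint.h>
#include <stdlib.h>

#define MEM_LATENCY_MAX_SLOTS 64

static uint64_t *histogram[MEM_LATENCY_MAX_SLOTS];

static int histogram_alloc(int ncpus)
{
	int i;

	for (i = 0; i < MEM_LATENCY_MAX_SLOTS; i++) {
		histogram[i] = calloc(ncpus, sizeof(uint64_t));
		if (!histogram[i])
			goto fail;
	}
	return 0;
fail:
	while (--i >= 0) {	/* free only what was actually allocated */
		free(histogram[i]);
		histogram[i] = NULL;
	}
	return -1;
}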
mm/page_alloc.c
@@ -56,6 +56,8 @@
 #ifdef CONFIG_CGROUP_SLI
 #include <linux/sli.h>
 #endif
+#include <linux/log2.h>
+#include <linux/sched/clock.h>
 
 #include "internal.h"
 #include "shuffle.h"
@@ -4429,6 +4431,8 @@ failed:
 }
 EXPORT_SYMBOL_GPL(__alloc_pages_bulk);
 
+extern unsigned int vm_memcg_latency_histogram;
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -4439,6 +4443,12 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 	unsigned int alloc_flags = ALLOC_WMARK_LOW;
 	gfp_t alloc_gfp; /* The gfp_t that was actually used for allocation */
 	struct alloc_context ac = { };
+#ifdef CONFIG_MEMCG
+	struct mem_cgroup *memcg;
+	u64 start_ns;
+	u64 delta;
+	int delta_log;
+#endif
 
 	/*
 	 * There are several places where we assume that the order value is sane
@@ -4461,6 +4471,16 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
 					&alloc_gfp, &alloc_flags))
 		return NULL;
 
+#ifdef CONFIG_MEMCG
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(current);
+	if (sysctl_vm_memory_qos && vm_memcg_latency_histogram && memcg)
+		start_ns = local_clock();
+	if (memcg)
+		css_get(&memcg->css);
+	rcu_read_unlock();
+#endif
+
 	/*
 	 * Forbid the first pass from falling back to types that fragment
 	 * memory until all local zones are considered.
@@ -4490,6 +4510,17 @@ out:
 		page = NULL;
 	}
 
+#ifdef CONFIG_MEMCG
+	if (sysctl_vm_memory_qos && vm_memcg_latency_histogram && memcg) {
+		delta = local_clock() - start_ns;
+		delta_log = __ilog2_u64(delta);
+		if (delta_log < 0)
+			delta_log = 0;
+		this_cpu_add(*memcg->latency_histogram[delta_log], 1);
+	}
+	mem_cgroup_put(memcg);
+#endif
+
 	trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype);
 	kmsan_alloc_page(page, order, alloc_gfp);
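On the record side, the slot index is simply floor(log2(delta)), clamped to 0 for deltas below 2 ns, so each per-cpu increment costs one clock read and a bit scan. A userspace model of that mapping, with __builtin_clzll standing in for the kernel's __ilog2_u64() (an assumption, not the kernel helper itself):

/* Sketch: map an allocation latency in nanoseconds to a histogram slot,
 * mirroring the delta_log computation and clamp in the patch above.
 */
#include <stdio.h>
#include <stdint.h>

static int latency_slot(uint64_t delta_ns)
{
	if (delta_ns == 0)	/* log2(0) is undefined; clamp like the patch */
		return 0;
	return 63 - __builtin_clzll(delta_ns);	/* floor(log2(delta)) */
}

int main(void)
{
	uint64_t samples[] = { 0, 1, 800, 4096, 1000000 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%llu ns -> slot %d\n",
		       (unsigned long long)samples[i],
		       latency_slot(samples[i]));
	return 0;
}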