rue/mm: introduce a feature to asynchronously clean dying memcgs

When a memcg is removed, page cache and slab pages may still hold
references to it, which can leave a very large number of dying
memcgs in the system. This feature adds a kernel thread that cleans
up dying memcgs asynchronously: once enough newly offlined memcgs
have accumulated, the thread walks the memcg tree and reclaims page
cache, slab objects and per-CPU stock charges from offline memcgs
so that they can finally be freed.

1) sysctl -w vm.clean_dying_memcg_async=1
   #Start a kthread that asynchronously cleans dying memcgs.
   #The default value is 0 (disabled).

2) sysctl -w vm.clean_dying_memcg_threshold=10
   #Wake up the kthread whenever 10 new dying memcgs have been
   #generated in the system. The default value is 100.
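
Both knobs depend on the RUE memory QoS switch being enabled; their
handlers reject writes with -EINVAL while sysctl_vm_memory_qos is 0.
Assuming that switch is exposed as vm.memory_qos, a typical setup
looks like this:

   sysctl -w vm.memory_qos=1                    # prerequisite: enable memory QoS
   sysctl -w vm.clean_dying_memcg_threshold=10  # wake the cleaner after 10 new dying memcgs
   sysctl -w vm.clean_dying_memcg_async=1       # start the cleaner kthread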

Signed-off-by: Bin Lai <robinlai@tencent.com>
Signed-off-by: Jingxiang Zeng <linuszeng@tencent.com>
Signed-off-by: Honglin Li <honglinli@tencent.com>

include/linux/memcontrol.h:

@@ -31,6 +31,12 @@ struct mm_struct;
struct kmem_cache;
struct oom_control;

extern int kclean_dying_memcg_run(void);
extern unsigned int sysctl_clean_dying_memcg_threshold;
extern void kclean_dying_memcg_stop(void);
extern void wakeup_kclean_dying_memcg(void);
extern atomic_long_t dying_memcgs_count;

/* Cgroup-specific page state, on top of universal node page state */
enum memcg_stat_item {
        MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS,
@@ -205,6 +211,8 @@ struct memcg_cgwb_frn {
        struct wb_completion done;      /* tracks in-flight foreign writebacks */
};

void drain_all_stock(struct mem_cgroup *root_memcg);

/*
 * Bucket for arbitrarily byte-sized objects charged to a memory
 * cgroup. The bucket can be reparented in one piece when the cgroup
@@ -241,6 +249,7 @@ struct mem_cgroup {
                struct page_counter memsw;      /* v1 only */
        };

        unsigned long offline_times;
        struct page_counter pagecache;
        u64 pagecache_reclaim_ratio;
        u32 pagecache_max_ratio;

kernel/sysctl.c:

@@ -68,6 +68,7 @@
#include <linux/uaccess.h>
#include <asm/processor.h>
#include <linux/memcontrol.h>
#include <linux/rue.h>
#ifdef CONFIG_X86
@@ -99,6 +100,7 @@ extern int sysctl_qos_mbuf_enable;
extern int sysctl_vm_memory_qos;
extern int sysctl_vm_qos_highest_reclaim_prio;
extern unsigned int sysctl_vm_qos_prio_reclaim_ratio;
extern unsigned int sysctl_clean_dying_memcg_async;
extern void memory_qos_update(void);
extern int memory_qos_prio_reclaim_ratio_update(void);
static int vm_lowest_prio = CGROUP_PRIORITY_MAX;
@@ -2028,6 +2030,50 @@ int memory_qos_sysctl_prio_reclaim_ratio_handler(struct ctl_table *table,
        return 0;
}

static int clean_dying_memcg_async_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp, loff_t *ppos)
{
        int ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos);

        if (write && (sysctl_vm_memory_qos == 0 ||
                      sysctl_clean_dying_memcg_threshold == 0))
                return -EINVAL;

        if (write && !ret) {
                if (sysctl_clean_dying_memcg_async > 0) {
                        if (kclean_dying_memcg_run()) {
                                sysctl_clean_dying_memcg_async = 0;
                                return -EINVAL;
                        }
                } else {
                        kclean_dying_memcg_stop();
                }
        }

        return ret;
}

static int clean_dying_memcg_threshold_handler(struct ctl_table *table,
                int write, void __user *buffer, size_t *lenp, loff_t *ppos)
{
        unsigned int old_val = sysctl_clean_dying_memcg_threshold;
        int ret = proc_douintvec_minmax(table, write, buffer, lenp, ppos);

        if (write && (sysctl_vm_memory_qos == 0))
                return -EINVAL;

        if (write && !ret) {
                if (old_val != sysctl_clean_dying_memcg_threshold) {
                        if (atomic_long_read(&dying_memcgs_count) >=
                            sysctl_clean_dying_memcg_threshold) {
                                atomic_long_set(&dying_memcgs_count, 0);
                                wakeup_kclean_dying_memcg();
                        }
                }
        }

        return ret;
}
#endif
#ifdef CONFIG_RPS
@@ -2972,6 +3018,22 @@ static struct ctl_table vm_table[] = {
                .extra1 = SYSCTL_ONE,
                .extra2 = &twenty,
        },
        {
                .procname = "clean_dying_memcg_async",
                .data = &sysctl_clean_dying_memcg_async,
                .maxlen = sizeof(sysctl_clean_dying_memcg_async),
                .mode = 0644,
                .proc_handler = clean_dying_memcg_async_handler,
                .extra1 = SYSCTL_ZERO,
                .extra2 = SYSCTL_ONE,
        },
        {
                .procname = "clean_dying_memcg_threshold",
                .data = &sysctl_clean_dying_memcg_threshold,
                .maxlen = sizeof(sysctl_clean_dying_memcg_threshold),
                .mode = 0644,
                .proc_handler = clean_dying_memcg_threshold_handler,
        },
#endif
        { }
};

mm/memcontrol.c:

@@ -121,6 +121,11 @@ int sysctl_vm_memory_qos;
/* default has none reclaim priority */
int sysctl_vm_qos_highest_reclaim_prio = CGROUP_PRIORITY_MAX;

unsigned int sysctl_clean_dying_memcg_async;
unsigned int sysctl_clean_dying_memcg_threshold = 100;
static struct task_struct *kclean_dying_memcg;
DECLARE_WAIT_QUEUE_HEAD(kclean_dying_memcg_wq);

static unsigned long rmem_wmark_limit;
static unsigned long rmem_wmark_setpoint;
static unsigned long rmem_wmark_freerun;
@@ -2607,7 +2612,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
 * Drains all per-CPU charge caches for given root_memcg resp. subtree
 * of the hierarchy under it.
 */
-static void drain_all_stock(struct mem_cgroup *root_memcg)
+void drain_all_stock(struct mem_cgroup *root_memcg)
{
        int cpu, curcpu;
@@ -7107,11 +7112,44 @@ remove_id:
        return -ENOMEM;
}

atomic_long_t dying_memcgs_count;

void wakeup_kclean_dying_memcg(void)
{
        /* only the cleaner kthread ever sleeps on this queue */
        if (!waitqueue_active(&kclean_dying_memcg_wq))
                return;

        wake_up_interruptible(&kclean_dying_memcg_wq);
}

/* Account a newly offlined memcg and kick the async cleaner at the threshold. */
void charge_dying_memcgs(struct mem_cgroup *memcg)
{
        if (sysctl_vm_memory_qos == 0)
                return;

        if (sysctl_clean_dying_memcg_async == 0)
                return;

        if (sysctl_clean_dying_memcg_threshold == 0)
                return;

        if (atomic_long_read(&dying_memcgs_count) >=
            sysctl_clean_dying_memcg_threshold) {
                atomic_long_set(&dying_memcgs_count, 0);
                wakeup_kclean_dying_memcg();
        }

        memcg->offline_times = jiffies;
        atomic_long_add(1, &dying_memcgs_count);
}

static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup_event *event, *tmp;

        charge_dying_memcgs(memcg);

        /* XXX no direct number */
        memcg_notify_prio_change(memcg, memcg_get_prio(memcg), 0);

        /*
@@ -10085,3 +10123,127 @@ static int memcg_prio_reclaimd_run(void)
        return ret;
}

extern unsigned long shrink_slab(gfp_t gfp_mask, int nid,
                                 struct mem_cgroup *memcg,
                                 int priority);

void reap_slab(struct mem_cgroup *memcg)
{
        struct mem_cgroup *parent;

        /*
         * An offline memcg's kmem_caches have been moved to its parent
         * memcg, so we must shrink the parent memcg.
         */
        parent = parent_mem_cgroup(memcg);
        if (parent) {
                int nid;
                unsigned long freed, count;

                for_each_online_node(nid) {
                        freed = count = 0;
                        do {
                                count++;
                                freed = shrink_slab(GFP_KERNEL, nid, parent, 0);
                        } while (freed > 10 && count < 10);
                }
        }
}

static void clean_each_dying_memcg(struct mem_cgroup *memcg)
{
        unsigned long current_pages;
        int drained = 0;
        unsigned int jiff_dirty_exp = HZ * dirty_expire_interval / 100;

        /*
         * Give dirty and writeback pages time to be flushed: skip this
         * memcg until the dirty expire interval has passed since offline.
         */
        if ((memcg_page_state(memcg, NR_WRITEBACK) +
             memcg_page_state(memcg, NR_FILE_DIRTY)) &&
            time_after(memcg->offline_times + jiff_dirty_exp, jiffies)) {
                return;
        }

        current_pages = page_counter_read(&memcg->memory);
        while (current_pages) {
                unsigned int ret;

                ret = try_to_free_mem_cgroup_pages(memcg, current_pages,
                                                   GFP_KERNEL, true);
                if (ret)
                        goto next;

                reap_slab(memcg);

                if (!drained) {
                        drain_all_stock(memcg);
                        drained = 1;
                } else {
                        break;
                }
next:
                current_pages = page_counter_read(&memcg->memory);
        }
}

static void clean_all_dying_memcgs(void)
{
        struct mem_cgroup *memcg;

        for_each_mem_cgroup_tree(memcg, NULL) {
                if (!mem_cgroup_online(memcg))
                        clean_each_dying_memcg(memcg);
                cond_resched();
        }
}

static int kclean_dying_memcgs(void *data)
{
        DEFINE_WAIT(wait);

        if (waitqueue_active(&kclean_dying_memcg_wq))
                wake_up_interruptible(&kclean_dying_memcg_wq);

        for ( ; ; ) {
                clean_all_dying_memcgs();

                prepare_to_wait(&kclean_dying_memcg_wq,
                                &wait, TASK_INTERRUPTIBLE);
                if (!kthread_should_stop()) {
                        schedule();
                } else {
                        finish_wait(&kclean_dying_memcg_wq, &wait);
                        break;
                }
                finish_wait(&kclean_dying_memcg_wq, &wait);
        }

        return 0;
}

int kclean_dying_memcg_run(void)
{
        int ret = 0;

        if (kclean_dying_memcg)
                return 0;

        kclean_dying_memcg = kthread_run(kclean_dying_memcgs,
                                         NULL, "kclean_dying_memcgs");
        if (IS_ERR(kclean_dying_memcg)) {
                pr_err("Failed to start kclean_dying_memcgs kthread.\n");
                ret = PTR_ERR(kclean_dying_memcg);
                kclean_dying_memcg = NULL;
        }

        return ret;
}

void kclean_dying_memcg_stop(void)
{
        if (kclean_dying_memcg) {
                kthread_stop(kclean_dying_memcg);
                kclean_dying_memcg = NULL;
        }
}

mm/vmscan.c:

@@ -1086,7 +1086,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 *
 * Returns the number of reclaimed slab objects.
 */
-static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
+unsigned long shrink_slab(gfp_t gfp_mask, int nid,
                                 struct mem_cgroup *memcg,
                                 int priority)
{