rue/mm: add memory cgroup async page reclaim mechanism

Introduce background page reclaim mechanism for memcg, it can be configured according to the cgroup priorities for different reclaim strategies. Signed-off-by: Yulei Zhang <yuleixzhang@tencent.com> Signed-off-by: Mengmeng Chen <bauerchen@tencent.com> Signed-off-by: Chunguang Xu <brookxu@tencent.com> Signed-off-by: Honglin Li <honglinli@tencent.com>
2023-08-31 15:38:34 +08:00 · 2023-08-31 15:38:34 +08:00 · 56d80c4ea2
parent 0d35c4c639
commit 56d80c4ea2
7 changed files with 494 additions and 7 deletions
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@ -109,6 +109,13 @@ Brief summary of control files.
 memory.kmem.tcp.failcnt             show the number of tcp buf memory usage
 				     hits limits
 memory.kmem.tcp.max_usage_in_bytes  show max tcp buf memory usage recorded
+ memory.async_ratio                  ratio setting for async reclaim wmark
+ memory.async_high                   high limit for start async reclaim
+ memory.async_low                    low limit to stop async reclaim
+ memory.async_distance_factor        the distance between async_high and async_low, valid
+                                     value is from 1 to 150000, the unit is in fractions
+                                     of 1000000
+
 ==================================== ==========================================

 1. History
@ -971,7 +978,19 @@ Test:
   (Expect a bunch of notifications, and eventually, the oom-killer will
   trigger.)

-12. TODO
+12. Async reclaim
+=================
+
+Add memory usage water mark for async reclaim, memory.async_ratio. Valid
+value is from 0 to 100, which represents percentage of memory.limit, the
+actual value of percentage * memory.limit will be assigned to memory.async_high.
+When charging pages to a memory cgroup, a work item is scheduled to reclaim
+pages when the memory usage exceeds memory.async_high, and the work
+stops when the reclaim reaches memory.async_low. memory.async_ratio = 0
+means async reclaim is disabled, both async_low and async_high
+would be max, which is the default value.
+
+13. TODO
 ========

 1. Make per-cgroup scanner reclaim not-shared pages first
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@ -1275,6 +1275,70 @@ PAGE_SIZE multiple when read back.
 	The max memory usage recorded for the cgroup and its
 	descendants since the creation of the cgroup.

+  memory.async_ratio
+	A read-write single value file which exists on non-root
+	cgroups.  The default is 0.
+
+	Memory usage water mark for async reclaim. Valid value is from 0
+	to 100, which represents percentage of memory.high, the actual value
+	of percentage * memory.high will be assigned to async_high. When
+	charging pages to a memory cgroup, a work item is scheduled to reclaim
+	pages when the memory usage exceeds async_high.
+
+	0 means async reclaim is disabled, both async_low and async_high
+	would be max, which is the default value, and the mechanism does
+	not take effect.
+
+  memory.async_high
+	A read-only single value file which exists on non-root cgroups.
+	The default is max.
+
+	Memory usage high water mark for async reclaim. A cgroup whose
+	memory usage exceeds this limit will trigger the async reclaim
+	logic.
+
+  memory.async_low
+	A read-only single value file which exists on non-root cgroups.
+	The default is max.
+
+	Memory usage low water mark for async reclaim, if a reclaim work
+	is scheduled, it will try to reclaim (async_high - async_low)
+	pages.
+
+  memory.async_distance_factor
+	A read-write single value file which exists on non-root cgroups.
+	The default is 1.
+
+	Define the distance between async_high and async_low.  Valid value is
+	from 1 to 150000, the unit is in fractions of 1000000. The default value of
+	1 means the distance between async_high and async_low is 0.0001% of memory.high
+	of the cgroup.  The maximum value is 150000, which is 15% of memory.high.
+
+  memory.async_ratio_delta
+	A read-write single value file which exists on non-root
+	cgroups.  The default is -1.
+
+	Memory usage watermark calculation factor for async reclaim. Valid
+	value is from 1 to 10, which will be used to calculate the memory.async_ratio
+	based on cgroup priority, async_ratio = 100 - (MAX_PRIORITY - cgroup_prio) *
+	async_ratio_delta, and the result will be used to calculate the
+	async reclaim wmark.
+
+	Default value -1 is a show stop value which means the async reclaim wmark
+	calculation will not rely on the cgroup priority, it only depends on the
+	async_ratio value we manually set.
+
+  memory.async_distance_delta
+	A read-write single value file which exists on non-root cgroups.
+	The default is 1.
+
+	Per cgroup priority factor to define the distance between async_high
+	and async_low.  Valid value is from 1 to 50. The formula is
+	async_distance_factor = async_distance_delta * (cgroup_priority_value + 1).
+	Async reclaim mechanism will use the result for reclaim distance
+	calculation. And when async_ratio_delta == -1, this value won't
+	take effect either when cgroup priority changes.
+
  memory.oom.group
 	A read-write single value file which exists on non-root
 	cgroups.  The default value is "0".
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@ -338,6 +338,12 @@ struct mem_cgroup {
 	int reclaim_failed;
 	struct list_head	prio_list;
 	struct list_head	prio_list_async;
+	/* per cgroup memory async reclaim */
+	unsigned int		async_wmark;
+	unsigned int		async_distance_factor;
+	int			async_wmark_delta;
+	unsigned int		async_distance_delta;
+	struct work_struct	async_work;

 	/*
 	 * set > 0 if pages under this cgroup are moving to other cgroup.
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@ -25,6 +25,10 @@ struct page_counter {
 	atomic_long_t low_usage;
 	atomic_long_t children_low_usage;

+	/* async reclaim threshold */
+	unsigned long async_low;
+	unsigned long async_high;
+
 	unsigned long watermark;
 	unsigned long failcnt;

@ -76,6 +80,11 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
 int page_counter_memparse(const char *buf, const char *max,
 			  unsigned long *nr_pages);

+void page_counter_set_async_high(struct page_counter *counter,
+				 unsigned long nr_pages);
+void page_counter_set_async_low(struct page_counter *counter,
+				unsigned long nr_pages);
+
 static inline void page_counter_reset_watermark(struct page_counter *counter)
 {
 	counter->watermark = page_counter_read(counter);
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@ -125,6 +125,11 @@ static atomic_long_t memcg_reclaimed_count;
 static unsigned long memcg_reclaim_goal;
 static int memcg_cur_reclaim_prio = CGROUP_PRIORITY_MAX;
 static DEFINE_SPINLOCK(memcg_reclaim_prio_lock);
+/* Workqueue on which the per-memcg async reclaim work items are queued. */
+struct workqueue_struct *memcg_async_reclaim_wq;
+#define ASYNC_DISTANCE_DIV	1000000	/* denominator applied to async_distance_factor */
+#define ASYNC_RATIO_DIV		100	/* denominator applied to async_wmark (a percentage) */
+#define ASYNC_DISTANCE_DEF	1	/* fallback distance factor/delta for a new memcg */

 /* Whether legacy memory+swap accounting is active */
 static bool do_memsw_account(void)
@ -1875,6 +1880,10 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
 		       memcg_events(memcg, PGSTEAL_KSWAPD) +
 		       memcg_events(memcg, PGSTEAL_DIRECT) +
 		       memcg_events(memcg, PGSTEAL_KHUGEPAGED));
+	seq_buf_printf(s, "pgscan_in_background %lu\n",
+		       memcg_events(memcg, PGSCAN_KSWAPD));
+	seq_buf_printf(s, "pgsteal_in_background %lu\n",
+		       memcg_events(memcg, PGSTEAL_KSWAPD));

 	for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
 		if (memcg_vm_event_stat[i] == PGPGIN ||
@ -2652,6 +2661,32 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 	return 0;
 }

+/*
+ * Report whether the memory usage of @memcg is above its async reclaim
+ * start threshold (memory.async_high). Always false while
+ * sysctl_vm_memory_qos is off.
+ */
+static bool need_memcg_async_reclaim(struct mem_cgroup *memcg)
+{
+	unsigned long usage;
+
+	if (!sysctl_vm_memory_qos)
+		return false;
+
+	usage = page_counter_read(&memcg->memory);
+
+	return usage > memcg->memory.async_high;
+}
+
+/*
+ * Work handler behind memcg->async_work.
+ *
+ * Reclaims from the owning memcg until usage is back at async_low, but never
+ * asks for more than (async_high - async_low) pages in one invocation.
+ *
+ * @work: the async_work member embedded in a struct mem_cgroup.
+ */
+static void async_reclaim_func(struct work_struct *work)
+{
+	struct mem_cgroup *memcg;
+	unsigned long usage, low, high, span, nr_pages;
+
+	memcg = container_of(work, struct mem_cgroup, async_work);
+	usage = page_counter_read(&memcg->memory);
+	low = READ_ONCE(memcg->memory.async_low);
+	high = READ_ONCE(memcg->memory.async_high);
+
+	/*
+	 * All quantities are unsigned, so compare before subtracting: a
+	 * "usage - low" that wraps around would look like a huge positive
+	 * request instead of "nothing to do".
+	 */
+	if (usage <= low)
+		return;
+
+	/*
+	 * async_high and async_low are updated one after the other, so a
+	 * concurrent update may briefly expose high < low; clamp instead of
+	 * letting the subtraction wrap.
+	 */
+	span = high > low ? high - low : 0;
+
+	nr_pages = min(usage - low, span);
+	memcg_memory_event(memcg, MEMCG_HIGH);
+	try_to_free_mem_cgroup_pages(memcg, nr_pages, GFP_KERNEL, true);
+}
+
 static unsigned long reclaim_high(struct mem_cgroup *memcg,
 				  unsigned int nr_pages,
 				  gfp_t gfp_mask)
@ -2915,6 +2950,47 @@ out:
 	css_put(&memcg->css);
 }

+/*
+ * Derive the async reclaim thresholds of @memcg from its ratio
+ * (async_wmark) and distance factor, then store them in the memory counter.
+ *
+ * The reference limit is memory.high on the default hierarchy and
+ * memory.max otherwise. A zero ratio sets both thresholds to
+ * PAGE_COUNTER_MAX.
+ */
+static void setup_async_wmark(struct mem_cgroup *memcg)
+{
+	unsigned long start = PAGE_COUNTER_MAX;
+	unsigned long stop = PAGE_COUNTER_MAX;
+	unsigned long limit, gap;
+
+	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+		limit = memcg->memory.high;
+	else
+		limit = memcg->memory.max;
+
+	if (memcg->async_wmark) {
+		start = (memcg->async_wmark * limit) / ASYNC_RATIO_DIV;
+		gap = mult_frac(limit,
+			memcg->async_distance_factor, ASYNC_DISTANCE_DIV);
+		/* the gap would reach below zero: fall back to memory.low */
+		stop = (gap >= start) ? memcg->memory.low : start - gap;
+	}
+
+	page_counter_set_async_high(&memcg->memory, start);
+	page_counter_set_async_low(&memcg->memory, stop);
+}
+
+/*
+ * Recompute the async reclaim ratio and distance factor of @memcg from the
+ * cgroup priority @new_prio, refresh the thresholds and queue the reclaim
+ * work if usage is already above async_high.
+ *
+ * A negative async_wmark_delta leaves the current settings untouched.
+ */
+static void async_reclaim_reset_factor(struct mem_cgroup *memcg,
+					unsigned int new_prio)
+{
+	unsigned int ratio, factor;
+
+	if (memcg->async_wmark_delta < 0)
+		return;
+
+	ratio = ASYNC_RATIO_DIV -
+		(CGROUP_PRIORITY_MAX - new_prio) * memcg->async_wmark_delta;
+	xchg(&memcg->async_wmark, ratio);
+
+	factor = memcg->async_distance_delta * (new_prio + 1);
+	xchg(&memcg->async_distance_factor, factor);
+
+	setup_async_wmark(memcg);
+
+	if (!need_memcg_async_reclaim(memcg))
+		return;
+
+	queue_work(memcg_async_reclaim_wq, &memcg->async_work);
+}
+
 static struct task_struct *memcg_priod;
 static struct task_struct *memcg_priod_async;
 static DECLARE_WAIT_QUEUE_HEAD(memcg_prio_reclaim_wq);
@ -3052,6 +3128,7 @@ int mem_cgroup_notify_prio_change(struct cgroup_subsys_state *css,
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

+	async_reclaim_reset_factor(memcg, new_prio);
 	return memcg_notify_prio_change(memcg, old_prio, new_prio);
 }

@ -3239,6 +3316,12 @@ done_restock:
 	do {
 		bool mem_high, swap_high;

+		if (need_memcg_async_reclaim(memcg)) {
+			/* Kick off per memory cgroup async reclaim */
+			queue_work(memcg_async_reclaim_wq, &memcg->async_work);
+			break;
+		}
+
 		mem_high = page_counter_read(&memcg->memory) >
 			READ_ONCE(memcg->memory.high);
 		swap_high = page_counter_read(&memcg->swap) >
@ -3972,8 +4055,14 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
 		}
 	} while (true);

-	if (!ret && enlarge)
-		memcg_oom_recover(memcg);
+	if (!ret) {
+		setup_async_wmark(memcg);
+		if (need_memcg_async_reclaim(memcg))
+			queue_work(memcg_async_reclaim_wq, &memcg->async_work);
+
+		if (enlarge)
+			memcg_oom_recover(memcg);
+	}

 	return ret;
 }
@ -4175,6 +4264,8 @@ enum {
 	RES_MAX_USAGE,
 	RES_FAILCNT,
 	RES_SOFT_LIMIT,
+	ASYNC_HIGH_LIMIT,
+	ASYNC_LOW_LIMIT,
 };

 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
@ -4215,6 +4306,10 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
 		return counter->failcnt;
 	case RES_SOFT_LIMIT:
 		return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
+	case ASYNC_HIGH_LIMIT:
+		return (u64)counter->async_high * PAGE_SIZE;
+	case ASYNC_LOW_LIMIT:
+		return (u64)counter->async_low * PAGE_SIZE;
 	default:
 		BUG();
 	}
@ -4656,6 +4751,11 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
 		seq_buf_printf(s, "file_cost %lu\n", file_cost);
 	}
 #endif
+
+	seq_buf_printf(s, "pgscan_in_background %lu\n",
+		       memcg_events(memcg, PGSCAN_KSWAPD));
+	seq_buf_printf(s, "pgsteal_in_background %lu\n",
+		       memcg_events(memcg, PGSTEAL_KSWAPD));
 }

 #ifdef CONFIG_TEXT_UNEVICTABLE
@ -5898,6 +5998,74 @@ static int mem_cgroup_unevictable_percent_write(struct cgroup_subsys_state *css,
 }
 #endif

+/*
+ * seq_file show handler for memory.async_ratio: prints the async_wmark
+ * ratio of the memcg behind @m. Always returns 0.
+ */
+static int memory_async_reclaim_wmark_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+	/* async_wmark is an unsigned int, so print it with %u */
+	seq_printf(m, "%u\n", READ_ONCE(memcg->async_wmark));
+
+	return 0;
+}
+
+/*
+ * Write handler for memory.async_ratio.
+ *
+ * Parses an integer in the range 0..100, stores it as the async_wmark ratio
+ * of the memcg, recomputes the async thresholds and queues the reclaim work
+ * if usage is already above the new async_high.
+ *
+ * Returns @nbytes on success, the kstrtoint() error for unparsable input,
+ * or -EINVAL for an out-of-range value.
+ */
+static ssize_t memory_async_reclaim_wmark_write(struct kernfs_open_file *of,
+				      char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int ret, wmark;
+
+	buf = strstrip(buf);
+	if (!buf)
+		return -EINVAL;
+
+	ret = kstrtoint(buf, 0, &wmark);
+	if (ret)
+		return ret;
+
+	/*
+	 * async_wmark is unsigned: a negative input would otherwise be
+	 * stored as a huge ratio, so reject it along with values above 100.
+	 */
+	if (wmark < 0 || wmark > 100)
+		return -EINVAL;
+
+	xchg(&memcg->async_wmark, wmark);
+
+	setup_async_wmark(memcg);
+	if (need_memcg_async_reclaim(memcg))
+		queue_work(memcg_async_reclaim_wq, &memcg->async_work);
+
+	return nbytes;
+}
+
+/*
+ * seq_file show handler for memory.async_distance_factor: prints the
+ * async_distance_factor of the memcg behind @m. Always returns 0.
+ */
+static int memory_async_distance_factor_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+	/* async_distance_factor is an unsigned int, so print it with %u */
+	seq_printf(m, "%u\n", READ_ONCE(memcg->async_distance_factor));
+
+	return 0;
+}
+
+/*
+ * Write handler for memory.async_distance_factor.
+ *
+ * Accepts an integer from 1 to 150000, stores it as the distance factor of
+ * the memcg and recomputes the async thresholds.
+ *
+ * Returns @nbytes on success, the kstrtoint() error for unparsable input,
+ * or -EINVAL for an out-of-range value.
+ */
+static ssize_t memory_async_distance_factor_write(struct kernfs_open_file *of,
+				      char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int err, val;
+
+	buf = strstrip(buf);
+	if (!buf)
+		return -EINVAL;
+
+	err = kstrtoint(buf, 0, &val);
+	if (err)
+		return err;
+
+	if (val < 1 || val > 150000)
+		return -EINVAL;
+
+	xchg(&memcg->async_distance_factor, val);
+	setup_async_wmark(memcg);
+
+	return nbytes;
+}
+
 static int memory_oom_group_show(struct seq_file *m, void *v);
 static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
 				      char *buf, size_t nbytes, loff_t off);
@ -5972,6 +6140,30 @@ static struct cftype mem_cgroup_legacy_files[] = {
 		.seq_show = memory_oom_group_show,
 		.write = memory_oom_group_write,
 	},
+	{
+		.name = "async_ratio",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_async_reclaim_wmark_show,
+		.write = memory_async_reclaim_wmark_write,
+	},
+	{
+		.name = "async_high",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = MEMFILE_PRIVATE(_MEM, ASYNC_HIGH_LIMIT),
+		.read_u64 = mem_cgroup_read_u64,
+	},
+	{
+		.name = "async_low",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = MEMFILE_PRIVATE(_MEM, ASYNC_LOW_LIMIT),
+		.read_u64 = mem_cgroup_read_u64,
+	},
+	{
+		.name = "async_distance_factor",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_async_distance_factor_show,
+		.write = memory_async_distance_factor_write,
+	},
 	{
 		.name = "cgroup.event_control",		/* XXX: for compat */
 		.write = memcg_write_event_control,
@ -6352,6 +6544,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
 		goto fail;

 	INIT_WORK(&memcg->high_work, high_work_func);
+	INIT_WORK(&memcg->async_work, async_reclaim_func);
 	INIT_LIST_HEAD(&memcg->oom_notify);
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
@ -6413,6 +6606,12 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 #endif
 		WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
 		WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
+		memcg->async_wmark = parent->async_wmark;
+		memcg->async_distance_factor = parent->async_distance_factor ?
+						: ASYNC_DISTANCE_DEF;
+		memcg->async_wmark_delta = parent->async_wmark_delta;
+		memcg->async_distance_delta = parent->async_distance_delta ?
+						: ASYNC_DISTANCE_DEF;
 #ifdef CONFIG_MEMCG_ZRAM
 		memcg->zram_prio = parent->zram_prio;
 #endif
@ -6426,7 +6625,12 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		page_counter_init(&memcg->swap, NULL);
 		page_counter_init(&memcg->kmem, NULL);
 		page_counter_init(&memcg->tcpmem, NULL);
+	}

+	setup_async_wmark(memcg);
+
+	if (!parent) {
+		memcg->async_wmark_delta = -1;
 		root_mem_cgroup = memcg;
 		return &memcg->css;
 	}
@ -6483,6 +6687,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
 	spin_unlock(&memcg_idr_lock);

+	async_reclaim_reset_factor(memcg, memcg_get_prio(memcg));
 	memcg_notify_prio_change(memcg, 0, memcg_get_prio(memcg));

 	return 0;
@ -6514,6 +6719,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)

 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
+	page_counter_set_async_high(&memcg->memory, PAGE_COUNTER_MAX);
+	page_counter_set_async_low(&memcg->memory, PAGE_COUNTER_MAX);

 	memcg_offline_kmem(memcg);
 	reparent_shrinker_deferred(memcg);
@ -6557,6 +6764,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)

 	vmpressure_cleanup(&memcg->vmpressure);
 	cancel_work_sync(&memcg->high_work);
+	cancel_work_sync(&memcg->async_work);
 	mem_cgroup_remove_from_trees(memcg);
 	free_shrinker_info(memcg);
 	mem_cgroup_free(memcg);
@ -6585,6 +6793,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
+	page_counter_set_async_high(&memcg->memory, PAGE_COUNTER_MAX);
+	page_counter_set_async_low(&memcg->memory, PAGE_COUNTER_MAX);
 	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
 	WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
 	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
@ -7587,6 +7797,10 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 			break;
 	}

+	setup_async_wmark(memcg);
+	if (need_memcg_async_reclaim(memcg))
+		queue_work(memcg_async_reclaim_wq, &memcg->async_work);
+
 	memcg_wb_domain_size_changed(memcg);
 	return nbytes;
 }
@ -7640,6 +7854,10 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 			break;
 	}

+	setup_async_wmark(memcg);
+	if (need_memcg_async_reclaim(memcg))
+		queue_work(memcg_async_reclaim_wq, &memcg->async_work);
+
 	memcg_wb_domain_size_changed(memcg);
 	return nbytes;
 }
@ -7817,6 +8035,84 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 	return nbytes;
 }

+/*
+ * seq_file show handler for memory.async_high: hands the async_high value
+ * of the memory counter to seq_puts_memcg_tunable() and returns its result.
+ */
+static int memory_async_high_wmark_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+	unsigned long async_high = READ_ONCE(memcg->memory.async_high);
+
+	return seq_puts_memcg_tunable(m, async_high);
+}
+
+/*
+ * seq_file show handler for memory.async_low: hands the async_low value
+ * of the memory counter to seq_puts_memcg_tunable() and returns its result.
+ */
+static int memory_async_low_wmark_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+	unsigned long async_low = READ_ONCE(memcg->memory.async_low);
+
+	return seq_puts_memcg_tunable(m, async_low);
+}
+
+/*
+ * seq_file show handler for memory.async_distance_delta: prints the
+ * async_distance_delta of the memcg behind @m. Always returns 0.
+ */
+static int memory_async_distance_delta_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+	/* async_distance_delta is an unsigned int, so print it with %u */
+	seq_printf(m, "%u\n", READ_ONCE(memcg->async_distance_delta));
+
+	return 0;
+}
+
+/*
+ * Write handler for memory.async_distance_delta.
+ *
+ * Accepts an integer from 1 to 50, stores it as the per-priority distance
+ * delta of the memcg and re-derives the async reclaim settings from the
+ * memcg's current priority.
+ *
+ * Returns @nbytes on success, the kstrtoint() error for unparsable input,
+ * or -EINVAL for an out-of-range value.
+ */
+static ssize_t memory_async_distance_delta_write(struct kernfs_open_file *of,
+				      char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int err, val;
+
+	buf = strstrip(buf);
+	if (!buf)
+		return -EINVAL;
+
+	err = kstrtoint(buf, 0, &val);
+	if (err)
+		return err;
+
+	if (val < 1 || val > 50)
+		return -EINVAL;
+
+	xchg(&memcg->async_distance_delta, val);
+	async_reclaim_reset_factor(memcg, memcg_get_prio(memcg));
+
+	return nbytes;
+}
+
+/*
+ * seq_file show handler for memory.async_ratio_delta: prints the signed
+ * async_wmark_delta of the memcg behind @m. Always returns 0.
+ */
+static int memory_async_wmark_delta_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+	int delta = READ_ONCE(memcg->async_wmark_delta);
+
+	seq_printf(m, "%d\n", delta);
+
+	return 0;
+}
+
+/*
+ * Write handler for memory.async_ratio_delta.
+ *
+ * Accepts -1 or an integer from 1 to 10, stores it as async_wmark_delta and
+ * re-derives the async reclaim settings from the memcg's current priority.
+ *
+ * Returns @nbytes on success, the kstrtoint() error for unparsable input,
+ * or -EINVAL for any other value.
+ */
+static ssize_t memory_async_wmark_delta_write(struct kernfs_open_file *of,
+				      char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int err, val;
+
+	buf = strstrip(buf);
+	if (!buf)
+		return -EINVAL;
+
+	err = kstrtoint(buf, 0, &val);
+	if (err)
+		return err;
+
+	/* only -1 or 1..10 are accepted */
+	if (val != -1 && (val < 1 || val > 10))
+		return -EINVAL;
+
+	xchg(&memcg->async_wmark_delta, val);
+	async_reclaim_reset_factor(memcg, memcg_get_prio(memcg));
+
+	return nbytes;
+}
+
 static struct cftype memory_files[] = {
 	{
 		.name = "current",
@ -7890,6 +8186,40 @@ static struct cftype memory_files[] = {
 		.flags = CFTYPE_NS_DELEGATABLE,
 		.write = memory_reclaim,
 	},
+	{
+		.name = "async_ratio",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_async_reclaim_wmark_show,
+		.write = memory_async_reclaim_wmark_write,
+	},
+	{
+		.name = "async_high",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_async_high_wmark_show,
+	},
+	{
+		.name = "async_low",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_async_low_wmark_show,
+	},
+	{
+		.name = "async_distance_factor",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_async_distance_factor_show,
+		.write = memory_async_distance_factor_write,
+	},
+	{
+		.name = "async_ratio_delta",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_async_wmark_delta_show,
+		.write = memory_async_wmark_delta_write,
+	},
+	{
+		.name = "async_distance_delta",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_async_distance_delta_show,
+		.write = memory_async_distance_delta_write,
+	},
 	{ }	/* terminate */
 };

@ -8479,6 +8809,13 @@ static int __init mem_cgroup_init(void)
 {
 	int cpu, node;

+	memcg_async_reclaim_wq = alloc_workqueue("memcg_async_reclaim",
+				WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_FREEZABLE,
+				WQ_UNBOUND_MAX_ACTIVE);
+
+	if (!memcg_async_reclaim_wq)
+		return -ENOMEM;
+
 	/*
 	 * Currently s32 type (can refer to struct batched_lruvec_stat) is
 	 * used for per-memcg-per-cpu caching of per-node statistics. In order
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@ -234,6 +234,34 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
 		propagate_protected_usage(c, atomic_long_read(&c->usage));
 }

+/**
+ * page_counter_set_async_high - set the usage threshold above which memcg
+ * async reclaim is started
+ * @counter: counter to update
+ * @nr_pages: new threshold, in pages
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_async_high(struct page_counter *counter,
+				 unsigned long nr_pages)
+{
+	/* the previous threshold is not needed */
+	(void)xchg(&counter->async_high, nr_pages);
+}
+
+/**
+ * page_counter_set_async_low - set the usage threshold at which memcg
+ * async reclaim stops
+ * @counter: counter to update
+ * @nr_pages: new threshold, in pages
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_async_low(struct page_counter *counter,
+				unsigned long nr_pages)
+{
+	/* the previous threshold is not needed */
+	(void)xchg(&counter->async_low, nr_pages);
+}
+
 /**
 * page_counter_memparse - memparse() for page counter limits
 * @buf: string to parse
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@ -1163,6 +1163,24 @@ static int reclaimer_offset(void)
 	return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
 }

+/*
+ * Offset to add to a *_KSWAPD vm event item so that the event is accounted
+ * to the right reclaimer class. kswapd, and cgroup reclaim running from a
+ * work item, get offset 0 (the KSWAPD items themselves); khugepaged and
+ * everything else get the KHUGEPAGED and DIRECT offsets respectively.
+ */
+static int rue_reclaimer_offset(struct scan_control *sc)
+{
+	/* the three event families must share the same layout */
+	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+			PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
+	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+			PGSCAN_DIRECT - PGSCAN_KSWAPD);
+	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
+			PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
+	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
+			PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
+
+	if (current_is_kswapd())
+		return 0;
+
+	/* cgroup reclaim issued from a workqueue worker */
+	if (cgroup_reclaim(sc) && current_work())
+		return 0;
+
+	if (current_is_khugepaged())
+		return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
+
+	return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
+}
+
 static inline int is_page_cache_freeable(struct folio *folio)
 {
 	/*
@ -2676,7 +2694,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 				     &nr_scanned, sc, lru);

 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
-	item = PGSCAN_KSWAPD + reclaimer_offset();
+	item = PGSCAN_KSWAPD + rue_reclaimer_offset(sc);
 	if (!cgroup_reclaim(sc))
 		__count_vm_events(item, nr_scanned);
 	__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
@ -2693,7 +2711,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 	move_folios_to_lru(lruvec, &folio_list);

 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-	item = PGSTEAL_KSWAPD + reclaimer_offset();
+	item = PGSTEAL_KSWAPD + rue_reclaimer_offset(sc);
 	if (!cgroup_reclaim(sc))
 		__count_vm_events(item, nr_reclaimed);
 	__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
@ -5200,7 +5218,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
 			break;
 	}

-	item = PGSCAN_KSWAPD + reclaimer_offset();
+	item = PGSCAN_KSWAPD + rue_reclaimer_offset(sc);
 	if (!cgroup_reclaim(sc)) {
 		__count_vm_events(item, isolated);
 		__count_vm_events(PGREFILL, sorted);
@ -5390,7 +5408,7 @@ retry:
 	if (walk && walk->batched)
 		reset_batch_size(lruvec, walk);

-	item = PGSTEAL_KSWAPD + reclaimer_offset();
+	item = PGSTEAL_KSWAPD + rue_reclaimer_offset(sc);
 	if (!cgroup_reclaim(sc))
 		__count_vm_events(item, reclaimed);
 	__count_memcg_events(memcg, item, reclaimed);
@ -6717,6 +6735,12 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 				   sc->nr_scanned - scanned,
 				   sc->nr_reclaimed - reclaimed);

+		if (cgroup_reclaim(sc) &&
+		    ((sc->nr_reclaimed >= sc->nr_to_reclaim))) {
+			mem_cgroup_iter_break(target_memcg, memcg);
+			break;
+		}
+
 	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
 }