From 56d80c4ea2ec7c26f48b63980a025147fdcdc4e9 Mon Sep 17 00:00:00 2001
From: Honglin Li
Date: Thu, 31 Aug 2023 15:38:34 +0800
Subject: [PATCH] rue/mm: add memory cgroup async page reclaim mechanism

Introduce a background page reclaim mechanism for memcg. It can be
configured according to the cgroup priorities to apply different
reclaim strategies.

Signed-off-by: Yulei Zhang
Signed-off-by: Mengmeng Chen
Signed-off-by: Chunguang Xu
Signed-off-by: Honglin Li
---
 .../admin-guide/cgroup-v1/memory.rst          |  21 +-
 Documentation/admin-guide/cgroup-v2.rst       |  64 ++++
 include/linux/memcontrol.h                    |   6 +
 include/linux/page_counter.h                  |   9 +
 mm/memcontrol.c                               | 341 +++++++++++++++++-
 mm/page_counter.c                             |  28 ++
 mm/vmscan.c                                   |  32 +-
 7 files changed, 494 insertions(+), 7 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index ff456871bf4b..2234f9a1203c 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -109,6 +109,13 @@ Brief summary of control files.
 memory.kmem.tcp.failcnt             show the number of tcp buf memory usage
                                     hits limits
 memory.kmem.tcp.max_usage_in_bytes  show max tcp buf memory usage recorded
+ memory.async_ratio                  ratio setting for the async reclaim wmark
+ memory.async_high                   high limit to start async reclaim
+ memory.async_low                    low limit to stop async reclaim
+ memory.async_distance_factor        the distance between async_high and
+                                     async_low; valid values are from 1 to
+                                     150000, in fractions of 1000000
+
 ==================================== ==========================================

 1. History
@@ -971,7 +978,19 @@ Test:

 (Expect a bunch of notifications, and eventually, the oom-killer will
 trigger.)

-12. TODO
+12. Async reclaim
+=================
+
+memory.async_ratio adds a memory usage watermark for async reclaim. Valid
+values are from 0 to 100 and represent a percentage of memory.limit; the
+value of percentage * memory.limit is assigned to memory.async_high.
+When pages are charged to a memory cgroup, a work item is scheduled to
+reclaim pages once memory usage exceeds memory.async_high, and the work
+stops when the reclaim reaches memory.async_low. memory.async_ratio = 0
+means async reclaim is disabled; both async_low and async_high are then
+max, which is the default value.
+
+13. TODO
 ========

 1. Make per-cgroup scanner reclaim not-shared pages first

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index b26b5274eaaf..58b6bb89440f 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1275,6 +1275,70 @@ PAGE_SIZE multiple when read back.
 	The max memory usage recorded for the cgroup and its
 	descendants since the creation of the cgroup.

+  memory.async_ratio
+	A read-write single value file which exists on non-root
+	cgroups. The default is 0.
+
+	Memory usage watermark for async reclaim. Valid values are from 0
+	to 100 and represent a percentage of memory.high; the value of
+	percentage * memory.high is assigned to async_high. When pages are
+	charged to a memory cgroup, a work item is scheduled to reclaim
+	pages once memory usage exceeds async_high.
+
+	0 means async reclaim is disabled; both async_low and async_high
+	are then max, which is the default, and the mechanism does not
+	take effect.
+
+  memory.async_high
+	A read-only single value file which exists on non-root cgroups.
+	The default is max.
+
+	Memory usage high watermark for async reclaim; if a cgroup's
+	memory usage exceeds this limit, the async reclaim logic is
+	triggered.
+
+  memory.async_low
+	A read-only single value file which exists on non-root cgroups.
+	The default is max.
+
+	Memory usage low watermark for async reclaim; once a reclaim work
+	item is scheduled, it tries to reclaim up to
+	(async_high - async_low) pages.
+
+  memory.async_distance_factor
+	A read-write single value file which exists on non-root cgroups.
+	The default is 1.
+
+	Defines the distance between async_high and async_low. Valid values
+	are from 1 to 150000, in fractions of 1000000. The default value of
+	1 means the distance between async_high and async_low is 0.0001% of
+	memory.high of the cgroup. The maximum value is 150000, which is
+	15% of memory.high.
+
+  memory.async_ratio_delta
+	A read-write single value file which exists on non-root
+	cgroups. The default is -1.
+
+	Memory usage watermark calculation factor for async reclaim. Valid
+	values are from 1 to 10 and are used to derive memory.async_ratio
+	from the cgroup priority: async_ratio = 100 - (MAX_PRIORITY -
+	cgroup_prio) * async_ratio_delta. The result is then used to
+	calculate the async reclaim wmark.
+
+	The default value -1 is a special value which means the async
+	reclaim wmark calculation does not rely on the cgroup priority;
+	it only depends on the async_ratio value set manually.
+
+  memory.async_distance_delta
+	A read-write single value file which exists on non-root cgroups.
+	The default is 1.
+
+	Per-cgroup priority factor that defines the distance between
+	async_high and async_low. Valid values are from 1 to 50. The
+	formula is async_distance_factor = async_distance_delta *
+	(cgroup_priority_value + 1). The async reclaim mechanism uses the
+	result for the reclaim distance calculation. When
+	async_ratio_delta == -1, this value does not take effect when the
+	cgroup priority changes.
+
   memory.oom.group
 	A read-write single value file which exists on non-root
 	cgroups. The default value is "0".

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 23a70be60c3c..6acda1a95807 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -338,6 +338,12 @@ struct mem_cgroup {
 	int reclaim_failed;
 	struct list_head prio_list;
 	struct list_head prio_list_async;
+	/* per cgroup memory async reclaim */
+	unsigned int async_wmark;
+	unsigned int async_distance_factor;
+	int async_wmark_delta;
+	unsigned int async_distance_delta;
+	struct work_struct async_work;

 	/*
 	 * set > 0 if pages under this cgroup are moving to other cgroup.
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h
index c141ea9a95ef..948f7d90e47c 100644
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -25,6 +25,10 @@ struct page_counter {
 	atomic_long_t low_usage;
 	atomic_long_t children_low_usage;

+	/* async reclaim threshold */
+	unsigned long async_low;
+	unsigned long async_high;
+
 	unsigned long watermark;
 	unsigned long failcnt;

@@ -76,6 +80,11 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
 int page_counter_memparse(const char *buf, const char *max,
 			  unsigned long *nr_pages);

+void page_counter_set_async_high(struct page_counter *counter,
+				 unsigned long nr_pages);
+void page_counter_set_async_low(struct page_counter *counter,
+				unsigned long nr_pages);
+
 static inline void page_counter_reset_watermark(struct page_counter *counter)
 {
 	counter->watermark = page_counter_read(counter);

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 51c18a1e519c..c74899005708 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -125,6 +125,11 @@ static atomic_long_t memcg_reclaimed_count;
 static unsigned long memcg_reclaim_goal;
 static int memcg_cur_reclaim_prio = CGROUP_PRIORITY_MAX;
 static DEFINE_SPINLOCK(memcg_reclaim_prio_lock);
+/* workqueue for async reclaim */
+struct workqueue_struct *memcg_async_reclaim_wq;
+#define ASYNC_DISTANCE_DIV	1000000
+#define ASYNC_RATIO_DIV		100
+#define ASYNC_DISTANCE_DEF	1

 /* Whether legacy memory+swap accounting is active */
 static bool do_memsw_account(void)
@@ -1875,6 +1880,10 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
 		       memcg_events(memcg, PGSTEAL_KSWAPD) +
 		       memcg_events(memcg, PGSTEAL_DIRECT) +
 		       memcg_events(memcg, PGSTEAL_KHUGEPAGED));
+	seq_buf_printf(s, "pgscan_in_background %lu\n",
+		       memcg_events(memcg, PGSCAN_KSWAPD));
+	seq_buf_printf(s, "pgsteal_in_background %lu\n",
+		       memcg_events(memcg, PGSTEAL_KSWAPD));

 	for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
 		if (memcg_vm_event_stat[i] == PGPGIN ||
@@ -2652,6 +2661,32 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
 	return 0;
 }

+static bool need_memcg_async_reclaim(struct mem_cgroup *memcg)
+{
+	if (!sysctl_vm_memory_qos)
+		return false;
+
+	return page_counter_read(&memcg->memory) > memcg->memory.async_high;
+}
+
+static void async_reclaim_func(struct work_struct *work)
+{
+	struct mem_cgroup *memcg;
+	unsigned long usage, nr_pages;
+
+	memcg = container_of(work, struct mem_cgroup, async_work);
+	/*
+	 * usage and async_low are unsigned; compare before subtracting,
+	 * as "usage - async_low <= 0" can never detect an underflow.
+	 */
+	usage = page_counter_read(&memcg->memory);
+	if (usage <= memcg->memory.async_low)
+		return;
+
+	nr_pages = min(usage - memcg->memory.async_low,
+		       memcg->memory.async_high - memcg->memory.async_low);
+	memcg_memory_event(memcg, MEMCG_HIGH);
+	try_to_free_mem_cgroup_pages(memcg, nr_pages, GFP_KERNEL, true);
+}
+
 static unsigned long reclaim_high(struct mem_cgroup *memcg,
 				  unsigned int nr_pages,
 				  gfp_t gfp_mask)
@@ -2915,6 +2950,47 @@ out:
 	css_put(&memcg->css);
 }

+static void setup_async_wmark(struct mem_cgroup *memcg)
+{
+	unsigned long high_throttle, low_throttle, distance;
+	unsigned long high = cgroup_subsys_on_dfl(memory_cgrp_subsys) ?
+		memcg->memory.high : memcg->memory.max;
+
+	if (memcg->async_wmark) {
+		high_throttle = (memcg->async_wmark * high) / ASYNC_RATIO_DIV;
+		distance = mult_frac(high, memcg->async_distance_factor,
+				     ASYNC_DISTANCE_DIV);
+		if (distance >= high_throttle)
+			low_throttle = memcg->memory.low;
+		else
+			low_throttle = high_throttle - distance;
+	} else {
+		high_throttle = PAGE_COUNTER_MAX;
+		low_throttle = PAGE_COUNTER_MAX;
+	}
+	page_counter_set_async_high(&memcg->memory, high_throttle);
+	page_counter_set_async_low(&memcg->memory, low_throttle);
+}
+
+static void async_reclaim_reset_factor(struct mem_cgroup *memcg,
+				       unsigned int new_prio)
+{
+	unsigned int wmark, distance;
+
+	if (memcg->async_wmark_delta < 0)
+		return;
+
+	wmark = ASYNC_RATIO_DIV -
+		(CGROUP_PRIORITY_MAX - new_prio) * memcg->async_wmark_delta;
+	xchg(&memcg->async_wmark, wmark);
+	distance = memcg->async_distance_delta * (new_prio + 1);
+	xchg(&memcg->async_distance_factor, distance);
+
+	setup_async_wmark(memcg);
+	if (need_memcg_async_reclaim(memcg))
+		queue_work(memcg_async_reclaim_wq, &memcg->async_work);
+}
+
 static struct task_struct *memcg_priod;
 static struct task_struct *memcg_priod_async;
 static DECLARE_WAIT_QUEUE_HEAD(memcg_prio_reclaim_wq);
@@ -3052,6 +3128,7 @@ int mem_cgroup_notify_prio_change(struct cgroup_subsys_state *css,
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);

+	async_reclaim_reset_factor(memcg, new_prio);
 	return memcg_notify_prio_change(memcg, old_prio, new_prio);
 }

@@ -3239,6 +3316,12 @@ done_restock:
 	do {
 		bool mem_high, swap_high;

+		if (need_memcg_async_reclaim(memcg)) {
+			/* Kick off per memory cgroup async reclaim */
+			queue_work(memcg_async_reclaim_wq, &memcg->async_work);
+			break;
+		}
+
 		mem_high = page_counter_read(&memcg->memory) >
 			READ_ONCE(memcg->memory.high);
 		swap_high = page_counter_read(&memcg->swap) >
@@ -3972,8 +4055,14 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
 		}
 	} while (true);

-	if (!ret && enlarge)
-		memcg_oom_recover(memcg);
+	if (!ret) {
+		setup_async_wmark(memcg);
+		if (need_memcg_async_reclaim(memcg))
+			queue_work(memcg_async_reclaim_wq, &memcg->async_work);
+
+		if (enlarge)
+			memcg_oom_recover(memcg);
+	}

 	return ret;
 }
@@ -4175,6 +4264,8 @@ enum {
 	RES_MAX_USAGE,
 	RES_FAILCNT,
 	RES_SOFT_LIMIT,
+	ASYNC_HIGH_LIMIT,
+	ASYNC_LOW_LIMIT,
 };

 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
@@ -4215,6 +4306,10 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
 		return counter->failcnt;
 	case RES_SOFT_LIMIT:
 		return (u64)READ_ONCE(memcg->soft_limit) * PAGE_SIZE;
+	case ASYNC_HIGH_LIMIT:
+		return (u64)counter->async_high * PAGE_SIZE;
+	case ASYNC_LOW_LIMIT:
+		return (u64)counter->async_low * PAGE_SIZE;
 	default:
 		BUG();
 	}
@@ -4656,6 +4751,11 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
 		seq_buf_printf(s, "file_cost %lu\n", file_cost);
 	}
 #endif
+
+	seq_buf_printf(s, "pgscan_in_background %lu\n",
+		       memcg_events(memcg, PGSCAN_KSWAPD));
+	seq_buf_printf(s, "pgsteal_in_background %lu\n",
+		       memcg_events(memcg, PGSTEAL_KSWAPD));
 }

 #ifdef CONFIG_TEXT_UNEVICTABLE
@@ -5898,6 +5998,74 @@ static int mem_cgroup_unevictable_percent_write(struct cgroup_subsys_state *css,
 }
 #endif

+static int memory_async_reclaim_wmark_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+	seq_printf(m, "%d\n", READ_ONCE(memcg->async_wmark));
+
+	return 0;
+}
+
+static ssize_t memory_async_reclaim_wmark_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int ret, wmark;
+
+	buf = strstrip(buf);
+	if (!buf)
+		return -EINVAL;
+
+	ret = kstrtoint(buf, 0, &wmark);
+	if (ret)
+		return ret;
+
+	if (wmark < 0 || wmark > 100)
+		return -EINVAL;
+
+	xchg(&memcg->async_wmark, wmark);
+
+	setup_async_wmark(memcg);
+	if (need_memcg_async_reclaim(memcg))
+		queue_work(memcg_async_reclaim_wq, &memcg->async_work);
+
+	return nbytes;
+}
+
+static int memory_async_distance_factor_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+	seq_printf(m, "%d\n", READ_ONCE(memcg->async_distance_factor));
+
+	return 0;
+}
+
+static ssize_t memory_async_distance_factor_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int ret, factor;
+
+	buf = strstrip(buf);
+	if (!buf)
+		return -EINVAL;
+
+	ret = kstrtoint(buf, 0, &factor);
+	if (ret)
+		return ret;
+
+	if (factor > 150000 || factor < 1)
+		return -EINVAL;
+
+	xchg(&memcg->async_distance_factor, factor);
+
+	setup_async_wmark(memcg);
+
+	return nbytes;
+}
+
 static int memory_oom_group_show(struct seq_file *m, void *v);
 static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
 				      char *buf, size_t nbytes, loff_t off);
@@ -5972,6 +6140,30 @@ static struct cftype mem_cgroup_legacy_files[] = {
 		.seq_show = memory_oom_group_show,
 		.write = memory_oom_group_write,
 	},
+	{
+		.name = "async_ratio",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_async_reclaim_wmark_show,
+		.write = memory_async_reclaim_wmark_write,
+	},
+	{
+		.name = "async_high",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = MEMFILE_PRIVATE(_MEM, ASYNC_HIGH_LIMIT),
+		.read_u64 = mem_cgroup_read_u64,
+	},
+	{
+		.name = "async_low",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = MEMFILE_PRIVATE(_MEM, ASYNC_LOW_LIMIT),
+		.read_u64 = mem_cgroup_read_u64,
+	},
+	{
+		.name = "async_distance_factor",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_async_distance_factor_show,
+		.write = memory_async_distance_factor_write,
+	},
 	{
 		.name = "cgroup.event_control",	/* XXX: for compat */
 		.write = memcg_write_event_control,
@@ -6352,6 +6544,7 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
 		goto fail;

 	INIT_WORK(&memcg->high_work, high_work_func);
+	INIT_WORK(&memcg->async_work, async_reclaim_func);
 	INIT_LIST_HEAD(&memcg->oom_notify);
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
@@ -6413,6 +6606,12 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 #endif
 		WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
 		WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
+		memcg->async_wmark = parent->async_wmark;
+		memcg->async_distance_factor = parent->async_distance_factor ?
+						: ASYNC_DISTANCE_DEF;
+		memcg->async_wmark_delta = parent->async_wmark_delta;
+		memcg->async_distance_delta = parent->async_distance_delta ?
+						: ASYNC_DISTANCE_DEF;
 #ifdef CONFIG_MEMCG_ZRAM
 		memcg->zram_prio = parent->zram_prio;
 #endif
@@ -6426,7 +6625,12 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 		page_counter_init(&memcg->swap, NULL);
 		page_counter_init(&memcg->kmem, NULL);
 		page_counter_init(&memcg->tcpmem, NULL);
+	}
+	setup_async_wmark(memcg);
+
+	if (!parent) {
+		memcg->async_wmark_delta = -1;
 		root_mem_cgroup = memcg;
 		return &memcg->css;
 	}
@@ -6483,6 +6687,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
 	idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
 	spin_unlock(&memcg_idr_lock);

+	async_reclaim_reset_factor(memcg, memcg_get_prio(memcg));
 	memcg_notify_prio_change(memcg, 0, memcg_get_prio(memcg));

 	return 0;
@@ -6514,6 +6719,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)

 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
+	page_counter_set_async_high(&memcg->memory, PAGE_COUNTER_MAX);
+	page_counter_set_async_low(&memcg->memory, PAGE_COUNTER_MAX);

 	memcg_offline_kmem(memcg);
 	reparent_shrinker_deferred(memcg);
@@ -6557,6 +6764,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)

 	vmpressure_cleanup(&memcg->vmpressure);
 	cancel_work_sync(&memcg->high_work);
+	cancel_work_sync(&memcg->async_work);
 	mem_cgroup_remove_from_trees(memcg);
 	free_shrinker_info(memcg);
 	mem_cgroup_free(memcg);
@@ -6585,6 +6793,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 	page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
+	page_counter_set_async_high(&memcg->memory, PAGE_COUNTER_MAX);
+	page_counter_set_async_low(&memcg->memory, PAGE_COUNTER_MAX);
 	page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
 	WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
 	page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
@@ -7587,6 +7797,10 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 		break;
 	}

+	setup_async_wmark(memcg);
+	if (need_memcg_async_reclaim(memcg))
+		queue_work(memcg_async_reclaim_wq, &memcg->async_work);
+
 	memcg_wb_domain_size_changed(memcg);
 	return nbytes;
 }
@@ -7640,6 +7854,10 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 		break;
 	}

+	setup_async_wmark(memcg);
+	if (need_memcg_async_reclaim(memcg))
+		queue_work(memcg_async_reclaim_wq, &memcg->async_work);
+
 	memcg_wb_domain_size_changed(memcg);
 	return nbytes;
 }
@@ -7817,6 +8035,84 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 	return nbytes;
 }

+static int memory_async_high_wmark_show(struct seq_file *m, void *v)
+{
+	return seq_puts_memcg_tunable(m,
+		READ_ONCE(mem_cgroup_from_seq(m)->memory.async_high));
+}
+
+static int memory_async_low_wmark_show(struct seq_file *m, void *v)
+{
+	return seq_puts_memcg_tunable(m,
+		READ_ONCE(mem_cgroup_from_seq(m)->memory.async_low));
+}
+
+static int memory_async_distance_delta_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+	seq_printf(m, "%d\n", READ_ONCE(memcg->async_distance_delta));
+
+	return 0;
+}
+
+static ssize_t memory_async_distance_delta_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int ret, delta;
+
+	buf = strstrip(buf);
+	if (!buf)
+		return -EINVAL;
+
+	ret = kstrtoint(buf, 0, &delta);
+	if (ret)
+		return ret;
+
+	if (delta > 50 || delta < 1)
+		return -EINVAL;
+
+	xchg(&memcg->async_distance_delta, delta);
+
+	async_reclaim_reset_factor(memcg, memcg_get_prio(memcg));
+
+	return nbytes;
+}
+
+static int memory_async_wmark_delta_show(struct seq_file *m, void *v)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+	seq_printf(m, "%d\n", READ_ONCE(memcg->async_wmark_delta));
+
+	return 0;
+}
+
+static ssize_t memory_async_wmark_delta_write(struct kernfs_open_file *of,
+				char *buf, size_t nbytes, loff_t off)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+	int ret, delta;
+
+	buf = strstrip(buf);
+	if (!buf)
+		return -EINVAL;
+
+	ret = kstrtoint(buf, 0, &delta);
+	if (ret)
+		return ret;
+
+	if ((delta > 10 || delta < 1) && delta != -1)
+		return -EINVAL;
+
+	xchg(&memcg->async_wmark_delta, delta);
+
+	async_reclaim_reset_factor(memcg, memcg_get_prio(memcg));
+
+	return nbytes;
+}
+
 static struct cftype memory_files[] = {
 	{
 		.name = "current",
@@ -7890,6 +8186,40 @@ static struct cftype memory_files[] = {
 		.flags = CFTYPE_NS_DELEGATABLE,
 		.write = memory_reclaim,
 	},
+	{
+		.name = "async_ratio",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_async_reclaim_wmark_show,
+		.write = memory_async_reclaim_wmark_write,
+	},
+	{
+		.name = "async_high",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_async_high_wmark_show,
+	},
+	{
+		.name = "async_low",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_async_low_wmark_show,
+	},
+	{
+		.name = "async_distance_factor",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_async_distance_factor_show,
+		.write = memory_async_distance_factor_write,
+	},
+	{
+		.name = "async_ratio_delta",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_async_wmark_delta_show,
+		.write = memory_async_wmark_delta_write,
+	},
+	{
+		.name = "async_distance_delta",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = memory_async_distance_delta_show,
+		.write = memory_async_distance_delta_write,
+	},
 	{ }	/* terminate */
 };

@@ -8479,6 +8809,13 @@ static int __init mem_cgroup_init(void)
 {
 	int cpu, node;

+	memcg_async_reclaim_wq = alloc_workqueue("memcg_async_reclaim",
+				WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_FREEZABLE,
+				WQ_UNBOUND_MAX_ACTIVE);
+
+	if (!memcg_async_reclaim_wq)
+		return -ENOMEM;
+
 	/*
 	 * Currently s32 type (can refer to struct batched_lruvec_stat) is
 	 * used for per-memcg-per-cpu caching of per-node statistics. In order

diff --git a/mm/page_counter.c b/mm/page_counter.c
index db20d6452b71..bea3bcb8f0e1 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -234,6 +234,34 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
 		propagate_protected_usage(c, atomic_long_read(&c->usage));
 }

+/**
+ * page_counter_set_async_high - set the start throttle of memory for
+ * memcg async reclaim
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_async_high(struct page_counter *counter,
+				 unsigned long nr_pages)
+{
+	xchg(&counter->async_high, nr_pages);
+}
+
+/**
+ * page_counter_set_async_low - set the stop throttle of memory for
+ * memcg async reclaim
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_async_low(struct page_counter *counter,
+				unsigned long nr_pages)
+{
+	xchg(&counter->async_low, nr_pages);
+}
+
 /**
  * page_counter_memparse - memparse() for page counter limits
  * @buf: string to parse

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9665ac677eb4..dd7061c8b495 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1163,6 +1163,24 @@ static int reclaimer_offset(void)
 	return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
 }

+static int rue_reclaimer_offset(struct scan_control *sc)
+{
+	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+		     PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
+	BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+		     PGSCAN_DIRECT - PGSCAN_KSWAPD);
+	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
+		     PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
+	BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
+		     PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
+
+	if (current_is_kswapd() || (cgroup_reclaim(sc) && current_work()))
+		return 0;
+	if (current_is_khugepaged())
+		return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
+	return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
+}
+
 static inline int is_page_cache_freeable(struct folio *folio)
 {
 	/*
@@ -2676,7 +2694,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 				     &nr_scanned, sc, lru);

 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
-	item = PGSCAN_KSWAPD + reclaimer_offset();
+	item = PGSCAN_KSWAPD + rue_reclaimer_offset(sc);
 	if (!cgroup_reclaim(sc))
 		__count_vm_events(item, nr_scanned);
 	__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
@@ -2693,7 +2711,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
 	move_folios_to_lru(lruvec, &folio_list);

 	__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
-	item = PGSTEAL_KSWAPD + reclaimer_offset();
+	item = PGSTEAL_KSWAPD + rue_reclaimer_offset(sc);
 	if (!cgroup_reclaim(sc))
 		__count_vm_events(item, nr_reclaimed);
 	__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
@@ -5200,7 +5218,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
 		break;
 	}

-	item = PGSCAN_KSWAPD + reclaimer_offset();
+	item = PGSCAN_KSWAPD + rue_reclaimer_offset(sc);
 	if (!cgroup_reclaim(sc)) {
 		__count_vm_events(item, isolated);
 		__count_vm_events(PGREFILL, sorted);
@@ -5390,7 +5408,7 @@ retry:
 	if (walk && walk->batched)
 		reset_batch_size(lruvec, walk);

-	item = PGSTEAL_KSWAPD + reclaimer_offset();
+	item = PGSTEAL_KSWAPD + rue_reclaimer_offset(sc);
 	if (!cgroup_reclaim(sc))
 		__count_vm_events(item, reclaimed);
 	__count_memcg_events(memcg, item, reclaimed);
@@ -6717,6 +6735,12 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 			   sc->nr_scanned - scanned,
 			   sc->nr_reclaimed - reclaimed);

+		if (cgroup_reclaim(sc) &&
+		    (sc->nr_reclaimed >= sc->nr_to_reclaim)) {
+			mem_cgroup_iter_break(target_memcg, memcg);
+			break;
+		}
+
 	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
 }
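
---

Editor's note for reviewers: below is a minimal user-space sketch of the
watermark arithmetic that setup_async_wmark() performs, which may help
sanity-check a configuration before writing it to the new cgroup files
(e.g. "echo 80 > memory.async_ratio"). The names compute_async_wmarks
and struct async_wmarks are illustrative only and do not exist in the
patch; the constants mirror ASYNC_RATIO_DIV (100) and ASYNC_DISTANCE_DIV
(1000000) above, and 0 stands in for the kernel's memory.low fallback
and ~0UL for PAGE_COUNTER_MAX.

	#include <stdio.h>

	/* Mirrors the divisors used by setup_async_wmark() in the patch. */
	#define ASYNC_RATIO_DIV		100UL
	#define ASYNC_DISTANCE_DIV	1000000UL

	struct async_wmarks {
		unsigned long high;	/* start async reclaim above this */
		unsigned long low;	/* stop async reclaim below this */
	};

	/*
	 * high_limit is memory.high (v2) or memory.max (v1) in pages,
	 * ratio is memory.async_ratio (0-100), factor is
	 * memory.async_distance_factor (1-150000).
	 */
	static struct async_wmarks
	compute_async_wmarks(unsigned long high_limit, unsigned long ratio,
			     unsigned long factor)
	{
		struct async_wmarks w;
		unsigned long distance;

		if (!ratio) {
			/* ratio == 0 disables async reclaim entirely */
			w.high = w.low = ~0UL;
			return w;
		}

		w.high = ratio * high_limit / ASYNC_RATIO_DIV;
		/* the kernel computes this with mult_frac() to avoid overflow */
		distance = (unsigned long)((unsigned long long)high_limit *
					   factor / ASYNC_DISTANCE_DIV);
		/* the kernel falls back to memory.low here; the sketch uses 0 */
		w.low = distance >= w.high ? 0 : w.high - distance;
		return w;
	}

	int main(void)
	{
		/* 8 GiB memory.high in 4 KiB pages, ratio 80%, factor 150000 (15%) */
		struct async_wmarks w = compute_async_wmarks(2097152UL, 80, 150000);

		/* prints async_high=1677721 async_low=1363149 */
		printf("async_high=%lu async_low=%lu\n", w.high, w.low);
		return 0;
	}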