diff --git a/arch/riscv/configs/tencent.config b/arch/riscv/configs/tencent.config index 88bc831f447e..776fe6f23db7 100644 --- a/arch/riscv/configs/tencent.config +++ b/arch/riscv/configs/tencent.config @@ -140,7 +140,6 @@ CONFIG_DAMON_SYSFS=y CONFIG_DAMON_DBGFS=y CONFIG_DAMON_RECLAIM=y CONFIG_DAMON_LRU_SORT=y -CONFIG_PAGECACHE_LIMIT=y CONFIG_NET=y CONFIG_PACKET=y CONFIG_PACKET_DIAG=m diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6acda1a95807..fb165b7fc01f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -55,6 +55,8 @@ enum memcg_memory_event { MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, + MEMCG_PAGECACHE_MAX, + MEMCG_PAGECACHE_OOM, MEMCG_NR_MEMORY_EVENTS, }; @@ -237,6 +239,10 @@ struct mem_cgroup { struct page_counter memsw; /* v1 only */ }; + struct page_counter pagecache; + u64 pagecache_reclaim_ratio; + u32 pagecache_max_ratio; + /* Legacy consumer-oriented counters */ struct page_counter kmem; /* v1 only */ struct page_counter tcpmem; /* v1 only */ @@ -403,6 +409,21 @@ struct mem_cgroup { */ #define MEMCG_CHARGE_BATCH 64U +/* + * Iteration constructs for visiting all cgroups (under a tree). If + * loops are exited prematurely (break), mem_cgroup_iter_break() must + * be used for reference counting. + */ +#define for_each_mem_cgroup_tree(iter, root) \ + for (iter = mem_cgroup_iter(root, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(root, iter, NULL)) + +#define for_each_mem_cgroup(iter) \ + for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(NULL, iter, NULL)) + extern struct mem_cgroup *root_mem_cgroup; enum page_memcg_data_flags { @@ -1841,6 +1862,12 @@ int alloc_shrinker_info(struct mem_cgroup *memcg); void free_shrinker_info(struct mem_cgroup *memcg); void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); void reparent_shrinker_deferred(struct mem_cgroup *memcg); + +extern int sysctl_vm_memory_qos; +extern unsigned int vm_pagecache_limit_retry_times; +extern void +mem_cgroup_shrink_pagecache(struct mem_cgroup *memcg, gfp_t gfp_mask); + #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ab397f104240..758fa26cdd07 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -847,7 +847,6 @@ struct zone { */ long lowmem_reserve[MAX_NR_ZONES]; -#ifdef CONFIG_PAGECACHE_LIMIT /* * This atomic counter is set when there is pagecache limit * reclaim going on on this particular zone. Other potential @@ -855,7 +854,6 @@ struct zone { * bouncing. 
*/ atomic_t pagecache_reclaim; -#endif #ifdef CONFIG_NUMA int node; diff --git a/include/linux/swap.h b/include/linux/swap.h index fd45d18b416c..556339b5b9f3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -407,15 +407,19 @@ extern unsigned long zone_reclaimable_pages(struct zone *zone); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); -#ifdef CONFIG_PAGECACHE_LIMIT +#define ADDITIONAL_RECLAIM_RATIO 2 extern int vm_pagecache_limit_ratio; extern int vm_pagecache_limit_reclaim_ratio; extern unsigned long vm_pagecache_limit_pages; extern unsigned long vm_pagecache_limit_reclaim_pages; extern unsigned int vm_pagecache_ignore_dirty; extern unsigned int vm_pagecache_limit_async; +extern unsigned int vm_pagecache_limit_global; extern unsigned int vm_pagecache_ignore_slab; +extern long shrink_page_cache_memcg(gfp_t mask, struct mem_cgroup *memcg, + unsigned long nr_pages); +extern unsigned long __pagecache_over_limit(void); extern unsigned long pagecache_over_limit(void); extern int kpagecache_limitd_run(void); extern void kpagecache_limitd_stop(void); @@ -424,15 +428,6 @@ static inline bool pagecache_limit_should_shrink(void) { return unlikely(vm_pagecache_limit_pages) && pagecache_over_limit(); } -#else -extern inline void shrink_page_cache(gfp_t mask, struct page *page) -{ -} -static inline bool pagecache_limit_should_shrink(void) -{ - return 0; -} -#endif #define MEMCG_RECLAIM_MAY_SWAP (1 << 1) #define MEMCG_RECLAIM_PROACTIVE (1 << 2) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 04bf387f126e..bfda2c9871f9 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -87,6 +87,8 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, void *, int proc_do_large_bitmap(struct ctl_table *, int, void *, size_t *, loff_t *); int netcls_do_large_bitmap(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); +int proc_pagecache_system_usage(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); int proc_do_static_key(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1589b6dd1818..e118fa269bbb 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -592,8 +592,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, buffer, lenp, ppos, conv, data); } -#ifdef CONFIG_PAGECACHE_LIMIT -#define ADDITIONAL_RECLAIM_RATIO 2 static int setup_pagecache_limit(void) { /* reclaim ADDITIONAL_RECLAIM_PAGES more than limit. 
*/ @@ -661,7 +659,6 @@ static int pc_limit_async_handler(struct ctl_table *table, int write, return ret; } -#endif /* CONFIG_PAGECACHE_LIMIT */ static int do_proc_douintvec_w(unsigned int *tbl_data, struct ctl_table *table, @@ -2609,6 +2606,8 @@ static struct ctl_table kern_table[] = { { } }; +unsigned long vm_pagecache_system_usage; + static struct ctl_table vm_table[] = { { .procname = "overcommit_memory", @@ -2852,7 +2851,6 @@ static struct ctl_table vm_table[] = { .extra2 = (void *)&mmap_rnd_compat_bits_max, }, #endif -#ifdef CONFIG_PAGECACHE_LIMIT { .procname = "pagecache_limit_ratio", .data = &vm_pagecache_limit_ratio, @@ -2892,8 +2890,32 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#endif /* CONFIG_PAGECACHE_LIMIT */ #ifdef CONFIG_MEMCG + { + .procname = "pagecache_limit_global", + .data = &vm_pagecache_limit_global, + .maxlen = sizeof(vm_pagecache_limit_global), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "pagecache_limit_retry_times", + .data = &vm_pagecache_limit_retry_times, + .maxlen = sizeof(vm_pagecache_limit_retry_times), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_MAXOLDUID, + }, + { + .procname = "pagecache_system_usage", + .data = &vm_pagecache_system_usage, + .maxlen = sizeof(unsigned long), + .mode = 0444, + .proc_handler = proc_pagecache_system_usage, + }, { .procname = "memory_qos", .data = &sysctl_vm_memory_qos, diff --git a/mm/Kconfig b/mm/Kconfig index 9a5cbcc84873..5ba91156e46f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1271,14 +1271,6 @@ config LOCK_MM_AND_FIND_VMA source "mm/damon/Kconfig" -config PAGECACHE_LIMIT - bool "Page cache limit" - help - This options allow user to set a limit for the page cache. - For details, see Documentation/mm/pagecache-limit. - - If unsure, say N. - config ENHANCED_MM bool "Enable enhanced mm support (EMM)" depends on MEMCG diff --git a/mm/filemap.c b/mm/filemap.c index ca878e126e05..7c2078ec803b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -848,6 +848,9 @@ noinline int __filemap_add_folio(struct address_space *mapping, int huge = folio_test_hugetlb(folio); bool charged = false; long nr = 1; +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg; +#endif VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio); @@ -861,6 +864,14 @@ noinline int __filemap_add_folio(struct address_space *mapping, charged = true; xas_set_order(&xas, index, folio_order(folio)); nr = folio_nr_pages(folio); + +#ifdef CONFIG_MEMCG + /* For a successful charge, folio->memcg_data must be set. 
*/ + memcg = folio_memcg(folio); + + for (; memcg; memcg = parent_mem_cgroup(memcg)) + mem_cgroup_shrink_pagecache(memcg, gfp); +#endif } gfp &= GFP_RECLAIM_MASK; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c74899005708..8486580b23c1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -112,6 +112,11 @@ static bool cgroup_memory_nobpf __ro_after_init; static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif +#define MEMCG_PAGECACHE_RETRIES 20 +#define DEFAULT_PAGE_RECLAIM_RATIO 5 +#define PAGECACHE_MAX_RATIO_MIN 5 +#define PAGECACHE_MAX_RATIO_MAX 100 + int sysctl_vm_memory_qos; /* default has none reclaim priority */ int sysctl_vm_qos_highest_reclaim_prio = CGROUP_PRIORITY_MAX; @@ -254,21 +259,6 @@ enum res_type { #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) -/* - * Iteration constructs for visiting all cgroups (under a tree). If - * loops are exited prematurely (break), mem_cgroup_iter_break() must - * be used for reference counting. - */ -#define for_each_mem_cgroup_tree(iter, root) \ - for (iter = mem_cgroup_iter(root, NULL, NULL); \ - iter != NULL; \ - iter = mem_cgroup_iter(root, iter, NULL)) - -#define for_each_mem_cgroup(iter) \ - for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ - iter != NULL; \ - iter = mem_cgroup_iter(NULL, iter, NULL)) - static inline bool task_is_dying(void) { return tsk_is_oom_victim(current) || fatal_signal_pending(current) || @@ -890,6 +880,13 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); memcg_rstat_updated(memcg, val); + + if (idx == NR_FILE_PAGES) { + if (val > 0) + page_counter_charge(&memcg->pagecache, val); + else + page_counter_uncharge(&memcg->pagecache, -val); + } memcg_stats_unlock(); } @@ -4005,6 +4002,8 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, } #endif +static void pagecache_set_limit(struct mem_cgroup *memcg); + static DEFINE_MUTEX(memcg_max_mutex); static int mem_cgroup_resize_max(struct mem_cgroup *memcg, @@ -4062,6 +4061,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, if (enlarge) memcg_oom_recover(memcg); + pagecache_set_limit(memcg); } return ret; @@ -4206,6 +4206,134 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, return -EINVAL; } +#define MIN_PAGECACHE_PAGES 16 +unsigned int +vm_pagecache_limit_retry_times __read_mostly = MEMCG_PAGECACHE_RETRIES; + +void mem_cgroup_shrink_pagecache(struct mem_cgroup *memcg, gfp_t gfp_mask) +{ + long pages_reclaimed; + unsigned long pages_used, pages_max, goal_pages_used, pre_used; + unsigned int retry_times = 0; + unsigned int limit_retry_times; + u32 max_ratio; + + if (!sysctl_vm_memory_qos || vm_pagecache_limit_global) + return; + + if (!memcg || mem_cgroup_is_root(memcg)) + return; + + max_ratio = READ_ONCE(memcg->pagecache_max_ratio); + if (max_ratio == PAGECACHE_MAX_RATIO_MAX) + return; + + pages_max = READ_ONCE(memcg->pagecache.max); + if (pages_max == PAGE_COUNTER_MAX) + return; + + if (unlikely(task_is_dying())) + return; + + if (unlikely(current->flags & PF_MEMALLOC)) + return; + + if (unlikely(task_in_memcg_oom(current))) + return; + + if (!gfpflags_allow_blocking(gfp_mask)) + return; + + pages_used = page_counter_read(&memcg->pagecache); + limit_retry_times = READ_ONCE(vm_pagecache_limit_retry_times); + goal_pages_used = (100 - READ_ONCE(memcg->pagecache_reclaim_ratio)) + * pages_max / 100; + goal_pages_used = max_t(unsigned long, MIN_PAGECACHE_PAGES, + goal_pages_used); + 
+ if (pages_used > pages_max) + memcg_memory_event(memcg, MEMCG_PAGECACHE_MAX); + + while (pages_used > goal_pages_used) { + if (fatal_signal_pending(current)) + break; + + pre_used = pages_used; + pages_reclaimed = shrink_page_cache_memcg(gfp_mask, memcg, + pages_used - goal_pages_used); + + if (pages_reclaimed == -EINVAL) + return; + + if (limit_retry_times == 0) + goto next_shrink; + + if (pages_reclaimed == 0) { + io_schedule_timeout(HZ/10); + retry_times++; + } else + retry_times = 0; + + if (retry_times > limit_retry_times) { + pr_warn("pagecache limit: failed to reclaim enough page cache pages after repeated retries.\n"); + break; + } + +next_shrink: + pages_used = page_counter_read(&memcg->pagecache); + cond_resched(); + } +} + +static u64 pagecache_reclaim_ratio_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return memcg->pagecache_reclaim_ratio; +} + +static ssize_t pagecache_reclaim_ratio_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + u64 reclaim_ratio; + int ret; + unsigned long nr_pages; + + if (!sysctl_vm_memory_qos) { + pr_warn("vm.memory_qos must be enabled first.\n"); + return -EINVAL; + } + + if (vm_pagecache_limit_global) { + pr_warn("vm.pagecache_limit_global must be disabled first.\n"); + return -EINVAL; + } + + buf = strstrip(buf); + if (!buf) + return -EINVAL; + + ret = kstrtou64(buf, 0, &reclaim_ratio); + if (ret) + return ret; + + if ((reclaim_ratio > 0) && (reclaim_ratio < 100)) { + memcg->pagecache_reclaim_ratio = reclaim_ratio; + mem_cgroup_shrink_pagecache(memcg, GFP_KERNEL); + return nbytes; + } else if (reclaim_ratio == 100) { + nr_pages = page_counter_read(&memcg->pagecache); + + /* try to reclaim once */ + shrink_page_cache_memcg(GFP_KERNEL, memcg, nr_pages); + return nbytes; + } + + return -EINVAL; +} + static u64 mem_cgroup_priority_oom_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -4226,6 +4354,134 @@ static int mem_cgroup_priority_oom_write(struct cgroup_subsys_state *css, return 0; } +static u64 pagecache_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return (u64)page_counter_read(&memcg->pagecache) * PAGE_SIZE; +} + +static u64 memory_pagecache_max_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return memcg->pagecache_max_ratio; +} + +unsigned long mem_cgroup_pagecache_get_reclaim_pages(struct mem_cgroup *memcg) +{ + unsigned long goal_pages_used, pages_used, pages_max; + + if ((!memcg) || (mem_cgroup_is_root(memcg))) + return 0; + + pages_max = READ_ONCE(memcg->pagecache.max); + if (pages_max == PAGE_COUNTER_MAX) + return 0; + + goal_pages_used = (100 - READ_ONCE(memcg->pagecache_reclaim_ratio)) + * pages_max / 100; + goal_pages_used = max_t(unsigned long, MIN_PAGECACHE_PAGES, + goal_pages_used); + pages_used = page_counter_read(&memcg->pagecache); + + return pages_used > pages_max ?
pages_used - goal_pages_used : 0; +} + +static void pagecache_set_limit(struct mem_cgroup *memcg) +{ + unsigned long max, pages_max; + u32 max_ratio; + + pages_max = READ_ONCE(memcg->memory.max); + max_ratio = READ_ONCE(memcg->pagecache_max_ratio); + max = ((pages_max * max_ratio) / 100); + xchg(&memcg->pagecache.max, max); +} + +static ssize_t memory_pagecache_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int nr_reclaims = vm_pagecache_limit_retry_times; + unsigned long max; + long pages_reclaimed; + int ret = 0; + u64 max_ratio, old; + + if (!sysctl_vm_memory_qos) { + pr_warn("vm.memory_qos must be enabled first.\n"); + return -EINVAL; + } + + if (vm_pagecache_limit_global) { + pr_warn("vm.pagecache_limit_global must be disabled first.\n"); + return -EINVAL; + } + + if (!buf) + return -EINVAL; + + ret = kstrtou64(buf, 0, &max_ratio); + if (ret) + return ret; + + if (max_ratio > PAGECACHE_MAX_RATIO_MAX || + max_ratio < PAGECACHE_MAX_RATIO_MIN) + return -EINVAL; + + if (READ_ONCE(memcg->memory.max) == PAGE_COUNTER_MAX) { + pr_warn("pagecache limit not allowed for cgroup without memory limit set\n"); + return -EPERM; + } + + old = READ_ONCE(memcg->pagecache_max_ratio); + memcg->pagecache_max_ratio = max_ratio; + pagecache_set_limit(memcg); + max = READ_ONCE(memcg->pagecache.max); + + for (;;) { + unsigned long pages_used = page_counter_read(&memcg->pagecache); + + if (pages_used <= max) + break; + + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + if (nr_reclaims) { + pages_reclaimed = + shrink_page_cache_memcg(GFP_KERNEL, memcg, + mem_cgroup_pagecache_get_reclaim_pages(memcg)); + + if (pages_reclaimed == -EINVAL) { + pr_warn("vm.pagecache_limit_global must be disabled first.\n"); + return -EINVAL; + } + + if (pages_reclaimed == 0) { + io_schedule_timeout(HZ/10); + nr_reclaims--; + cond_resched(); + } else + nr_reclaims = vm_pagecache_limit_retry_times; + + continue; + } + + memcg->pagecache_max_ratio = old; + pagecache_set_limit(memcg); + pr_warn("pagecache limit: failed to reclaim enough page cache pages after repeated retries.\n"); + return -EINVAL; + } + + return ret ?
: nbytes; +} + static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { unsigned long val; @@ -6129,6 +6385,23 @@ static struct cftype mem_cgroup_legacy_files[] = { .write_u64 = mem_cgroup_hierarchy_write, .read_u64 = mem_cgroup_hierarchy_read, }, + { + .name = "pagecache.reclaim_ratio", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = pagecache_reclaim_ratio_read, + .write = pagecache_reclaim_ratio_write, + }, + { + .name = "pagecache.max_ratio", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = memory_pagecache_max_read, + .write = memory_pagecache_max_write, + }, + { + .name = "pagecache.current", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = pagecache_current_read, + }, { .name = "use_priority_oom", .write_u64 = mem_cgroup_priority_oom_write, @@ -6589,6 +6862,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); + memcg->pagecache_reclaim_ratio = DEFAULT_PAGE_RECLAIM_RATIO; + memcg->pagecache_max_ratio = PAGECACHE_MAX_RATIO_MAX; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) memcg->zswap_max = PAGE_COUNTER_MAX; #endif @@ -6619,12 +6894,14 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->swap, &parent->swap); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); + page_counter_init(&memcg->pagecache, &parent->pagecache); } else { init_memcg_events(); page_counter_init(&memcg->memory, NULL); page_counter_init(&memcg->swap, NULL); page_counter_init(&memcg->kmem, NULL); page_counter_init(&memcg->tcpmem, NULL); + page_counter_init(&memcg->pagecache, NULL); } setup_async_wmark(memcg); @@ -6791,6 +7068,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->pagecache, PAGE_COUNTER_MAX); page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); page_counter_set_async_high(&memcg->memory, PAGE_COUNTER_MAX); @@ -7857,6 +8135,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, setup_async_wmark(memcg); if (need_memcg_async_reclaim(memcg)) queue_work(memcg_async_reclaim_wq, &memcg->async_work); + pagecache_set_limit(memcg); memcg_wb_domain_size_changed(memcg); return nbytes; @@ -7872,6 +8151,10 @@ static void __memory_events_show(struct seq_file *m, atomic_long_t *events) atomic_long_read(&events[MEMCG_OOM_KILL])); seq_printf(m, "oom_group_kill %lu\n", atomic_long_read(&events[MEMCG_OOM_GROUP_KILL])); + seq_printf(m, "pagecache_max %lu\n", + atomic_long_read(&events[MEMCG_PAGECACHE_MAX])); + seq_printf(m, "pagecache_oom %lu\n", + atomic_long_read(&events[MEMCG_PAGECACHE_OOM])); } static int memory_events_show(struct seq_file *m, void *v) @@ -8114,6 +8397,23 @@ static ssize_t memory_async_wmark_delta_write(struct kernfs_open_file *of, } static struct cftype memory_files[] = { + { + .name = "pagecache.reclaim_ratio", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = pagecache_reclaim_ratio_read, + .write = pagecache_reclaim_ratio_write, + }, + { + .name = "pagecache.max_ratio", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = memory_pagecache_max_read, + .write = memory_pagecache_max_write, + }, + { + .name = "pagecache.current", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = pagecache_current_read, + }, { .name = "current", .flags = 
CFTYPE_NOT_ON_ROOT, diff --git a/mm/vmscan.c b/mm/vmscan.c index dd7061c8b495..688bd7474141 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -68,6 +68,7 @@ #include #include #include +#include #include "internal.h" #include "swap.h" @@ -505,6 +506,24 @@ static bool writeback_throttling_sane(struct scan_control *sc) return false; } #else + +#define sysctl_vm_memory_qos 0 + +/* + * Iteration constructs for visiting all cgroups (under a tree). If + * loops are exited prematurely (break), mem_cgroup_iter_break() must + * be used for reference counting. + */ +#define for_each_mem_cgroup_tree(iter, root) \ + for (iter = mem_cgroup_iter(root, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(root, iter, NULL)) + +#define for_each_mem_cgroup(iter) \ + for (iter = mem_cgroup_iter(NULL, NULL, NULL); \ + iter != NULL; \ + iter = mem_cgroup_iter(NULL, iter, NULL)) + static int prealloc_memcg_shrinker(struct shrinker *shrinker) { return -ENOSYS; @@ -7400,69 +7419,97 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, EXPORT_SYMBOL_GPL(try_to_free_mem_cgroup_pages); #endif -#ifdef CONFIG_PAGECACHE_LIMIT int vm_pagecache_limit_ratio __read_mostly; int vm_pagecache_limit_reclaim_ratio __read_mostly; unsigned long vm_pagecache_limit_pages __read_mostly; unsigned long vm_pagecache_limit_reclaim_pages __read_mostly; unsigned int vm_pagecache_ignore_dirty __read_mostly = 1; unsigned int vm_pagecache_limit_async __read_mostly; +unsigned int vm_pagecache_limit_global __read_mostly; unsigned int vm_pagecache_ignore_slab __read_mostly = 1; static struct task_struct *kpclimitd; static bool kpclimitd_context; +extern unsigned long vm_pagecache_system_usage; + +unsigned long __pagecache_over_limit(void) +{ + unsigned long pgcache_lru_pages = 0; + /* + * We only want to limit unmapped and non-shmem page cache pages, + * normally all shmem pages are mapped as well. + */ + unsigned long pgcache_pages = global_node_page_state(NR_FILE_PAGES) + - max_t(unsigned long, + global_node_page_state(NR_FILE_MAPPED), + global_node_page_state(NR_SHMEM)); + + /* + * We certainly can't free more than what's on the LRU lists + * minus the dirty ones. + */ + if (vm_pagecache_ignore_slab) + pgcache_lru_pages = global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE); + else + pgcache_lru_pages = global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE) + + global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) + + global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B); + + if (vm_pagecache_ignore_dirty != 0) + pgcache_lru_pages -= global_node_page_state(NR_FILE_DIRTY) / + vm_pagecache_ignore_dirty; + + /* Paranoia */ + if (unlikely(pgcache_lru_pages > LONG_MAX)) + return 0; + + /* Limit it to 94% of LRU (not all there might be unmapped). */ + pgcache_lru_pages -= pgcache_lru_pages / 16; + if (vm_pagecache_ignore_slab) + pgcache_pages = min_t(unsigned long, pgcache_pages, pgcache_lru_pages); + else + pgcache_pages = pgcache_lru_pages; + + return pgcache_pages; +} + +int proc_pagecache_system_usage(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + vm_pagecache_system_usage = __pagecache_over_limit(); + + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); +} + /* * Returns a number that's positive if the pagecache is above - * the set limit + * the set limit. 
*/ unsigned long pagecache_over_limit(void) { unsigned long should_reclaim_pages = 0; unsigned long overlimit_pages = 0; unsigned long delta_pages = 0; - unsigned long pgcache_lru_pages = 0; - /* We only want to limit unmapped and non-shmem page cache pages; - * normally all shmem pages are mapped as well*/ - unsigned long pgcache_pages = global_node_page_state(NR_FILE_PAGES) - - max_t(unsigned long, - global_node_page_state(NR_FILE_MAPPED), - global_node_page_state(NR_SHMEM)); - /* We certainly can't free more than what's on the LRU lists - * minus the dirty ones*/ - if (vm_pagecache_ignore_slab) - pgcache_lru_pages = global_node_page_state(NR_ACTIVE_FILE) - + global_node_page_state(NR_INACTIVE_FILE); - else - pgcache_lru_pages = global_node_page_state(NR_ACTIVE_FILE) - + global_node_page_state(NR_INACTIVE_FILE) - + global_node_page_state(NR_SLAB_RECLAIMABLE_B) - + global_node_page_state(NR_SLAB_UNRECLAIMABLE_B); + unsigned long pgcache_pages = 0; - if (vm_pagecache_ignore_dirty != 0) - pgcache_lru_pages -= global_node_page_state(NR_FILE_DIRTY) - /vm_pagecache_ignore_dirty; - /* Paranoia */ - if (unlikely(pgcache_lru_pages > LONG_MAX)) - return 0; - - /* Limit it to 94% of LRU (not all there might be unmapped) */ - pgcache_lru_pages -= pgcache_lru_pages/16; - if (vm_pagecache_ignore_slab) - pgcache_pages = min_t(unsigned long, pgcache_pages, pgcache_lru_pages); - else - pgcache_pages = pgcache_lru_pages; + pgcache_pages = __pagecache_over_limit(); /* - *delta_pages: we should reclaim at least 2% more pages than overlimit_page, values get from - * /proc/vm/pagecache_limit_reclaim_pages - *should_reclaim_pages: the real pages we will reclaim, but it should less than pgcache_pages; - */ + * delta_pages: we should reclaim at least 2% more pages than overlimit_pages; + * the value is derived from vm_pagecache_limit_reclaim_pages. + * should_reclaim_pages: the number of pages we will actually reclaim, + * which should be less than pgcache_pages. + */ if (pgcache_pages > vm_pagecache_limit_pages) { overlimit_pages = pgcache_pages - vm_pagecache_limit_pages; delta_pages = vm_pagecache_limit_reclaim_pages - vm_pagecache_limit_pages; - should_reclaim_pages = min_t(unsigned long, delta_pages, vm_pagecache_limit_pages) + overlimit_pages; + should_reclaim_pages = min_t(unsigned long, delta_pages, vm_pagecache_limit_pages) + + overlimit_pages; return should_reclaim_pages; } + return 0; } @@ -7648,7 +7695,8 @@ out: * This function is similar to shrink_all_memory, except that it may never * swap out mapped pages and only does four passes. */ -static void __shrink_page_cache(gfp_t mask) +static unsigned long __shrink_page_cache(gfp_t mask, struct mem_cgroup *memcg, + unsigned long nr_pages) { unsigned long ret = 0; int pass = 0; @@ -7660,11 +7708,10 @@ static void __shrink_page_cache(gfp_t mask) .may_unmap = 0, .may_writepage = 0, .may_deactivate = DEACTIVATE_FILE, - .target_mem_cgroup = NULL, + .target_mem_cgroup = memcg, .reclaim_idx = MAX_NR_ZONES, }; struct reclaim_state *old_rs = current->reclaim_state; - long nr_pages; /* We might sleep during direct reclaim so make atomic context * is certainly a bug. */ BUG_ON(!(mask & __GFP_RECLAIM)); retry: - /* How many pages are we over the limit?*/ - nr_pages = pagecache_over_limit(); - /* * Return early if there's no work to do. * Wake up reclaimers that couldn't scan any zone due to congestion. * This makes sure that no sleeping reclaimer will stay behind.
* Allow breaching the limit if the task is on the way out. */ - if (nr_pages <= 0 || fatal_signal_pending(current)) { + if (nr_pages == 0 || fatal_signal_pending(current)) { wake_up_interruptible(&pagecache_reclaim_wq); goto out; } @@ -7719,9 +7763,10 @@ retry: goto out; for_each_online_node(nid) { - struct mem_cgroup *memcg = NULL; - while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL) - shrink_slab(mask, nid, memcg, sc.priority); + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, memcg) + shrink_slab(mask, nid, iter, sc.priority); } ret += reclaim_state.reclaimed; reclaim_state.reclaimed = 0; @@ -7741,8 +7786,11 @@ retry: out: current->reclaim_state = old_rs; + return sc.nr_reclaimed; } +void batch_shrink_page_cache(gfp_t mask); + static int kpagecache_limitd(void *data) { DEFINE_WAIT(wait); @@ -7755,7 +7803,9 @@ static int kpagecache_limitd(void *data) wake_up_interruptible(&pagecache_reclaim_wq); for (;;) { - __shrink_page_cache(GFP_KERNEL); + if (pagecache_limit_should_shrink()) + batch_shrink_page_cache(GFP_KERNEL); + prepare_to_wait(&kpagecache_limitd_wq, &wait, TASK_INTERRUPTIBLE); if (!kthread_should_stop()) @@ -7777,14 +7827,66 @@ static void wakeup_kpclimitd(gfp_t mask) wake_up_interruptible(&kpagecache_limitd_wq); } +void batch_shrink_page_cache(gfp_t mask) +{ + int reclaim_ratio, goal, retry_limit = 10, retry = 0; + unsigned long goals, currents, batchs, reclaims, reclaimed; + int tmp_reclaim_ratio = vm_pagecache_limit_reclaim_ratio; + int tmp_limit_ratio = vm_pagecache_limit_ratio; + + reclaim_ratio = max_t(int, tmp_reclaim_ratio - tmp_limit_ratio, + ADDITIONAL_RECLAIM_RATIO); + goal = tmp_limit_ratio - reclaim_ratio; + if (goal <= 0) + return; + + reclaims = reclaim_ratio * totalram_pages() / 100; + if (vm_pagecache_limit_async == 0) + batchs = reclaims / num_online_cpus(); + else + batchs = reclaims; + goals = goal * totalram_pages() / 100; + currents = __pagecache_over_limit(); + + while (currents > goals) { + if (fatal_signal_pending(current)) + break; + + reclaimed = __shrink_page_cache(mask, NULL, batchs); + if (reclaimed == 0) { + io_schedule_timeout(HZ/10); + retry++; + } else + retry = 0; + + if (retry > retry_limit) + break; + + currents = __pagecache_over_limit(); + cond_resched(); + } +} + void shrink_page_cache(gfp_t mask, struct page *page) { - if (0 == vm_pagecache_limit_async) - __shrink_page_cache(mask); + if (!sysctl_vm_memory_qos || !vm_pagecache_limit_global) + return; + + if (vm_pagecache_limit_async == 0) + batch_shrink_page_cache(mask); else wakeup_kpclimitd(mask); } +long shrink_page_cache_memcg(gfp_t mask, struct mem_cgroup *memcg, + unsigned long nr_pages) +{ + if (!vm_pagecache_limit_global) + return __shrink_page_cache(mask, memcg, nr_pages); + + return -EINVAL; +} + int kpagecache_limitd_run(void) { int ret = 0; @@ -7809,7 +7911,6 @@ void kpagecache_limitd_stop(void) kpclimitd = NULL; } } -#endif /* CONFIG_PAGECACHE_LIMIT */ static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) { @@ -8053,6 +8154,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) .order = order, .may_unmap = 1, }; + unsigned long nr_pages; set_task_reclaim_state(current, &sc.reclaim_state); psi_memstall_enter(&pflags); @@ -8060,11 +8162,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx) count_vm_event(PAGEOUTRUN); -#ifdef CONFIG_PAGECACHE_LIMIT /* This reclaims from all zones so don't count to sc.nr_reclaimed */ - if (pagecache_limit_should_shrink()) - 
__shrink_page_cache(GFP_KERNEL); -#endif /* CONFIG_PAGECACHE_LIMIT */ + if (pagecache_limit_should_shrink()) { + nr_pages = pagecache_over_limit(); + if (nr_pages) + shrink_page_cache(GFP_KERNEL, NULL); + } /* * Account for the reclaim boost. Note that the zone boost is left in