rue/mm: pagecache limit per cgroup support

Functional test:
http://tapd.oa.com/TencentOS_QoS/prong/stories/view/1020426664867405667?jump_count=1

Signed-off-by: Xiaoguang Chen <xiaoggchen@tencent.com>
Signed-off-by: Jingxiang Zeng <linuszeng@tencent.com>
Signed-off-by: Xuan Liu <benxliu@tencent.com>
Signed-off-by: Honglin Li <honglinli@tencent.com>
Honglin Li 2023-09-01 15:22:09 +08:00 committed by Haisu Wang
parent 56d80c4ea2
commit 75ad2bae3d
10 changed files with 545 additions and 96 deletions
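For orientation: the diff below adds a per-memcg page cache limit. Each memory cgroup gains memory.pagecache.max_ratio (cap as a percentage of memory.max, valid range 5-100, where 100 means unlimited and memory.max must already be set), memory.pagecache.reclaim_ratio (how far below the cap reclaim aims; writing 100 triggers a one-shot reclaim pass) and a read-only memory.pagecache.current reported in bytes, plus the sysctls vm.pagecache_limit_global, vm.pagecache_limit_retry_times and vm.pagecache_system_usage. A minimal user-space sketch of the interface follows; the cgroup v2 mount point, the cgroup name "test" and the 1 GiB memory.max are assumptions for illustration, not part of the patch.

/*
 * Illustrative user-space sketch (not part of the patch).  Assumes a
 * cgroup v2 hierarchy mounted at /sys/fs/cgroup, an existing cgroup
 * named "test", and root privileges.
 */
#include <stdio.h>
#include <stdlib.h>

static void write_str(const char *path, const char *val)
{
    FILE *f = fopen(path, "w");

    if (!f || fputs(val, f) == EOF) {
        perror(path);
        exit(1);
    }
    fclose(f);
}

int main(void)
{
    char buf[64];
    FILE *f;

    /* Per-cgroup limiting requires vm.memory_qos=1 and
     * vm.pagecache_limit_global=0 (checked in memcontrol.c). */
    write_str("/proc/sys/vm/memory_qos", "1");
    write_str("/proc/sys/vm/pagecache_limit_global", "0");

    /* A memory limit must exist before a pagecache ratio is accepted. */
    write_str("/sys/fs/cgroup/test/memory.max", "1073741824");    /* 1 GiB */
    /* Cap page cache at 30% of memory.max; the write reclaims synchronously. */
    write_str("/sys/fs/cgroup/test/memory.pagecache.max_ratio", "30");

    /* Read back current page cache usage (reported in bytes). */
    f = fopen("/sys/fs/cgroup/test/memory.pagecache.current", "r");
    if (f && fgets(buf, sizeof(buf), f))
        printf("pagecache.current: %s", buf);
    if (f)
        fclose(f);
    return 0;
}

Note that writing max_ratio can fail with EINVAL if reclaim cannot bring usage under the new cap within vm.pagecache_limit_retry_times zero-progress rounds; the previous ratio is then restored.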


@ -140,7 +140,6 @@ CONFIG_DAMON_SYSFS=y
CONFIG_DAMON_DBGFS=y
CONFIG_DAMON_RECLAIM=y
CONFIG_DAMON_LRU_SORT=y
CONFIG_PAGECACHE_LIMIT=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m


@ -55,6 +55,8 @@ enum memcg_memory_event {
MEMCG_SWAP_HIGH,
MEMCG_SWAP_MAX,
MEMCG_SWAP_FAIL,
MEMCG_PAGECACHE_MAX,
MEMCG_PAGECACHE_OOM,
MEMCG_NR_MEMORY_EVENTS,
};
@ -237,6 +239,10 @@ struct mem_cgroup {
struct page_counter memsw; /* v1 only */
};
struct page_counter pagecache;
u64 pagecache_reclaim_ratio;
u32 pagecache_max_ratio;
/* Legacy consumer-oriented counters */
struct page_counter kmem; /* v1 only */
struct page_counter tcpmem; /* v1 only */
@ -403,6 +409,21 @@ struct mem_cgroup {
*/
#define MEMCG_CHARGE_BATCH 64U
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
* be used for reference counting.
*/
#define for_each_mem_cgroup_tree(iter, root) \
for (iter = mem_cgroup_iter(root, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(root, iter, NULL))
#define for_each_mem_cgroup(iter) \
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
extern struct mem_cgroup *root_mem_cgroup;
enum page_memcg_data_flags {
@ -1841,6 +1862,12 @@ int alloc_shrinker_info(struct mem_cgroup *memcg);
void free_shrinker_info(struct mem_cgroup *memcg);
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
void reparent_shrinker_deferred(struct mem_cgroup *memcg);
extern int sysctl_vm_memory_qos;
extern unsigned int vm_pagecache_limit_retry_times;
extern void
mem_cgroup_shrink_pagecache(struct mem_cgroup *memcg, gfp_t gfp_mask);
#else
#define mem_cgroup_sockets_enabled 0
static inline void mem_cgroup_sk_alloc(struct sock *sk) { };


@ -847,7 +847,6 @@ struct zone {
*/
long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_PAGECACHE_LIMIT
/*
* This atomic counter is set when there is pagecache limit
* reclaim going on on this particular zone. Other potential
@ -855,7 +854,6 @@ struct zone {
* bouncing.
*/
atomic_t pagecache_reclaim;
#endif
#ifdef CONFIG_NUMA
int node;


@ -407,15 +407,19 @@ extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
#ifdef CONFIG_PAGECACHE_LIMIT
#define ADDITIONAL_RECLAIM_RATIO 2
extern int vm_pagecache_limit_ratio;
extern int vm_pagecache_limit_reclaim_ratio;
extern unsigned long vm_pagecache_limit_pages;
extern unsigned long vm_pagecache_limit_reclaim_pages;
extern unsigned int vm_pagecache_ignore_dirty;
extern unsigned int vm_pagecache_limit_async;
extern unsigned int vm_pagecache_limit_global;
extern unsigned int vm_pagecache_ignore_slab;
extern long shrink_page_cache_memcg(gfp_t mask, struct mem_cgroup *memcg,
unsigned long nr_pages);
extern unsigned long __pagecache_over_limit(void);
extern unsigned long pagecache_over_limit(void);
extern int kpagecache_limitd_run(void);
extern void kpagecache_limitd_stop(void);
@ -424,15 +428,6 @@ static inline bool pagecache_limit_should_shrink(void)
{
return unlikely(vm_pagecache_limit_pages) && pagecache_over_limit();
}
#else
extern inline void shrink_page_cache(gfp_t mask, struct page *page)
{
}
static inline bool pagecache_limit_should_shrink(void)
{
return 0;
}
#endif
#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)


@ -87,6 +87,8 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, void *,
int proc_do_large_bitmap(struct ctl_table *, int, void *, size_t *, loff_t *);
int netcls_do_large_bitmap(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos);
int proc_pagecache_system_usage(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
int proc_do_static_key(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos);


@ -592,8 +592,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
buffer, lenp, ppos, conv, data);
}
#ifdef CONFIG_PAGECACHE_LIMIT
#define ADDITIONAL_RECLAIM_RATIO 2
static int setup_pagecache_limit(void)
{
/* reclaim ADDITIONAL_RECLAIM_PAGES more than limit. */
@ -661,7 +659,6 @@ static int pc_limit_async_handler(struct ctl_table *table, int write,
return ret;
}
#endif /* CONFIG_PAGECACHE_LIMIT */
static int do_proc_douintvec_w(unsigned int *tbl_data,
struct ctl_table *table,
@ -2609,6 +2606,8 @@ static struct ctl_table kern_table[] = {
{ }
};
unsigned long vm_pagecache_system_usage;
static struct ctl_table vm_table[] = {
{
.procname = "overcommit_memory",
@ -2852,7 +2851,6 @@ static struct ctl_table vm_table[] = {
.extra2 = (void *)&mmap_rnd_compat_bits_max,
},
#endif
#ifdef CONFIG_PAGECACHE_LIMIT
{
.procname = "pagecache_limit_ratio",
.data = &vm_pagecache_limit_ratio,
@ -2892,8 +2890,32 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#endif /* CONFIG_PAGECACHE_LIMIT */
#ifdef CONFIG_MEMCG
{
.procname = "pagecache_limit_global",
.data = &vm_pagecache_limit_global,
.maxlen = sizeof(vm_pagecache_limit_global),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{
.procname = "pagecache_limit_retry_times",
.data = &vm_pagecache_limit_retry_times,
.maxlen = sizeof(vm_pagecache_limit_retry_times),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_MAXOLDUID,
},
{
.procname = "pagecache_system_usage",
.data = &vm_pagecache_system_usage,
.maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = proc_pagecache_system_usage,
},
{
.procname = "memory_qos",
.data = &sysctl_vm_memory_qos,

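The vm_table entries above add three knobs under /proc/sys/vm: pagecache_limit_global selects the global limit path (1) or the per-cgroup path (0), pagecache_limit_retry_times bounds how many zero-progress reclaim rounds are tolerated, and pagecache_system_usage is read-only and refreshed from __pagecache_over_limit() by proc_pagecache_system_usage() on every read. A small sketch of reading the latter; the 4 KiB page size used for the MiB conversion is an assumption.

/*
 * Illustrative sketch (not part of the patch): read the read-only
 * vm.pagecache_system_usage sysctl, which reports reclaimable page
 * cache in pages.
 */
#include <stdio.h>

int main(void)
{
    unsigned long pages;
    FILE *f = fopen("/proc/sys/vm/pagecache_system_usage", "r");

    if (!f) {
        perror("pagecache_system_usage");
        return 1;
    }
    if (fscanf(f, "%lu", &pages) != 1) {
        fclose(f);
        return 1;
    }
    fclose(f);
    /* Assumes 4 KiB pages. */
    printf("reclaimable page cache: %lu pages (~%lu MiB)\n",
           pages, pages * 4 / 1024);
    return 0;
}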

@ -1271,14 +1271,6 @@ config LOCK_MM_AND_FIND_VMA
source "mm/damon/Kconfig"
config PAGECACHE_LIMIT
bool "Page cache limit"
help
This option allows the user to set a limit for the page cache.
For details, see Documentation/mm/pagecache-limit.
If unsure, say N.
config ENHANCED_MM
bool "Enable enhanced mm support (EMM)"
depends on MEMCG


@ -848,6 +848,9 @@ noinline int __filemap_add_folio(struct address_space *mapping,
int huge = folio_test_hugetlb(folio);
bool charged = false;
long nr = 1;
#ifdef CONFIG_MEMCG
struct mem_cgroup *memcg;
#endif
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
@ -861,6 +864,14 @@ noinline int __filemap_add_folio(struct address_space *mapping,
charged = true;
xas_set_order(&xas, index, folio_order(folio));
nr = folio_nr_pages(folio);
#ifdef CONFIG_MEMCG
/* For a successful charge, folio->memcg_data must be set. */
memcg = folio_memcg(folio);
for (; memcg; memcg = parent_mem_cgroup(memcg))
mem_cgroup_shrink_pagecache(memcg, gfp);
#endif
}
gfp &= GFP_RECLAIM_MASK;


@ -112,6 +112,11 @@ static bool cgroup_memory_nobpf __ro_after_init;
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
#define MEMCG_PAGECACHE_RETRIES 20
#define DEFAULT_PAGE_RECLAIM_RATIO 5
#define PAGECACHE_MAX_RATIO_MIN 5
#define PAGECACHE_MAX_RATIO_MAX 100
int sysctl_vm_memory_qos;
/* default has none reclaim priority */
int sysctl_vm_qos_highest_reclaim_prio = CGROUP_PRIORITY_MAX;
@ -254,21 +259,6 @@ enum res_type {
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
* be used for reference counting.
*/
#define for_each_mem_cgroup_tree(iter, root) \
for (iter = mem_cgroup_iter(root, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(root, iter, NULL))
#define for_each_mem_cgroup(iter) \
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
static inline bool task_is_dying(void)
{
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
@ -890,6 +880,13 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
memcg_rstat_updated(memcg, val);
if (idx == NR_FILE_PAGES) {
if (val > 0)
page_counter_charge(&memcg->pagecache, val);
else
page_counter_uncharge(&memcg->pagecache, -val);
}
memcg_stats_unlock();
}
@ -4005,6 +4002,8 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
}
#endif
static void pagecache_set_limit(struct mem_cgroup *memcg);
static DEFINE_MUTEX(memcg_max_mutex);
static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
@ -4062,6 +4061,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
if (enlarge)
memcg_oom_recover(memcg);
pagecache_set_limit(memcg);
}
return ret;
@ -4206,6 +4206,134 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
return -EINVAL;
}
#define MIN_PAGECACHE_PAGES 16
unsigned int
vm_pagecache_limit_retry_times __read_mostly = MEMCG_PAGECACHE_RETRIES;
void mem_cgroup_shrink_pagecache(struct mem_cgroup *memcg, gfp_t gfp_mask)
{
long pages_reclaimed;
unsigned long pages_used, pages_max, goal_pages_used, pre_used;
unsigned int retry_times = 0;
unsigned int limit_retry_times;
u32 max_ratio;
if (!sysctl_vm_memory_qos || vm_pagecache_limit_global)
return;
if (!memcg || mem_cgroup_is_root(memcg))
return;
max_ratio = READ_ONCE(memcg->pagecache_max_ratio);
if (max_ratio == PAGECACHE_MAX_RATIO_MAX)
return;
pages_max = READ_ONCE(memcg->pagecache.max);
if (pages_max == PAGE_COUNTER_MAX)
return;
if (unlikely(task_is_dying()))
return;
if (unlikely(current->flags & PF_MEMALLOC))
return;
if (unlikely(task_in_memcg_oom(current)))
return;
if (!gfpflags_allow_blocking(gfp_mask))
return;
pages_used = page_counter_read(&memcg->pagecache);
limit_retry_times = READ_ONCE(vm_pagecache_limit_retry_times);
goal_pages_used = (100 - READ_ONCE(memcg->pagecache_reclaim_ratio))
* pages_max / 100;
goal_pages_used = max_t(unsigned long, MIN_PAGECACHE_PAGES,
goal_pages_used);
if (pages_used > pages_max)
memcg_memory_event(memcg, MEMCG_PAGECACHE_MAX);
while (pages_used > goal_pages_used) {
if (fatal_signal_pending(current))
break;
pre_used = pages_used;
pages_reclaimed = shrink_page_cache_memcg(gfp_mask, memcg,
pages_used - goal_pages_used);
if (pages_reclaimed == -EINVAL)
return;
if (limit_retry_times == 0)
goto next_shrink;
if (pages_reclaimed == 0) {
io_schedule_timeout(HZ/10);
retry_times++;
} else
retry_times = 0;
if (retry_times > limit_retry_times) {
pr_warn("Attempts to recycle many times have not recovered enough pages.\n");
break;
}
next_shrink:
pages_used = page_counter_read(&memcg->pagecache);
cond_resched();
}
}
static u64 pagecache_reclaim_ratio_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return memcg->pagecache_reclaim_ratio;
}
static ssize_t pagecache_reclaim_ratio_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
u64 reclaim_ratio;
int ret;
unsigned long nr_pages;
if (!sysctl_vm_memory_qos) {
pr_warn("you should open vm.memory_qos.\n");
return -EINVAL;
}
if (vm_pagecache_limit_global) {
pr_warn("you should clear vm_pagecache_limit_global.\n");
return -EINVAL;
}
buf = strstrip(buf);
if (!buf)
return -EINVAL;
ret = kstrtou64(buf, 0, &reclaim_ratio);
if (ret)
return ret;
if ((reclaim_ratio > 0) && (reclaim_ratio < 100)) {
memcg->pagecache_reclaim_ratio = reclaim_ratio;
mem_cgroup_shrink_pagecache(memcg, GFP_KERNEL);
return nbytes;
} else if (reclaim_ratio == 100) {
nr_pages = page_counter_read(&memcg->pagecache);
//try reclaim once
shrink_page_cache_memcg(GFP_KERNEL, memcg, nr_pages);
return nbytes;
}
return -EINVAL;
}
static u64 mem_cgroup_priority_oom_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@ -4226,6 +4354,134 @@ static int mem_cgroup_priority_oom_write(struct cgroup_subsys_state *css,
return 0;
}
static u64 pagecache_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return (u64)page_counter_read(&memcg->pagecache) * PAGE_SIZE;
}
static u64 memory_pagecache_max_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return memcg->pagecache_max_ratio;
}
unsigned long mem_cgroup_pagecache_get_reclaim_pages(struct mem_cgroup *memcg)
{
unsigned long goal_pages_used, pages_used, pages_max;
if ((!memcg) || (mem_cgroup_is_root(memcg)))
return 0;
pages_max = READ_ONCE(memcg->pagecache.max);
if (pages_max == PAGE_COUNTER_MAX)
return 0;
goal_pages_used = (100 - READ_ONCE(memcg->pagecache_reclaim_ratio))
* pages_max / 100;
goal_pages_used = max_t(unsigned long, MIN_PAGECACHE_PAGES,
goal_pages_used);
pages_used = page_counter_read(&memcg->pagecache);
return pages_used > pages_max ? pages_used - goal_pages_used : 0;
}
static void pagecache_set_limit(struct mem_cgroup *memcg)
{
unsigned long max, pages_max;
u32 max_ratio;
pages_max = READ_ONCE(memcg->memory.max);
max_ratio = READ_ONCE(memcg->pagecache_max_ratio);
max = ((pages_max * max_ratio) / 100);
xchg(&memcg->pagecache.max, max);
}
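pagecache_set_limit() above derives the per-cgroup cap from memory.max and pagecache_max_ratio, and mem_cgroup_shrink_pagecache() / mem_cgroup_pagecache_get_reclaim_pages() reclaim down to (100 - pagecache_reclaim_ratio)% of that cap, never below MIN_PAGECACHE_PAGES. A worked example with assumed numbers (1 GiB memory.max, a 30% cap, the default 5% reclaim ratio, 4 KiB pages):

/* Worked example with assumed numbers; mirrors the kernel formulas above. */
#include <stdio.h>

int main(void)
{
    unsigned long memory_max = 262144;  /* memory.max: 1 GiB in 4 KiB pages */
    unsigned int max_ratio = 30;        /* memory.pagecache.max_ratio */
    unsigned int reclaim_ratio = 5;     /* memory.pagecache.reclaim_ratio (default) */

    /* pagecache_set_limit(): pagecache.max = memory.max * max_ratio / 100 */
    unsigned long pagecache_max = memory_max * max_ratio / 100;

    /* Reclaim goal: (100 - reclaim_ratio)% of pagecache.max. */
    unsigned long goal = (100 - reclaim_ratio) * pagecache_max / 100;

    printf("pagecache.max  = %lu pages (~%lu MiB)\n",
           pagecache_max, pagecache_max * 4 / 1024);
    printf("reclaim target = %lu pages (~%lu MiB)\n",
           goal, goal * 4 / 1024);
    return 0;
}

With these inputs the cap is 78643 pages (~307 MiB) and reclaim aims for 74710 pages (~291 MiB).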
static ssize_t memory_pagecache_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_reclaims = vm_pagecache_limit_retry_times;
unsigned long max;
long pages_reclaimed;
int ret = 0;
u64 max_ratio, old;
if (!sysctl_vm_memory_qos) {
pr_warn("you should open vm.memory_qos.\n");
return -EINVAL;
}
if (vm_pagecache_limit_global) {
pr_warn("you should clear vm_pagecache_limit_global.\n");
return -EINVAL;
}
if (!buf)
return -EINVAL;
ret = kstrtou64(buf, 0, &max_ratio);
if (ret)
return ret;
if (max_ratio > PAGECACHE_MAX_RATIO_MAX ||
max_ratio < PAGECACHE_MAX_RATIO_MIN)
return -EINVAL;
if (READ_ONCE(memcg->memory.max) == PAGE_COUNTER_MAX) {
pr_warn("pagecache limit not allowed for cgroup without memory limit set\n");
return -EPERM;
}
old = READ_ONCE(memcg->pagecache_max_ratio);
memcg->pagecache_max_ratio = max_ratio;
pagecache_set_limit(memcg);
max = READ_ONCE(memcg->pagecache.max);
for (;;) {
unsigned long pages_used = page_counter_read(&memcg->pagecache);
if (pages_used <= max)
break;
if (fatal_signal_pending(current)) {
ret = -EINTR;
break;
}
if (nr_reclaims) {
pages_reclaimed =
shrink_page_cache_memcg(GFP_KERNEL, memcg,
mem_cgroup_pagecache_get_reclaim_pages(memcg));
if (pages_reclaimed == -EINVAL) {
pr_warn("you should clear vm_pagecache_limit_global.\n");
return -EINVAL;
}
if (pages_reclaimed == 0) {
io_schedule_timeout(HZ/10);
nr_reclaims--;
cond_resched();
} else
nr_reclaims = vm_pagecache_limit_retry_times;
continue;
}
memcg->pagecache_max_ratio = old;
pagecache_set_limit(memcg);
pr_warn("Attempts to recycle many times have not recovered enough pages.\n");
return -EINVAL;
}
return ret ? : nbytes;
}
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
unsigned long val;
@ -6129,6 +6385,23 @@ static struct cftype mem_cgroup_legacy_files[] = {
.write_u64 = mem_cgroup_hierarchy_write,
.read_u64 = mem_cgroup_hierarchy_read,
},
{
.name = "pagecache.reclaim_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = pagecache_reclaim_ratio_read,
.write = pagecache_reclaim_ratio_write,
},
{
.name = "pagecache.max_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = memory_pagecache_max_read,
.write = memory_pagecache_max_write,
},
{
.name = "pagecache.current",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = pagecache_current_read,
},
{
.name = "use_priority_oom",
.write_u64 = mem_cgroup_priority_oom_write,
@ -6589,6 +6862,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
memcg->pagecache_reclaim_ratio = DEFAULT_PAGE_RECLAIM_RATIO;
memcg->pagecache_max_ratio = PAGECACHE_MAX_RATIO_MAX;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
memcg->zswap_max = PAGE_COUNTER_MAX;
#endif
@ -6619,12 +6894,14 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_init(&memcg->swap, &parent->swap);
page_counter_init(&memcg->kmem, &parent->kmem);
page_counter_init(&memcg->tcpmem, &parent->tcpmem);
page_counter_init(&memcg->pagecache, &parent->pagecache);
} else {
init_memcg_events();
page_counter_init(&memcg->memory, NULL);
page_counter_init(&memcg->swap, NULL);
page_counter_init(&memcg->kmem, NULL);
page_counter_init(&memcg->tcpmem, NULL);
page_counter_init(&memcg->pagecache, NULL);
}
setup_async_wmark(memcg);
@ -6791,6 +7068,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->pagecache, PAGE_COUNTER_MAX);
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
page_counter_set_async_high(&memcg->memory, PAGE_COUNTER_MAX);
@ -7857,6 +8135,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
setup_async_wmark(memcg);
if (need_memcg_async_reclaim(memcg))
queue_work(memcg_async_reclaim_wq, &memcg->async_work);
pagecache_set_limit(memcg);
memcg_wb_domain_size_changed(memcg);
return nbytes;
@ -7872,6 +8151,10 @@ static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
atomic_long_read(&events[MEMCG_OOM_KILL]));
seq_printf(m, "oom_group_kill %lu\n",
atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
seq_printf(m, "pagecache_max %lu\n",
atomic_long_read(&events[MEMCG_PAGECACHE_MAX]));
seq_printf(m, "pagecache_oom %lu\n",
atomic_long_read(&events[MEMCG_PAGECACHE_OOM]));
}
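The two new counters surface in the cgroup's memory.events file as pagecache_max (bumped in mem_cgroup_shrink_pagecache() when usage is found above the cap) and pagecache_oom. A small sketch that filters them out of memory.events; the cgroup path is an assumption:

/* Illustrative sketch (not part of the patch); the cgroup path is assumed. */
#include <stdio.h>
#include <string.h>

int main(void)
{
    char key[64];
    unsigned long val;
    FILE *f = fopen("/sys/fs/cgroup/test/memory.events", "r");

    if (!f) {
        perror("memory.events");
        return 1;
    }
    while (fscanf(f, "%63s %lu", key, &val) == 2) {
        if (!strcmp(key, "pagecache_max") || !strcmp(key, "pagecache_oom"))
            printf("%s %lu\n", key, val);
    }
    fclose(f);
    return 0;
}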
static int memory_events_show(struct seq_file *m, void *v)
@ -8114,6 +8397,23 @@ static ssize_t memory_async_wmark_delta_write(struct kernfs_open_file *of,
}
static struct cftype memory_files[] = {
{
.name = "pagecache.reclaim_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = pagecache_reclaim_ratio_read,
.write = pagecache_reclaim_ratio_write,
},
{
.name = "pagecache.max_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = memory_pagecache_max_read,
.write = memory_pagecache_max_write,
},
{
.name = "pagecache.current",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = pagecache_current_read,
},
{
.name = "current",
.flags = CFTYPE_NOT_ON_ROOT,


@ -68,6 +68,7 @@
#include <linux/swapops.h>
#include <linux/balloon_compaction.h>
#include <linux/sched/sysctl.h>
#include <linux/cpumask.h>
#include "internal.h"
#include "swap.h"
@ -505,6 +506,24 @@ static bool writeback_throttling_sane(struct scan_control *sc)
return false;
}
#else
#define sysctl_vm_memory_qos 0
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
* be used for reference counting.
*/
#define for_each_mem_cgroup_tree(iter, root) \
for (iter = mem_cgroup_iter(root, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(root, iter, NULL))
#define for_each_mem_cgroup(iter) \
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
return -ENOSYS;
@ -7400,69 +7419,97 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
EXPORT_SYMBOL_GPL(try_to_free_mem_cgroup_pages);
#endif
#ifdef CONFIG_PAGECACHE_LIMIT
int vm_pagecache_limit_ratio __read_mostly;
int vm_pagecache_limit_reclaim_ratio __read_mostly;
unsigned long vm_pagecache_limit_pages __read_mostly;
unsigned long vm_pagecache_limit_reclaim_pages __read_mostly;
unsigned int vm_pagecache_ignore_dirty __read_mostly = 1;
unsigned int vm_pagecache_limit_async __read_mostly;
unsigned int vm_pagecache_limit_global __read_mostly;
unsigned int vm_pagecache_ignore_slab __read_mostly = 1;
static struct task_struct *kpclimitd;
static bool kpclimitd_context;
extern unsigned long vm_pagecache_system_usage;
unsigned long __pagecache_over_limit(void)
{
unsigned long pgcache_lru_pages = 0;
/*
* We only want to limit unmapped and non-shmem page cache pages,
* normally all shmem pages are mapped as well.
*/
unsigned long pgcache_pages = global_node_page_state(NR_FILE_PAGES)
- max_t(unsigned long,
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM));
/*
* We certainly can't free more than what's on the LRU lists
* minus the dirty ones.
*/
if (vm_pagecache_ignore_slab)
pgcache_lru_pages = global_node_page_state(NR_ACTIVE_FILE)
+ global_node_page_state(NR_INACTIVE_FILE);
else
pgcache_lru_pages = global_node_page_state(NR_ACTIVE_FILE)
+ global_node_page_state(NR_INACTIVE_FILE)
+ global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B)
+ global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
if (vm_pagecache_ignore_dirty != 0)
pgcache_lru_pages -= global_node_page_state(NR_FILE_DIRTY) /
vm_pagecache_ignore_dirty;
/* Paranoia */
if (unlikely(pgcache_lru_pages > LONG_MAX))
return 0;
/* Limit it to 94% of LRU (not all there might be unmapped). */
pgcache_lru_pages -= pgcache_lru_pages / 16;
if (vm_pagecache_ignore_slab)
pgcache_pages = min_t(unsigned long, pgcache_pages, pgcache_lru_pages);
else
pgcache_pages = pgcache_lru_pages;
return pgcache_pages;
}
int proc_pagecache_system_usage(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
vm_pagecache_system_usage = __pagecache_over_limit();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
/*
* Returns a number that's positive if the pagecache is above
* the set limit
* the set limit.
*/
unsigned long pagecache_over_limit(void)
{
unsigned long should_reclaim_pages = 0;
unsigned long overlimit_pages = 0;
unsigned long delta_pages = 0;
unsigned long pgcache_lru_pages = 0;
/* We only want to limit unmapped and non-shmem page cache pages;
* normally all shmem pages are mapped as well*/
unsigned long pgcache_pages = global_node_page_state(NR_FILE_PAGES)
- max_t(unsigned long,
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM));
/* We certainly can't free more than what's on the LRU lists
* minus the dirty ones*/
if (vm_pagecache_ignore_slab)
pgcache_lru_pages = global_node_page_state(NR_ACTIVE_FILE)
+ global_node_page_state(NR_INACTIVE_FILE);
else
pgcache_lru_pages = global_node_page_state(NR_ACTIVE_FILE)
+ global_node_page_state(NR_INACTIVE_FILE)
+ global_node_page_state(NR_SLAB_RECLAIMABLE_B)
+ global_node_page_state(NR_SLAB_UNRECLAIMABLE_B);
unsigned long pgcache_pages = 0;
if (vm_pagecache_ignore_dirty != 0)
pgcache_lru_pages -= global_node_page_state(NR_FILE_DIRTY)
/vm_pagecache_ignore_dirty;
/* Paranoia */
if (unlikely(pgcache_lru_pages > LONG_MAX))
return 0;
/* Limit it to 94% of LRU (not all there might be unmapped) */
pgcache_lru_pages -= pgcache_lru_pages/16;
if (vm_pagecache_ignore_slab)
pgcache_pages = min_t(unsigned long, pgcache_pages, pgcache_lru_pages);
else
pgcache_pages = pgcache_lru_pages;
pgcache_pages = __pagecache_over_limit();
/*
*delta_pages: we should reclaim at least 2% more pages than overlimit_page, values get from
* /proc/vm/pagecache_limit_reclaim_pages
*should_reclaim_pages: the real pages we will reclaim, but it should less than pgcache_pages;
*/
* delta_pages: we should reclaim at least 2% more pages than overlimit_page,
* values get from /proc/vm/pagecache_limit_reclaim_pages.
* should_reclaim_pages: the real pages we will reclaim,
* but it should less than pgcache_pages.
*/
if (pgcache_pages > vm_pagecache_limit_pages) {
overlimit_pages = pgcache_pages - vm_pagecache_limit_pages;
delta_pages = vm_pagecache_limit_reclaim_pages - vm_pagecache_limit_pages;
should_reclaim_pages = min_t(unsigned long, delta_pages, vm_pagecache_limit_pages) + overlimit_pages;
should_reclaim_pages = min_t(unsigned long, delta_pages, vm_pagecache_limit_pages)
+ overlimit_pages;
return should_reclaim_pages;
}
return 0;
}
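A worked example of the two helpers above, assuming vm.pagecache_ignore_slab=1 and vm.pagecache_ignore_dirty=1 (the defaults); all counter and limit values are made up for illustration:

/* Worked example; every value below is an assumption. */
#include <stdio.h>

int main(void)
{
    /* Assumed vmstat-style counters, in pages. */
    unsigned long file_pages = 2000000, file_mapped = 300000, shmem = 200000;
    unsigned long lru_file = 1800000, dirty = 100000;
    unsigned long ignore_dirty = 1;          /* vm.pagecache_ignore_dirty */
    unsigned long limit = 1200000;           /* vm_pagecache_limit_pages */
    unsigned long reclaim_limit = 1250000;   /* vm_pagecache_limit_reclaim_pages */
    unsigned long cache, lru, should_reclaim = 0;

    /* __pagecache_over_limit(): unmapped, non-shmem page cache, capped by
     * what is actually on the file LRUs minus dirty pages and ~6% slack. */
    cache = file_pages - (file_mapped > shmem ? file_mapped : shmem);
    lru = lru_file - dirty / ignore_dirty;
    lru -= lru / 16;
    if (cache > lru)
        cache = lru;

    /* pagecache_over_limit(): the overshoot plus some extra headroom. */
    if (cache > limit) {
        unsigned long over = cache - limit;
        unsigned long delta = reclaim_limit - limit;

        should_reclaim = (delta < limit ? delta : limit) + over;
    }
    printf("reclaimable cache %lu pages, reclaim request %lu pages\n",
           cache, should_reclaim);
    return 0;
}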
@ -7648,7 +7695,8 @@ out:
* This function is similar to shrink_all_memory, except that it may never
* swap out mapped pages and only does four passes.
*/
static void __shrink_page_cache(gfp_t mask)
static unsigned long __shrink_page_cache(gfp_t mask, struct mem_cgroup *memcg,
unsigned long nr_pages)
{
unsigned long ret = 0;
int pass = 0;
@ -7660,11 +7708,10 @@ static void __shrink_page_cache(gfp_t mask)
.may_unmap = 0,
.may_writepage = 0,
.may_deactivate = DEACTIVATE_FILE,
.target_mem_cgroup = NULL,
.target_mem_cgroup = memcg,
.reclaim_idx = MAX_NR_ZONES,
};
struct reclaim_state *old_rs = current->reclaim_state;
long nr_pages;
/* We might sleep during direct reclaim so make atomic context
* is certainly a bug.
@ -7672,9 +7719,6 @@ static void __shrink_page_cache(gfp_t mask)
BUG_ON(!(mask & __GFP_RECLAIM));
retry:
/* How many pages are we over the limit?*/
nr_pages = pagecache_over_limit();
/*
* Return early if there's no work to do.
* Wake up reclaimers that couldn't scan any zone due to congestion.
@ -7682,7 +7726,7 @@ retry:
* This makes sure that no sleeping reclaimer will stay behind.
* Allow breaching the limit if the task is on the way out.
*/
if (nr_pages <= 0 || fatal_signal_pending(current)) {
if (nr_pages == 0 || fatal_signal_pending(current)) {
wake_up_interruptible(&pagecache_reclaim_wq);
goto out;
}
@ -7719,9 +7763,10 @@ retry:
goto out;
for_each_online_node(nid) {
struct mem_cgroup *memcg = NULL;
while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL)
shrink_slab(mask, nid, memcg, sc.priority);
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, memcg)
shrink_slab(mask, nid, iter, sc.priority);
}
ret += reclaim_state.reclaimed;
reclaim_state.reclaimed = 0;
@ -7741,8 +7786,11 @@ retry:
out:
current->reclaim_state = old_rs;
return sc.nr_reclaimed;
}
void batch_shrink_page_cache(gfp_t mask);
static int kpagecache_limitd(void *data)
{
DEFINE_WAIT(wait);
@ -7755,7 +7803,9 @@ static int kpagecache_limitd(void *data)
wake_up_interruptible(&pagecache_reclaim_wq);
for (;;) {
__shrink_page_cache(GFP_KERNEL);
if (pagecache_limit_should_shrink())
batch_shrink_page_cache(GFP_KERNEL);
prepare_to_wait(&kpagecache_limitd_wq, &wait, TASK_INTERRUPTIBLE);
if (!kthread_should_stop())
@ -7777,14 +7827,66 @@ static void wakeup_kpclimitd(gfp_t mask)
wake_up_interruptible(&kpagecache_limitd_wq);
}
void batch_shrink_page_cache(gfp_t mask)
{
int reclaim_ratio, goal, retry_limit = 10, retry = 0;
unsigned long goals, currents, batchs, reclaims, reclaimed;
int tmp_reclaim_ratio = vm_pagecache_limit_reclaim_ratio;
int tmp_limit_ratio = vm_pagecache_limit_ratio;
reclaim_ratio = max_t(int, tmp_reclaim_ratio - tmp_limit_ratio,
ADDITIONAL_RECLAIM_RATIO);
goal = tmp_limit_ratio - reclaim_ratio;
if (goal <= 0)
return;
reclaims = reclaim_ratio * totalram_pages() / 100;
if (vm_pagecache_limit_async == 0)
batchs = reclaims / num_online_cpus();
else
batchs = reclaims;
goals = goal * totalram_pages() / 100;
currents = __pagecache_over_limit();
while (currents > goals) {
if (fatal_signal_pending(current))
break;
reclaimed = __shrink_page_cache(mask, NULL, batchs);
if (reclaimed == 0) {
io_schedule_timeout(HZ/10);
retry++;
} else
retry = 0;
if (retry > retry_limit)
break;
currents = __pagecache_over_limit();
cond_resched();
}
}
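For the global path, a worked example of how batch_shrink_page_cache() turns the two ratios into a stop goal and a per-call batch; the RAM size, CPU count and ratio values are assumptions:

/* Worked example; RAM size, CPU count and ratios are assumed values. */
#include <stdio.h>

int main(void)
{
    unsigned long totalram = 4194304;   /* 16 GiB in 4 KiB pages */
    int limit_ratio = 10;               /* vm.pagecache_limit_ratio */
    int reclaim_ratio_sysctl = 11;      /* vm.pagecache_limit_reclaim_ratio */
    int ncpus = 8;                      /* num_online_cpus() */
    int reclaim_ratio, goal_ratio;
    unsigned long goal, reclaims, batch;

    /* Reclaim at least ADDITIONAL_RECLAIM_RATIO (2) percent below the limit. */
    reclaim_ratio = reclaim_ratio_sysctl - limit_ratio;
    if (reclaim_ratio < 2)
        reclaim_ratio = 2;
    goal_ratio = limit_ratio - reclaim_ratio;
    if (goal_ratio <= 0)
        return 0;                       /* nothing to do, as in the kernel */

    goal = (unsigned long)goal_ratio * totalram / 100;      /* stop threshold */
    reclaims = (unsigned long)reclaim_ratio * totalram / 100;
    /* Synchronous callers split the batch across CPUs; the kpclimitd
     * thread (vm.pagecache_limit_async=1) takes it whole. */
    batch = reclaims / ncpus;

    printf("goal %lu pages, per-call batch %lu pages\n", goal, batch);
    return 0;
}

With these inputs the loop keeps calling __shrink_page_cache() in batches of 10485 pages until the reclaimable page cache drops below 335544 pages (8% of RAM).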
void shrink_page_cache(gfp_t mask, struct page *page)
{
if (0 == vm_pagecache_limit_async)
__shrink_page_cache(mask);
if (!sysctl_vm_memory_qos || !vm_pagecache_limit_global)
return;
if (vm_pagecache_limit_async == 0)
batch_shrink_page_cache(mask);
else
wakeup_kpclimitd(mask);
}
long shrink_page_cache_memcg(gfp_t mask, struct mem_cgroup *memcg,
unsigned long nr_pages)
{
if (!vm_pagecache_limit_global)
return __shrink_page_cache(mask, memcg, nr_pages);
return -EINVAL;
}
int kpagecache_limitd_run(void)
{
int ret = 0;
@ -7809,7 +7911,6 @@ void kpagecache_limitd_stop(void)
kpclimitd = NULL;
}
}
#endif /* CONFIG_PAGECACHE_LIMIT */
static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
@ -8053,6 +8154,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
.order = order,
.may_unmap = 1,
};
unsigned long nr_pages;
set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
@ -8060,11 +8162,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
count_vm_event(PAGEOUTRUN);
#ifdef CONFIG_PAGECACHE_LIMIT
/* This reclaims from all zones so don't count to sc.nr_reclaimed */
if (pagecache_limit_should_shrink())
__shrink_page_cache(GFP_KERNEL);
#endif /* CONFIG_PAGECACHE_LIMIT */
if (pagecache_limit_should_shrink()) {
nr_pages = pagecache_over_limit();
if (nr_pages)
shrink_page_cache(GFP_KERNEL, NULL);
}
/*
* Account for the reclaim boost. Note that the zone boost is left in