rue/mm: pagecache limit per cgroup support

Functional test:
http://tapd.oa.com/TencentOS_QoS/prong/stories/view/1020426664867405667?jump_count=1

Signed-off-by: Xiaoguang Chen <xiaoggchen@tencent.com>
Signed-off-by: Jingxiang Zeng <linuszeng@tencent.com>
Signed-off-by: Xuan Liu <benxliu@tencent.com>
Signed-off-by: Honglin Li <honglinli@tencent.com>
Honglin Li 2023-09-01 15:22:09 +08:00 committed by Haisu Wang
parent 56d80c4ea2
commit 75ad2bae3d
10 changed files with 545 additions and 96 deletions
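For orientation: the diff below adds a per-memcg page cache limit. Each memory cgroup gains memory.pagecache.max_ratio (cap as a percentage of memory.max, valid range 5-100, where 100 means unlimited and memory.max must already be set), memory.pagecache.reclaim_ratio (how far below the cap reclaim aims; writing 100 triggers a one-shot reclaim pass) and a read-only memory.pagecache.current reported in bytes, plus the sysctls vm.pagecache_limit_global, vm.pagecache_limit_retry_times and vm.pagecache_system_usage. A minimal user-space sketch of the interface follows; the cgroup v2 mount point, the cgroup name "test" and the 1 GiB memory.max are assumptions for illustration, not part of the patch.

/*
 * Illustrative user-space sketch (not part of the patch).  Assumes a
 * cgroup v2 hierarchy mounted at /sys/fs/cgroup, an existing cgroup
 * named "test", and root privileges.
 */
#include <stdio.h>
#include <stdlib.h>

static void write_str(const char *path, const char *val)
{
    FILE *f = fopen(path, "w");

    if (!f || fputs(val, f) == EOF) {
        perror(path);
        exit(1);
    }
    fclose(f);
}

int main(void)
{
    char buf[64];
    FILE *f;

    /* Per-cgroup limiting requires vm.memory_qos=1 and
     * vm.pagecache_limit_global=0 (checked in memcontrol.c). */
    write_str("/proc/sys/vm/memory_qos", "1");
    write_str("/proc/sys/vm/pagecache_limit_global", "0");

    /* A memory limit must exist before a pagecache ratio is accepted. */
    write_str("/sys/fs/cgroup/test/memory.max", "1073741824");    /* 1 GiB */
    /* Cap page cache at 30% of memory.max; the write reclaims synchronously. */
    write_str("/sys/fs/cgroup/test/memory.pagecache.max_ratio", "30");

    /* Read back current page cache usage (reported in bytes). */
    f = fopen("/sys/fs/cgroup/test/memory.pagecache.current", "r");
    if (f && fgets(buf, sizeof(buf), f))
        printf("pagecache.current: %s", buf);
    if (f)
        fclose(f);
    return 0;
}

Note that writing max_ratio can fail with EINVAL if reclaim cannot bring usage under the new cap within vm.pagecache_limit_retry_times zero-progress rounds; the previous ratio is then restored.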


@ -140,7 +140,6 @@ CONFIG_DAMON_SYSFS=y
CONFIG_DAMON_DBGFS=y
CONFIG_DAMON_RECLAIM=y
CONFIG_DAMON_LRU_SORT=y
CONFIG_PAGECACHE_LIMIT=y
CONFIG_NET=y
CONFIG_PACKET=y
CONFIG_PACKET_DIAG=m


@ -55,6 +55,8 @@ enum memcg_memory_event {
MEMCG_SWAP_HIGH,
MEMCG_SWAP_MAX,
MEMCG_SWAP_FAIL,
MEMCG_PAGECACHE_MAX,
MEMCG_PAGECACHE_OOM,
MEMCG_NR_MEMORY_EVENTS,
};
@ -237,6 +239,10 @@ struct mem_cgroup {
struct page_counter memsw; /* v1 only */
};
struct page_counter pagecache;
u64 pagecache_reclaim_ratio;
u32 pagecache_max_ratio;
/* Legacy consumer-oriented counters */
struct page_counter kmem; /* v1 only */
struct page_counter tcpmem; /* v1 only */
@ -403,6 +409,21 @@ struct mem_cgroup {
*/
#define MEMCG_CHARGE_BATCH 64U
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
* be used for reference counting.
*/
#define for_each_mem_cgroup_tree(iter, root) \
for (iter = mem_cgroup_iter(root, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(root, iter, NULL))
#define for_each_mem_cgroup(iter) \
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
extern struct mem_cgroup *root_mem_cgroup;
enum page_memcg_data_flags {
@ -1841,6 +1862,12 @@ int alloc_shrinker_info(struct mem_cgroup *memcg);
void free_shrinker_info(struct mem_cgroup *memcg);
void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id);
void reparent_shrinker_deferred(struct mem_cgroup *memcg);
extern int sysctl_vm_memory_qos;
extern unsigned int vm_pagecache_limit_retry_times;
extern void
mem_cgroup_shrink_pagecache(struct mem_cgroup *memcg, gfp_t gfp_mask);
#else
#define mem_cgroup_sockets_enabled 0
static inline void mem_cgroup_sk_alloc(struct sock *sk) { };


@ -847,7 +847,6 @@ struct zone {
*/
long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_PAGECACHE_LIMIT
/*
* This atomic counter is set when there is pagecache limit
* reclaim going on on this particular zone. Other potential
@ -855,7 +854,6 @@ struct zone {
* bouncing.
*/
atomic_t pagecache_reclaim;
#endif
#ifdef CONFIG_NUMA
int node;


@ -407,15 +407,19 @@ extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask, nodemask_t *mask);
#ifdef CONFIG_PAGECACHE_LIMIT
#define ADDITIONAL_RECLAIM_RATIO 2
extern int vm_pagecache_limit_ratio;
extern int vm_pagecache_limit_reclaim_ratio;
extern unsigned long vm_pagecache_limit_pages;
extern unsigned long vm_pagecache_limit_reclaim_pages;
extern unsigned int vm_pagecache_ignore_dirty;
extern unsigned int vm_pagecache_limit_async;
extern unsigned int vm_pagecache_limit_global;
extern unsigned int vm_pagecache_ignore_slab;
extern long shrink_page_cache_memcg(gfp_t mask, struct mem_cgroup *memcg,
unsigned long nr_pages);
extern unsigned long __pagecache_over_limit(void);
extern unsigned long pagecache_over_limit(void);
extern int kpagecache_limitd_run(void);
extern void kpagecache_limitd_stop(void);
@ -424,15 +428,6 @@ static inline bool pagecache_limit_should_shrink(void)
{
return unlikely(vm_pagecache_limit_pages) && pagecache_over_limit();
}
#else
extern inline void shrink_page_cache(gfp_t mask, struct page *page)
{
}
static inline bool pagecache_limit_should_shrink(void)
{
return 0;
}
#endif
#define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
#define MEMCG_RECLAIM_PROACTIVE (1 << 2)


@ -87,6 +87,8 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, void *,
int proc_do_large_bitmap(struct ctl_table *, int, void *, size_t *, loff_t *);
int netcls_do_large_bitmap(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos);
int proc_pagecache_system_usage(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
int proc_do_static_key(struct ctl_table *table, int write, void *buffer,
size_t *lenp, loff_t *ppos);


@ -592,8 +592,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
buffer, lenp, ppos, conv, data);
}
#ifdef CONFIG_PAGECACHE_LIMIT
#define ADDITIONAL_RECLAIM_RATIO 2
static int setup_pagecache_limit(void)
{
/* reclaim ADDITIONAL_RECLAIM_PAGES more than limit. */
@ -661,7 +659,6 @@ static int pc_limit_async_handler(struct ctl_table *table, int write,
return ret;
}
#endif /* CONFIG_PAGECACHE_LIMIT */
static int do_proc_douintvec_w(unsigned int *tbl_data,
struct ctl_table *table,
@ -2609,6 +2606,8 @@ static struct ctl_table kern_table[] = {
{ }
};
unsigned long vm_pagecache_system_usage;
static struct ctl_table vm_table[] = {
{
.procname = "overcommit_memory",
@ -2852,7 +2851,6 @@ static struct ctl_table vm_table[] = {
.extra2 = (void *)&mmap_rnd_compat_bits_max,
},
#endif
#ifdef CONFIG_PAGECACHE_LIMIT
{
.procname = "pagecache_limit_ratio",
.data = &vm_pagecache_limit_ratio,
@ -2892,8 +2890,32 @@ static struct ctl_table vm_table[] = {
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#endif /* CONFIG_PAGECACHE_LIMIT */
#ifdef CONFIG_MEMCG
{
.procname = "pagecache_limit_global",
.data = &vm_pagecache_limit_global,
.maxlen = sizeof(vm_pagecache_limit_global),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{
.procname = "pagecache_limit_retry_times",
.data = &vm_pagecache_limit_retry_times,
.maxlen = sizeof(vm_pagecache_limit_retry_times),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_MAXOLDUID,
},
{
.procname = "pagecache_system_usage",
.data = &vm_pagecache_system_usage,
.maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = proc_pagecache_system_usage,
},
{
.procname = "memory_qos",
.data = &sysctl_vm_memory_qos,

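The vm_table entries above add three knobs under /proc/sys/vm: pagecache_limit_global selects the global limit path (1) or the per-cgroup path (0), pagecache_limit_retry_times bounds how many zero-progress reclaim rounds are tolerated, and pagecache_system_usage is read-only and refreshed from __pagecache_over_limit() by proc_pagecache_system_usage() on every read. A small sketch of reading the latter; the 4 KiB page size used for the MiB conversion is an assumption.

/*
 * Illustrative sketch (not part of the patch): read the read-only
 * vm.pagecache_system_usage sysctl, which reports reclaimable page
 * cache in pages.
 */
#include <stdio.h>

int main(void)
{
    unsigned long pages;
    FILE *f = fopen("/proc/sys/vm/pagecache_system_usage", "r");

    if (!f) {
        perror("pagecache_system_usage");
        return 1;
    }
    if (fscanf(f, "%lu", &pages) != 1) {
        fclose(f);
        return 1;
    }
    fclose(f);
    /* Assumes 4 KiB pages. */
    printf("reclaimable page cache: %lu pages (~%lu MiB)\n",
           pages, pages * 4 / 1024);
    return 0;
}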

@ -1271,14 +1271,6 @@ config LOCK_MM_AND_FIND_VMA
source "mm/damon/Kconfig"
config PAGECACHE_LIMIT
bool "Page cache limit"
help
This option allows the user to set a limit for the page cache.
For details, see Documentation/mm/pagecache-limit.
If unsure, say N.
config ENHANCED_MM
bool "Enable enhanced mm support (EMM)"
depends on MEMCG


@ -848,6 +848,9 @@ noinline int __filemap_add_folio(struct address_space *mapping,
int huge = folio_test_hugetlb(folio);
bool charged = false;
long nr = 1;
#ifdef CONFIG_MEMCG
struct mem_cgroup *memcg;
#endif
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
@ -861,6 +864,14 @@ noinline int __filemap_add_folio(struct address_space *mapping,
charged = true;
xas_set_order(&xas, index, folio_order(folio));
nr = folio_nr_pages(folio);
#ifdef CONFIG_MEMCG
/* For a successful charge, folio->memcg_data must be set. */
memcg = folio_memcg(folio);
for (; memcg; memcg = parent_mem_cgroup(memcg))
mem_cgroup_shrink_pagecache(memcg, gfp);
#endif
}
gfp &= GFP_RECLAIM_MASK;


@ -112,6 +112,11 @@ static bool cgroup_memory_nobpf __ro_after_init;
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
#define MEMCG_PAGECACHE_RETRIES 20
#define DEFAULT_PAGE_RECLAIM_RATIO 5
#define PAGECACHE_MAX_RATIO_MIN 5
#define PAGECACHE_MAX_RATIO_MAX 100
int sysctl_vm_memory_qos;
/* default has none reclaim priority */
int sysctl_vm_qos_highest_reclaim_prio = CGROUP_PRIORITY_MAX;
@ -254,21 +259,6 @@ enum res_type {
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
* be used for reference counting.
*/
#define for_each_mem_cgroup_tree(iter, root) \
for (iter = mem_cgroup_iter(root, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(root, iter, NULL))
#define for_each_mem_cgroup(iter) \
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
static inline bool task_is_dying(void)
{
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
@ -890,6 +880,13 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
memcg_rstat_updated(memcg, val);
if (idx == NR_FILE_PAGES) {
if (val > 0)
page_counter_charge(&memcg->pagecache, val);
else
page_counter_uncharge(&memcg->pagecache, -val);
}
memcg_stats_unlock();
}
@ -4005,6 +4002,8 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
}
#endif
static void pagecache_set_limit(struct mem_cgroup *memcg);
static DEFINE_MUTEX(memcg_max_mutex);
static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
@ -4062,6 +4061,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
if (enlarge)
memcg_oom_recover(memcg);
pagecache_set_limit(memcg);
}
return ret;
@ -4206,6 +4206,134 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
return -EINVAL;
}
#define MIN_PAGECACHE_PAGES 16
unsigned int
vm_pagecache_limit_retry_times __read_mostly = MEMCG_PAGECACHE_RETRIES;
void mem_cgroup_shrink_pagecache(struct mem_cgroup *memcg, gfp_t gfp_mask)
{
long pages_reclaimed;
unsigned long pages_used, pages_max, goal_pages_used, pre_used;
unsigned int retry_times = 0;
unsigned int limit_retry_times;
u32 max_ratio;
if (!sysctl_vm_memory_qos || vm_pagecache_limit_global)
return;
if (!memcg || mem_cgroup_is_root(memcg))
return;
max_ratio = READ_ONCE(memcg->pagecache_max_ratio);
if (max_ratio == PAGECACHE_MAX_RATIO_MAX)
return;
pages_max = READ_ONCE(memcg->pagecache.max);
if (pages_max == PAGE_COUNTER_MAX)
return;
if (unlikely(task_is_dying()))
return;
if (unlikely(current->flags & PF_MEMALLOC))
return;
if (unlikely(task_in_memcg_oom(current)))
return;
if (!gfpflags_allow_blocking(gfp_mask))
return;
pages_used = page_counter_read(&memcg->pagecache);
limit_retry_times = READ_ONCE(vm_pagecache_limit_retry_times);
goal_pages_used = (100 - READ_ONCE(memcg->pagecache_reclaim_ratio))
* pages_max / 100;
goal_pages_used = max_t(unsigned long, MIN_PAGECACHE_PAGES,
goal_pages_used);
if (pages_used > pages_max)
memcg_memory_event(memcg, MEMCG_PAGECACHE_MAX);
while (pages_used > goal_pages_used) {
if (fatal_signal_pending(current))
break;
pre_used = pages_used;
pages_reclaimed = shrink_page_cache_memcg(gfp_mask, memcg,
pages_used - goal_pages_used);
if (pages_reclaimed == -EINVAL)
return;
if (limit_retry_times == 0)
goto next_shrink;
if (pages_reclaimed == 0) {
io_schedule_timeout(HZ/10);
retry_times++;
} else
retry_times = 0;
if (retry_times > limit_retry_times) {
pr_warn("Attempts to recycle many times have not recovered enough pages.\n");
break;
}
next_shrink:
pages_used = page_counter_read(&memcg->pagecache);
cond_resched();
}
}
static u64 pagecache_reclaim_ratio_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return memcg->pagecache_reclaim_ratio;
}
static ssize_t pagecache_reclaim_ratio_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
u64 reclaim_ratio;
int ret;
unsigned long nr_pages;
if (!sysctl_vm_memory_qos) {
pr_warn("you should open vm.memory_qos.\n");
return -EINVAL;
}
if (vm_pagecache_limit_global) {
pr_warn("you should clear vm_pagecache_limit_global.\n");
return -EINVAL;
}
buf = strstrip(buf);
if (!buf)
return -EINVAL;
ret = kstrtou64(buf, 0, &reclaim_ratio);
if (ret)
return ret;
if ((reclaim_ratio > 0) && (reclaim_ratio < 100)) {
memcg->pagecache_reclaim_ratio = reclaim_ratio;
mem_cgroup_shrink_pagecache(memcg, GFP_KERNEL);
return nbytes;
} else if (reclaim_ratio == 100) {
nr_pages = page_counter_read(&memcg->pagecache);
//try reclaim once
shrink_page_cache_memcg(GFP_KERNEL, memcg, nr_pages);
return nbytes;
}
return -EINVAL;
}
static u64 mem_cgroup_priority_oom_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@ -4226,6 +4354,134 @@ static int mem_cgroup_priority_oom_write(struct cgroup_subsys_state *css,
return 0;
}
static u64 pagecache_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return (u64)page_counter_read(&memcg->pagecache) * PAGE_SIZE;
}
static u64 memory_pagecache_max_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
return memcg->pagecache_max_ratio;
}
unsigned long mem_cgroup_pagecache_get_reclaim_pages(struct mem_cgroup *memcg)
{
unsigned long goal_pages_used, pages_used, pages_max;
if ((!memcg) || (mem_cgroup_is_root(memcg)))
return 0;
pages_max = READ_ONCE(memcg->pagecache.max);
if (pages_max == PAGE_COUNTER_MAX)
return 0;
goal_pages_used = (100 - READ_ONCE(memcg->pagecache_reclaim_ratio))
* pages_max / 100;
goal_pages_used = max_t(unsigned long, MIN_PAGECACHE_PAGES,
goal_pages_used);
pages_used = page_counter_read(&memcg->pagecache);
return pages_used > pages_max ? pages_used - goal_pages_used : 0;
}
static void pagecache_set_limit(struct mem_cgroup *memcg)
{
unsigned long max, pages_max;
u32 max_ratio;
pages_max = READ_ONCE(memcg->memory.max);
max_ratio = READ_ONCE(memcg->pagecache_max_ratio);
max = ((pages_max * max_ratio) / 100);
xchg(&memcg->pagecache.max, max);
}
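pagecache_set_limit() above derives the per-cgroup cap from memory.max and pagecache_max_ratio, and mem_cgroup_shrink_pagecache() / mem_cgroup_pagecache_get_reclaim_pages() reclaim down to (100 - pagecache_reclaim_ratio)% of that cap, never below MIN_PAGECACHE_PAGES. A worked example with assumed numbers (1 GiB memory.max, a 30% cap, the default 5% reclaim ratio, 4 KiB pages):

/* Worked example with assumed numbers; mirrors the kernel formulas above. */
#include <stdio.h>

int main(void)
{
    unsigned long memory_max = 262144;  /* memory.max: 1 GiB in 4 KiB pages */
    unsigned int max_ratio = 30;        /* memory.pagecache.max_ratio */
    unsigned int reclaim_ratio = 5;     /* memory.pagecache.reclaim_ratio (default) */

    /* pagecache_set_limit(): pagecache.max = memory.max * max_ratio / 100 */
    unsigned long pagecache_max = memory_max * max_ratio / 100;

    /* Reclaim goal: (100 - reclaim_ratio)% of pagecache.max. */
    unsigned long goal = (100 - reclaim_ratio) * pagecache_max / 100;

    printf("pagecache.max  = %lu pages (~%lu MiB)\n",
           pagecache_max, pagecache_max * 4 / 1024);
    printf("reclaim target = %lu pages (~%lu MiB)\n",
           goal, goal * 4 / 1024);
    return 0;
}

With these inputs the cap is 78643 pages (~307 MiB) and reclaim aims for 74710 pages (~291 MiB).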
static ssize_t memory_pagecache_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_reclaims = vm_pagecache_limit_retry_times;
unsigned long max;
long pages_reclaimed;
int ret = 0;
u64 max_ratio, old;
if (!sysctl_vm_memory_qos) {
pr_warn("you should open vm.memory_qos.\n");
return -EINVAL;
}
if (vm_pagecache_limit_global) {
pr_warn("you should clear vm_pagecache_limit_global.\n");
return -EINVAL;
}
if (!buf)
return -EINVAL;
ret = kstrtou64(buf, 0, &max_ratio);
if (ret)
return ret;
if (max_ratio > PAGECACHE_MAX_RATIO_MAX ||
max_ratio < PAGECACHE_MAX_RATIO_MIN)
return -EINVAL;
if (READ_ONCE(memcg->memory.max) == PAGE_COUNTER_MAX) {
pr_warn("pagecache limit not allowed for cgroup without memory limit set\n");
return -EPERM;
}
old = READ_ONCE(memcg->pagecache_max_ratio);
memcg->pagecache_max_ratio = max_ratio;
pagecache_set_limit(memcg);
max = READ_ONCE(memcg->pagecache.max);
for (;;) {
unsigned long pages_used = page_counter_read(&memcg->pagecache);
if (pages_used <= max)
break;
if (fatal_signal_pending(current)) {
ret = -EINTR;
break;
}
if (nr_reclaims) {
pages_reclaimed =
shrink_page_cache_memcg(GFP_KERNEL, memcg,
mem_cgroup_pagecache_get_reclaim_pages(memcg));
if (pages_reclaimed == -EINVAL) {
pr_warn("you should clear vm_pagecache_limit_global.\n");
return -EINVAL;
}
if (pages_reclaimed == 0) {
io_schedule_timeout(HZ/10);
nr_reclaims--;
cond_resched();
} else
nr_reclaims = vm_pagecache_limit_retry_times;
continue;
}
memcg->pagecache_max_ratio = old;
pagecache_set_limit(memcg);
pr_warn("Attempts to recycle many times have not recovered enough pages.\n");
return -EINVAL;
}
return ret ? : nbytes;
}
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
unsigned long val;
@ -6129,6 +6385,23 @@ static struct cftype mem_cgroup_legacy_files[] = {
.write_u64 = mem_cgroup_hierarchy_write,
.read_u64 = mem_cgroup_hierarchy_read,
},
{
.name = "pagecache.reclaim_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = pagecache_reclaim_ratio_read,
.write = pagecache_reclaim_ratio_write,
},
{
.name = "pagecache.max_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = memory_pagecache_max_read,
.write = memory_pagecache_max_write,
},
{
.name = "pagecache.current",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = pagecache_current_read,
},
{
.name = "use_priority_oom",
.write_u64 = mem_cgroup_priority_oom_write,
@ -6589,6 +6862,8 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX);
memcg->pagecache_reclaim_ratio = DEFAULT_PAGE_RECLAIM_RATIO;
memcg->pagecache_max_ratio = PAGECACHE_MAX_RATIO_MAX;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
memcg->zswap_max = PAGE_COUNTER_MAX;
#endif
@ -6619,12 +6894,14 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_init(&memcg->swap, &parent->swap);
page_counter_init(&memcg->kmem, &parent->kmem);
page_counter_init(&memcg->tcpmem, &parent->tcpmem);
page_counter_init(&memcg->pagecache, &parent->pagecache);
} else {
init_memcg_events();
page_counter_init(&memcg->memory, NULL);
page_counter_init(&memcg->swap, NULL);
page_counter_init(&memcg->kmem, NULL);
page_counter_init(&memcg->tcpmem, NULL);
page_counter_init(&memcg->pagecache, NULL);
}
setup_async_wmark(memcg);
@ -6791,6 +7068,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
page_counter_set_max(&memcg->pagecache, PAGE_COUNTER_MAX);
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
page_counter_set_async_high(&memcg->memory, PAGE_COUNTER_MAX);
@ -7857,6 +8135,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
setup_async_wmark(memcg);
if (need_memcg_async_reclaim(memcg))
queue_work(memcg_async_reclaim_wq, &memcg->async_work);
pagecache_set_limit(memcg);
memcg_wb_domain_size_changed(memcg);
return nbytes;
@ -7872,6 +8151,10 @@ static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
atomic_long_read(&events[MEMCG_OOM_KILL]));
seq_printf(m, "oom_group_kill %lu\n",
atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
seq_printf(m, "pagecache_max %lu\n",
atomic_long_read(&events[MEMCG_PAGECACHE_MAX]));
seq_printf(m, "pagecache_oom %lu\n",
atomic_long_read(&events[MEMCG_PAGECACHE_OOM]));
}
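The two new counters surface in the cgroup's memory.events file as pagecache_max (bumped in mem_cgroup_shrink_pagecache() when usage is found above the cap) and pagecache_oom. A small sketch that filters them out of memory.events; the cgroup path is an assumption:

/* Illustrative sketch (not part of the patch); the cgroup path is assumed. */
#include <stdio.h>
#include <string.h>

int main(void)
{
    char key[64];
    unsigned long val;
    FILE *f = fopen("/sys/fs/cgroup/test/memory.events", "r");

    if (!f) {
        perror("memory.events");
        return 1;
    }
    while (fscanf(f, "%63s %lu", key, &val) == 2) {
        if (!strcmp(key, "pagecache_max") || !strcmp(key, "pagecache_oom"))
            printf("%s %lu\n", key, val);
    }
    fclose(f);
    return 0;
}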
static int memory_events_show(struct seq_file *m, void *v)
@ -8114,6 +8397,23 @@ static ssize_t memory_async_wmark_delta_write(struct kernfs_open_file *of,
}
static struct cftype memory_files[] = {
{
.name = "pagecache.reclaim_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = pagecache_reclaim_ratio_read,
.write = pagecache_reclaim_ratio_write,
},
{
.name = "pagecache.max_ratio",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = memory_pagecache_max_read,
.write = memory_pagecache_max_write,
},
{
.name = "pagecache.current",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = pagecache_current_read,
},
{
.name = "current",
.flags = CFTYPE_NOT_ON_ROOT,


@ -68,6 +68,7 @@
#include <linux/swapops.h>
#include <linux/balloon_compaction.h>
#include <linux/sched/sysctl.h>
#include <linux/cpumask.h>
#include "internal.h"
#include "swap.h"
@ -505,6 +506,24 @@ static bool writeback_throttling_sane(struct scan_control *sc)
return false;
}
#else
#define sysctl_vm_memory_qos 0
/*
* Iteration constructs for visiting all cgroups (under a tree). If
* loops are exited prematurely (break), mem_cgroup_iter_break() must
* be used for reference counting.
*/
#define for_each_mem_cgroup_tree(iter, root) \
for (iter = mem_cgroup_iter(root, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(root, iter, NULL))
#define for_each_mem_cgroup(iter) \
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
return -ENOSYS;
@ -7400,69 +7419,97 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
EXPORT_SYMBOL_GPL(try_to_free_mem_cgroup_pages);
#endif
#ifdef CONFIG_PAGECACHE_LIMIT
int vm_pagecache_limit_ratio __read_mostly;
int vm_pagecache_limit_reclaim_ratio __read_mostly;
unsigned long vm_pagecache_limit_pages __read_mostly;
unsigned long vm_pagecache_limit_reclaim_pages __read_mostly;
unsigned int vm_pagecache_ignore_dirty __read_mostly = 1;
unsigned int vm_pagecache_limit_async __read_mostly;
unsigned int vm_pagecache_limit_global __read_mostly;
unsigned int vm_pagecache_ignore_slab __read_mostly = 1;
static struct task_struct *kpclimitd;
static bool kpclimitd_context;
extern unsigned long vm_pagecache_system_usage;
unsigned long __pagecache_over_limit(void)
{
unsigned long pgcache_lru_pages = 0;
/*
* We only want to limit unmapped and non-shmem page cache pages,
* normally all shmem pages are mapped as well.
*/
unsigned long pgcache_pages = global_node_page_state(NR_FILE_PAGES)
- max_t(unsigned long,
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM));
/*
* We certainly can't free more than what's on the LRU lists
* minus the dirty ones.
*/
if (vm_pagecache_ignore_slab)
pgcache_lru_pages = global_node_page_state(NR_ACTIVE_FILE)
+ global_node_page_state(NR_INACTIVE_FILE);
else
pgcache_lru_pages = global_node_page_state(NR_ACTIVE_FILE)
+ global_node_page_state(NR_INACTIVE_FILE)
+ global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B)
+ global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B);
if (vm_pagecache_ignore_dirty != 0)
pgcache_lru_pages -= global_node_page_state(NR_FILE_DIRTY) /
vm_pagecache_ignore_dirty;
/* Paranoia */
if (unlikely(pgcache_lru_pages > LONG_MAX))
return 0;
/* Limit it to 94% of LRU (not all there might be unmapped). */
pgcache_lru_pages -= pgcache_lru_pages / 16;
if (vm_pagecache_ignore_slab)
pgcache_pages = min_t(unsigned long, pgcache_pages, pgcache_lru_pages);
else
pgcache_pages = pgcache_lru_pages;
return pgcache_pages;
}
int proc_pagecache_system_usage(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
vm_pagecache_system_usage = __pagecache_over_limit();
return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
/*
* Returns a number that's positive if the pagecache is above
* the set limit
* the set limit.
*/
unsigned long pagecache_over_limit(void)
{
unsigned long should_reclaim_pages = 0;
unsigned long overlimit_pages = 0;
unsigned long delta_pages = 0;
unsigned long pgcache_lru_pages = 0;
/* We only want to limit unmapped and non-shmem page cache pages;
* normally all shmem pages are mapped as well*/
unsigned long pgcache_pages = global_node_page_state(NR_FILE_PAGES)
- max_t(unsigned long,
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM));
/* We certainly can't free more than what's on the LRU lists
* minus the dirty ones*/
if (vm_pagecache_ignore_slab)
pgcache_lru_pages = global_node_page_state(NR_ACTIVE_FILE)
+ global_node_page_state(NR_INACTIVE_FILE);
else
pgcache_lru_pages = global_node_page_state(NR_ACTIVE_FILE)
+ global_node_page_state(NR_INACTIVE_FILE)
+ global_node_page_state(NR_SLAB_RECLAIMABLE_B)
+ global_node_page_state(NR_SLAB_UNRECLAIMABLE_B);
unsigned long pgcache_pages = 0;
if (vm_pagecache_ignore_dirty != 0)
pgcache_lru_pages -= global_node_page_state(NR_FILE_DIRTY)
/vm_pagecache_ignore_dirty;
/* Paranoia */
if (unlikely(pgcache_lru_pages > LONG_MAX))
return 0;
/* Limit it to 94% of LRU (not all there might be unmapped) */
pgcache_lru_pages -= pgcache_lru_pages/16;
if (vm_pagecache_ignore_slab)
pgcache_pages = min_t(unsigned long, pgcache_pages, pgcache_lru_pages);
else
pgcache_pages = pgcache_lru_pages;
pgcache_pages = __pagecache_over_limit();
/*
*delta_pages: we should reclaim at least 2% more pages than overlimit_page, values get from
* /proc/vm/pagecache_limit_reclaim_pages
*should_reclaim_pages: the real pages we will reclaim, but it should less than pgcache_pages;
*/
* delta_pages: we should reclaim at least 2% more pages than overlimit_page,
* values get from /proc/vm/pagecache_limit_reclaim_pages.
* should_reclaim_pages: the real pages we will reclaim,
* but it should less than pgcache_pages.
*/
if (pgcache_pages > vm_pagecache_limit_pages) {
overlimit_pages = pgcache_pages - vm_pagecache_limit_pages;
delta_pages = vm_pagecache_limit_reclaim_pages - vm_pagecache_limit_pages;
should_reclaim_pages = min_t(unsigned long, delta_pages, vm_pagecache_limit_pages) + overlimit_pages;
should_reclaim_pages = min_t(unsigned long, delta_pages, vm_pagecache_limit_pages)
+ overlimit_pages;
return should_reclaim_pages;
}
return 0;
}
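A worked example of the two helpers above, assuming vm.pagecache_ignore_slab=1 and vm.pagecache_ignore_dirty=1 (the defaults); all counter and limit values are made up for illustration:

/* Worked example; every value below is an assumption. */
#include <stdio.h>

int main(void)
{
    /* Assumed vmstat-style counters, in pages. */
    unsigned long file_pages = 2000000, file_mapped = 300000, shmem = 200000;
    unsigned long lru_file = 1800000, dirty = 100000;
    unsigned long ignore_dirty = 1;          /* vm.pagecache_ignore_dirty */
    unsigned long limit = 1200000;           /* vm_pagecache_limit_pages */
    unsigned long reclaim_limit = 1250000;   /* vm_pagecache_limit_reclaim_pages */
    unsigned long cache, lru, should_reclaim = 0;

    /* __pagecache_over_limit(): unmapped, non-shmem page cache, capped by
     * what is actually on the file LRUs minus dirty pages and ~6% slack. */
    cache = file_pages - (file_mapped > shmem ? file_mapped : shmem);
    lru = lru_file - dirty / ignore_dirty;
    lru -= lru / 16;
    if (cache > lru)
        cache = lru;

    /* pagecache_over_limit(): the overshoot plus some extra headroom. */
    if (cache > limit) {
        unsigned long over = cache - limit;
        unsigned long delta = reclaim_limit - limit;

        should_reclaim = (delta < limit ? delta : limit) + over;
    }
    printf("reclaimable cache %lu pages, reclaim request %lu pages\n",
           cache, should_reclaim);
    return 0;
}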
@ -7648,7 +7695,8 @@ out:
* This function is similar to shrink_all_memory, except that it may never
* swap out mapped pages and only does four passes.
*/
static void __shrink_page_cache(gfp_t mask)
static unsigned long __shrink_page_cache(gfp_t mask, struct mem_cgroup *memcg,
unsigned long nr_pages)
{
unsigned long ret = 0;
int pass = 0;
@ -7660,11 +7708,10 @@ static void __shrink_page_cache(gfp_t mask)
.may_unmap = 0,
.may_writepage = 0,
.may_deactivate = DEACTIVATE_FILE,
.target_mem_cgroup = NULL,
.target_mem_cgroup = memcg,
.reclaim_idx = MAX_NR_ZONES,
};
struct reclaim_state *old_rs = current->reclaim_state;
long nr_pages;
/* We might sleep during direct reclaim so make atomic context
* is certainly a bug.
@ -7672,9 +7719,6 @@ static void __shrink_page_cache(gfp_t mask)
BUG_ON(!(mask & __GFP_RECLAIM));
retry:
/* How many pages are we over the limit?*/
nr_pages = pagecache_over_limit();
/*
* Return early if there's no work to do.
* Wake up reclaimers that couldn't scan any zone due to congestion.
@ -7682,7 +7726,7 @@ retry:
* This makes sure that no sleeping reclaimer will stay behind.
* Allow breaching the limit if the task is on the way out.
*/
if (nr_pages <= 0 || fatal_signal_pending(current)) {
if (nr_pages == 0 || fatal_signal_pending(current)) {
wake_up_interruptible(&pagecache_reclaim_wq);
goto out;
}
@ -7719,9 +7763,10 @@ retry:
goto out;
for_each_online_node(nid) {
struct mem_cgroup *memcg = NULL;
while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL)
shrink_slab(mask, nid, memcg, sc.priority);
struct mem_cgroup *iter;
for_each_mem_cgroup_tree(iter, memcg)
shrink_slab(mask, nid, iter, sc.priority);
}
ret += reclaim_state.reclaimed;
reclaim_state.reclaimed = 0;
@ -7741,8 +7786,11 @@ retry:
out:
current->reclaim_state = old_rs;
return sc.nr_reclaimed;
}
void batch_shrink_page_cache(gfp_t mask);
static int kpagecache_limitd(void *data)
{
DEFINE_WAIT(wait);
@ -7755,7 +7803,9 @@ static int kpagecache_limitd(void *data)
wake_up_interruptible(&pagecache_reclaim_wq);
for (;;) {
__shrink_page_cache(GFP_KERNEL);
if (pagecache_limit_should_shrink())
batch_shrink_page_cache(GFP_KERNEL);
prepare_to_wait(&kpagecache_limitd_wq, &wait, TASK_INTERRUPTIBLE);
if (!kthread_should_stop())
@ -7777,14 +7827,66 @@ static void wakeup_kpclimitd(gfp_t mask)
wake_up_interruptible(&kpagecache_limitd_wq);
}
void batch_shrink_page_cache(gfp_t mask)
{
int reclaim_ratio, goal, retry_limit = 10, retry = 0;
unsigned long goals, currents, batchs, reclaims, reclaimed;
int tmp_reclaim_ratio = vm_pagecache_limit_reclaim_ratio;
int tmp_limit_ratio = vm_pagecache_limit_ratio;
reclaim_ratio = max_t(int, tmp_reclaim_ratio - tmp_limit_ratio,
ADDITIONAL_RECLAIM_RATIO);
goal = tmp_limit_ratio - reclaim_ratio;
if (goal <= 0)
return;
reclaims = reclaim_ratio * totalram_pages() / 100;
if (vm_pagecache_limit_async == 0)
batchs = reclaims / num_online_cpus();
else
batchs = reclaims;
goals = goal * totalram_pages() / 100;
currents = __pagecache_over_limit();
while (currents > goals) {
if (fatal_signal_pending(current))
break;
reclaimed = __shrink_page_cache(mask, NULL, batchs);
if (reclaimed == 0) {
io_schedule_timeout(HZ/10);
retry++;
} else
retry = 0;
if (retry > retry_limit)
break;
currents = __pagecache_over_limit();
cond_resched();
}
}
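For the global path, a worked example of how batch_shrink_page_cache() turns the two ratios into a stop goal and a per-call batch; the RAM size, CPU count and ratio values are assumptions:

/* Worked example; RAM size, CPU count and ratios are assumed values. */
#include <stdio.h>

int main(void)
{
    unsigned long totalram = 4194304;   /* 16 GiB in 4 KiB pages */
    int limit_ratio = 10;               /* vm.pagecache_limit_ratio */
    int reclaim_ratio_sysctl = 11;      /* vm.pagecache_limit_reclaim_ratio */
    int ncpus = 8;                      /* num_online_cpus() */
    int reclaim_ratio, goal_ratio;
    unsigned long goal, reclaims, batch;

    /* Reclaim at least ADDITIONAL_RECLAIM_RATIO (2) percent below the limit. */
    reclaim_ratio = reclaim_ratio_sysctl - limit_ratio;
    if (reclaim_ratio < 2)
        reclaim_ratio = 2;
    goal_ratio = limit_ratio - reclaim_ratio;
    if (goal_ratio <= 0)
        return 0;                       /* nothing to do, as in the kernel */

    goal = (unsigned long)goal_ratio * totalram / 100;      /* stop threshold */
    reclaims = (unsigned long)reclaim_ratio * totalram / 100;
    /* Synchronous callers split the batch across CPUs; the kpclimitd
     * thread (vm.pagecache_limit_async=1) takes it whole. */
    batch = reclaims / ncpus;

    printf("goal %lu pages, per-call batch %lu pages\n", goal, batch);
    return 0;
}

With these inputs the loop keeps calling __shrink_page_cache() in batches of 10485 pages until the reclaimable page cache drops below 335544 pages (8% of RAM).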
void shrink_page_cache(gfp_t mask, struct page *page)
{
if (0 == vm_pagecache_limit_async)
__shrink_page_cache(mask);
if (!sysctl_vm_memory_qos || !vm_pagecache_limit_global)
return;
if (vm_pagecache_limit_async == 0)
batch_shrink_page_cache(mask);
else
wakeup_kpclimitd(mask);
}
long shrink_page_cache_memcg(gfp_t mask, struct mem_cgroup *memcg,
unsigned long nr_pages)
{
if (!vm_pagecache_limit_global)
return __shrink_page_cache(mask, memcg, nr_pages);
return -EINVAL;
}
int kpagecache_limitd_run(void)
{
int ret = 0;
@ -7809,7 +7911,6 @@ void kpagecache_limitd_stop(void)
kpclimitd = NULL;
}
}
#endif /* CONFIG_PAGECACHE_LIMIT */
static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
{
@ -8053,6 +8154,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
.order = order,
.may_unmap = 1,
};
unsigned long nr_pages;
set_task_reclaim_state(current, &sc.reclaim_state);
psi_memstall_enter(&pflags);
@ -8060,11 +8162,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
count_vm_event(PAGEOUTRUN);
#ifdef CONFIG_PAGECACHE_LIMIT
/* This reclaims from all zones so don't count to sc.nr_reclaimed */
if (pagecache_limit_should_shrink())
__shrink_page_cache(GFP_KERNEL);
#endif /* CONFIG_PAGECACHE_LIMIT */
if (pagecache_limit_should_shrink()) {
nr_pages = pagecache_over_limit();
if (nr_pages)
shrink_page_cache(GFP_KERNEL, NULL);
}
/*
* Account for the reclaim boost. Note that the zone boost is left in