anolis: mm: unevictable: add memcg granularity core implementation

commit 179050e6e6a2758db0549e42e3e5841050271488 openAnolis

Conflicts: add memory.text_unevictable_size to display the size of
	code segments.
	Picked from 5.4.

Backport-reason: Add code segment unevictable feature support [PATCH 7/8]

ANBZ: #2674

This patch introduces the core implementation of code-section
unevictability at memcg granularity; with it, per-memcg unevictable
support is fully functional.

It adds the global switch "/sys/kernel/mm/unevictable/enabled" to
enable or disable the feature, along with the per-memcg interfaces
"memory.allow_text_unevictable" and "memory.text_unevictable_percent"
to opt a memcg in and to cap how much of its code section may be made
unevictable.

The current unevictable text size can be read through the
memory.exstat interface or, with this backport, through the new
memory.text_unevictable_size file.
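
A minimal userspace sketch of how the knobs fit together (assuming a
cgroup v1 memory controller mounted at /sys/fs/cgroup/memory and a
hypothetical child group "app"; write_str is an ad-hoc helper and
error handling is elided):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>

	static void write_str(const char *path, const char *val)
	{
		int fd = open(path, O_WRONLY);

		if (fd >= 0) {
			write(fd, val, strlen(val));
			close(fd);
		}
	}

	int main(void)
	{
		/* Turn on the global switch. */
		write_str("/sys/kernel/mm/unevictable/enabled", "1");
		/* Opt the memcg in; cap locked text at 30% of its limit. */
		write_str("/sys/fs/cgroup/memory/app/memory.allow_text_unevictable", "1");
		write_str("/sys/fs/cgroup/memory/app/memory.text_unevictable_percent", "30");
		return 0;
	}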

Signed-off-by: Xin Hao <xhao@linux.alibaba.com>
Reviewed-by: Xu Yu <xuyu@linux.alibaba.com>
Link: https://gitee.com/anolis/cloud-kernel/pulls/953
Signed-off-by: Xin Hao <vernhao@tencent.com>

include/linux/memcontrol.h

@@ -352,6 +352,12 @@ struct mem_cgroup {
#ifdef CONFIG_TEXT_UNEVICTABLE
bool allow_unevictable;
unsigned int unevictable_percent;
/*
 * unevictable_size may exceed the real amount of unevictable memory,
 * since multiple tasks can lock the same memory, e.g. a shared
 * binary or dynamic library.
 */
atomic_long_t unevictable_size;
#endif
KABI_RESERVE(1);

include/linux/unevictable.h

@@ -3,6 +3,8 @@
#ifndef _TEXT_UNEVICTABLE_H
#define _TEXT_UNEVICTABLE_H
struct mem_cgroup;
#ifdef CONFIG_TEXT_UNEVICTABLE
DECLARE_STATIC_KEY_FALSE(unevictable_enabled_key);
@@ -10,10 +12,56 @@ static inline bool unevictable_enabled(void)
{
return static_branch_unlikely(&unevictable_enabled_key);
}
bool is_memcg_unevictable_enabled(struct mem_cgroup *memcg);
void memcg_increase_unevict_size(struct mem_cgroup *memcg, unsigned long size);
void memcg_decrease_unevict_size(struct mem_cgroup *memcg, unsigned long size);
bool is_unevictable_size_overflow(struct mem_cgroup *memcg);
unsigned long memcg_exstat_text_unevict_gather(struct mem_cgroup *memcg);
void mem_cgroup_can_unevictable(struct task_struct *tsk, struct mem_cgroup *to);
void mem_cgroup_cancel_unevictable(struct cgroup_taskset *tset);
void memcg_all_processes_unevict(struct mem_cgroup *memcg, bool enable);
void del_unevict_task(struct task_struct *tsk);
void clean_task_unevict_size(struct task_struct *tsk);
#else
static inline bool unevictable_enabled(void)
{
return false;
}
static inline bool is_memcg_unevictable_enabled(struct mem_cgroup *memcg)
{
return false;
}
static inline void memcg_increase_unevict_size(struct mem_cgroup *memcg,
unsigned long size)
{
}
static inline void memcg_decrease_unevict_size(struct mem_cgroup *memcg,
unsigned long size)
{
}
static inline bool is_unevictable_size_overflow(struct mem_cgroup *memcg)
{
return false;
}
static inline unsigned long memcg_exstat_text_unevict_gather(struct mem_cgroup *memcg)
{
return 0;
}
static inline void mem_cgroup_can_unevictable(struct task_struct *tsk,
struct mem_cgroup *to)
{
}
static inline void mem_cgroup_cancel_unevictable(struct cgroup_taskset *tset)
{
}
static inline void memcg_all_processes_unevict(struct mem_cgroup *memcg, bool enable)
{
}
static inline void del_unevict_task(struct task_struct *tsk)
{
}
static inline void clean_task_unevict_size(struct task_struct *tsk)
{
}
#endif
#endif

kernel/exit.c

@@ -69,6 +69,9 @@
#include <linux/rethook.h>
#include <linux/sysfs.h>
#include <linux/user_events.h>
#ifdef CONFIG_TEXT_UNEVICTABLE
#include <linux/unevictable.h>
#endif
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -856,6 +859,9 @@ void __noreturn do_exit(long code)
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
#ifdef CONFIG_TEXT_UNEVICTABLE
clean_task_unevict_size(tsk);
#endif
exit_mm();
if (group_dead)

mm/memcontrol.c

@@ -70,6 +70,9 @@
#include <net/ip.h>
#include "slab.h"
#include "swap.h"
#ifdef CONFIG_TEXT_UNEVICTABLE
#include <linux/unevictable.h>
#endif
#include <linux/uaccess.h>
@@ -4214,6 +4217,18 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
#endif
}
#ifdef CONFIG_TEXT_UNEVICTABLE
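/*
 * Backs the memory.text_unevictable_size file: reports the memcg's
 * locked text size in kilobytes. Example output (hypothetical value):
 *
 *	unevictable_text_size_kb 2048
 */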
static int memcg_unevict_size_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
seq_printf(m, "unevictable_text_size_kb %lu\n",
memcg_exstat_text_unevict_gather(memcg) >> 10);
return 0;
}
#endif
static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -5413,6 +5428,10 @@ static int mem_cgroup_allow_unevictable_write(struct cgroup_subsys_state *css,
return 0;
memcg->allow_unevictable = val;
if (val)
memcg_all_processes_unevict(memcg, true);
else
memcg_all_processes_unevict(memcg, false);
return 0;
}
@@ -5455,6 +5474,10 @@ static struct cftype mem_cgroup_legacy_files[] = {
.read_u64 = mem_cgroup_unevictable_percent_read,
.write_u64 = mem_cgroup_unevictable_percent_write,
},
{
.name = "text_unevictable_size",
.seq_show = memcg_unevict_size_show,
},
#endif
{
.name = "max_usage_in_bytes",
@@ -5870,6 +5893,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
#ifdef CONFIG_TEXT_UNEVICTABLE
memcg->unevictable_percent = 100;
atomic_long_set(&memcg->unevictable_size, 0);
#endif
if (parent) {
#ifdef CONFIG_TEXT_UNEVICTABLE
@@ -6674,6 +6698,10 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
if (!p)
return 0;
#ifdef CONFIG_TEXT_UNEVICTABLE
mem_cgroup_can_unevictable(p, memcg);
#endif
/*
* We are now committed to this value whatever it is. Changes in this
* tunable will only affect upcoming migrations, not the current one.
@@ -6717,6 +6745,9 @@ static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
#ifdef CONFIG_TEXT_UNEVICTABLE
mem_cgroup_cancel_unevictable(tset);
#endif
if (mc.to)
mem_cgroup_clear_mc();
}

mm/unevictable.c

@@ -40,6 +40,11 @@
#ifdef CONFIG_TEXT_UNEVICTABLE
DEFINE_STATIC_KEY_FALSE(unevictable_enabled_key);
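/*
 * Iterate over every memcg in the system: mem_cgroup_iter(NULL, ...)
 * performs a pre-order walk of the whole hierarchy from the root.
 */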
#define for_each_mem_cgroup(iter) \
for (iter = mem_cgroup_iter(NULL, NULL, NULL); \
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
#endif
struct evict_pids_t {
@@ -51,6 +56,9 @@ struct evict_pid_entry {
struct list_head list;
pid_t rootpid;
u64 start_time;
#ifdef CONFIG_TEXT_UNEVICTABLE
u64 unevict_size;
#endif
struct task_struct *tsk;
bool done;
};
@@ -102,6 +110,10 @@ static void __evict_pid(struct evict_pid_entry *pid)
if (!(mm->def_flags & VM_LOCKED)) {
struct vm_area_struct *vma, *prev = NULL;
vm_flags_t flag;
#ifdef CONFIG_TEXT_UNEVICTABLE
unsigned long size = 0;
struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
#endif
VMA_ITERATOR(vmi, mm, 0);
mmap_write_lock(mm);
@@ -113,10 +125,18 @@
flag = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
mlock_fixup(&vmi, vma, &prev,
vma->vm_start, vma->vm_end, flag);
#ifdef CONFIG_TEXT_UNEVICTABLE
size += vma->vm_end - vma->vm_start;
#endif
}
}
mmap_write_unlock(mm);
#ifdef CONFIG_TEXT_UNEVICTABLE
memcg_decrease_unevict_size(memcg, size);
css_put(&memcg->css);
pid->unevict_size -= size;
#endif
}
mmput(mm);
}
@@ -250,6 +270,9 @@ static void add_unevict_task(struct task_struct *tsk)
if (!result) {
result = new_entry;
result->rootpid = rootpid;
#ifdef CONFIG_TEXT_UNEVICTABLE
result->unevict_size = 0;
#endif
rb_link_node(&result->node, parent, link);
rb_insert_color(&result->node, &base_tree->root);
list_add_tail(&result->list, &pid_list);
@@ -294,6 +317,12 @@ static void unevict_pid(pid_t pid)
get_task_struct(tsk);
rcu_read_unlock();
#ifdef CONFIG_TEXT_UNEVICTABLE
if (is_memcg_unevictable_enabled(mem_cgroup_from_task(tsk))) {
put_task_struct(tsk);
return;
}
#endif
add_unevict_task(tsk);
put_task_struct(tsk);
}
@@ -431,6 +460,9 @@ static void execute_vm_lock(struct work_struct *unused)
mm = get_task_mm(tsk);
if (mm && !(mm->def_flags & VM_LOCKED)) {
#ifdef CONFIG_TEXT_UNEVICTABLE
struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
#endif
struct vm_area_struct *vma, *prev = NULL;
vm_flags_t flag;
@@ -438,6 +470,10 @@
mmap_write_lock(mm);
for_each_vma(vmi, vma) {
#ifdef CONFIG_TEXT_UNEVICTABLE
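/* Stop locking further VMAs once the memcg is over its budget. */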
if (is_unevictable_size_overflow(memcg))
break;
#endif
if (vma->vm_file &&
(vma->vm_flags & VM_EXEC) &&
(vma->vm_flags & VM_READ)) {
@@ -445,6 +481,9 @@
flag |= (VM_LOCKED | VM_LOCKONFAULT);
mlock_fixup(&vmi, vma, &prev,
vma->vm_start, vma->vm_end, flag);
#ifdef CONFIG_TEXT_UNEVICTABLE
result->unevict_size += vma->vm_end - vma->vm_start;
#endif
}
}
@@ -452,6 +491,11 @@
result->start_time = tsk->start_boottime;
result->done = true;
mmap_write_unlock(mm);
#ifdef CONFIG_TEXT_UNEVICTABLE
memcg_increase_unevict_size(memcg,
result->unevict_size);
css_put(&memcg->css);
#endif
} else {
list_del(&result->list);
__remove_entry(result);
@@ -546,6 +590,163 @@ const static struct proc_ops del_proc_fops = {
};
#ifdef CONFIG_TEXT_UNEVICTABLE
void clean_task_unevict_size(struct task_struct *tsk)
{
struct evict_pid_entry *result;
struct mem_cgroup *memcg;
/*
 * Bail out unless the unevictable machinery
 * has been initialized.
 */
if (!tsk || !base_tree)
return;
mutex_lock(&pid_mutex);
result = lookup_unevict_entry(tsk);
if (result) {
if (result->unevict_size) {
rcu_read_lock();
memcg = mem_cgroup_from_task(tsk);
memcg_decrease_unevict_size(memcg, result->unevict_size);
rcu_read_unlock();
}
list_del(&result->list);
__remove_entry(result);
mutex_unlock(&pid_mutex);
kfree(result);
} else
mutex_unlock(&pid_mutex);
}
bool is_memcg_unevictable_enabled(struct mem_cgroup *memcg)
{
if (!unevictable_enabled())
return false;
if (!memcg)
return false;
if (memcg->allow_unevictable)
return true;
return false;
}
void memcg_increase_unevict_size(struct mem_cgroup *memcg, unsigned long size)
{
atomic_long_add(size, &memcg->unevictable_size);
}
void memcg_decrease_unevict_size(struct mem_cgroup *memcg, unsigned long size)
{
atomic_long_sub(size, &memcg->unevictable_size);
}
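/*
 * Worked example (hypothetical numbers): with a 1 GiB memory limit
 * and unevictable_percent = 50, locking 600 MiB of text gives
 * 629145600 * 100 / 1073741824 = 58, and 58 >= 50 means the budget
 * is exhausted.
 */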
bool is_unevictable_size_overflow(struct mem_cgroup *memcg)
{
struct page_counter *counter;
u64 res_limit;
u64 size;
counter = &memcg->memory;
res_limit = (u64)counter->max * PAGE_SIZE;
size = atomic_long_read(&memcg->unevictable_size);
size = size * 100 / res_limit;
if (size >= memcg->unevictable_percent)
return true;
return false;
}
unsigned long memcg_exstat_text_unevict_gather(struct mem_cgroup *memcg)
{
return atomic_long_read(&memcg->unevictable_size);
}
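/*
 * Called from mem_cgroup_can_attach() when a task migrates between
 * memcgs: queue its text for locking when it enters a memcg that
 * allows unevictable text, and unlock it when it leaves one.
 */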
void mem_cgroup_can_unevictable(struct task_struct *tsk, struct mem_cgroup *to)
{
struct mem_cgroup *from;
if (!unevictable_enabled())
return;
from = mem_cgroup_from_task(tsk);
VM_BUG_ON(from == to);
if (to->allow_unevictable && !from->allow_unevictable) {
add_unevict_task(tsk);
schedule_delayed_work(&evict_work, HZ);
}
if (!to->allow_unevictable && from->allow_unevictable)
del_unevict_task(tsk);
}
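/*
 * Called from mem_cgroup_cancel_attach() to undo the work queued by
 * mem_cgroup_can_unevictable() when a migration is aborted.
 */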
void mem_cgroup_cancel_unevictable(struct cgroup_taskset *tset)
{
struct task_struct *tsk;
struct cgroup_subsys_state *dst_css;
struct mem_cgroup *memcg;
if (!unevictable_enabled())
return;
cgroup_taskset_for_each(tsk, dst_css, tset) {
memcg = mem_cgroup_from_task(tsk);
if (memcg->allow_unevictable)
del_unevict_task(tsk);
}
}
static inline int schedule_unevict_task(struct task_struct *tsk, void *arg)
{
add_unevict_task(tsk);
schedule_delayed_work(&evict_work, HZ);
return 0;
}
static inline int schedule_evict_task(struct task_struct *tsk, void *arg)
{
del_unevict_task(tsk);
return 0;
}
static inline void make_all_memcg_evictable(void)
{
struct mem_cgroup *memcg;
for_each_mem_cgroup(memcg) {
if (!memcg->allow_unevictable)
continue;
mem_cgroup_scan_tasks(memcg, schedule_unevict_task, NULL);
memcg->allow_unevictable = 0;
memcg->unevictable_percent = 100;
atomic_long_set(&memcg->unevictable_size, 0);
}
}
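/*
 * Lock or unlock the text of every process in @memcg and its
 * descendants, or in the whole hierarchy starting at root_mem_cgroup
 * when @memcg is NULL.
 */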
void memcg_all_processes_unevict(struct mem_cgroup *memcg, bool enable)
{
struct mem_cgroup *tmp_memcg;
if (!unevictable_enabled())
return;
if (!memcg)
tmp_memcg = root_mem_cgroup;
else
tmp_memcg = memcg;
if (enable)
mem_cgroup_scan_tasks(tmp_memcg, schedule_unevict_task, NULL);
else
mem_cgroup_scan_tasks(tmp_memcg, schedule_evict_task, NULL);
}
static int __init setup_unevictable(char *s)
{
if (!strcmp(s, "1"))
@@ -573,9 +774,10 @@ static ssize_t unevictable_enabled_store(struct kobject *kobj,
if (!strncmp(buf, "1", 1))
static_branch_enable(&unevictable_enabled_key);
else if (!strncmp(buf, "0", 1)) {
static_branch_disable(&unevictable_enabled_key);
make_all_memcg_evictable();
} else
ret = -EINVAL;
mutex_unlock(&mutex);