rue/io: add bufio isolation for cgroup v1
Add buffered IO isolation (bind_blkio) to cgroup v1, based on the v2 infrastructure, so that the interface for dio and bufio can be unified. Add a sysctl switch to allow migration into a cgroup whose memcg is already bound to a blkio cgroup. Signed-off-by: Haisu Wang <haisuwang@tencent.com> Signed-off-by: Chunguang Xu <brookxu@tencent.com> Signed-off-by: Lenny Chen <lennychen@tencent.com>
This commit is contained in:
parent
1b1b938068
commit
a12bb1a43d
|
@ -174,9 +174,7 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
|
|||
{
|
||||
struct backing_dev_info *bdi = inode_to_bdi(inode);
|
||||
|
||||
return cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
|
||||
cgroup_subsys_on_dfl(io_cgrp_subsys) &&
|
||||
(bdi->capabilities & BDI_CAP_WRITEBACK) &&
|
||||
return (bdi->capabilities & BDI_CAP_WRITEBACK) &&
|
||||
(inode->i_sb->s_iflags & SB_I_CGROUPWB);
|
||||
}
|
||||
|
||||
|
|
|
@ -412,6 +412,10 @@ struct mem_cgroup {
|
|||
atomic_long_t unevictable_size;
|
||||
#endif
|
||||
|
||||
/* attach a blkio with memcg for cgroup v1 */
|
||||
struct cgroup_subsys_state *bind_blkio;
|
||||
char *bind_blkio_path;
|
||||
|
||||
KABI_RESERVE(1);
|
||||
KABI_RESERVE(2);
|
||||
KABI_RESERVE(3);
|
||||
|
|
|
@ -60,6 +60,8 @@
|
|||
#include <linux/sched/deadline.h>
|
||||
#include <linux/psi.h>
|
||||
#include <net/sock.h>
|
||||
#include <linux/blk-cgroup.h>
|
||||
#include <linux/rue.h>
|
||||
|
||||
#ifdef CONFIG_CGROUP_SLI
|
||||
#include <linux/sli.h>
|
||||
|
@ -2796,6 +2798,10 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
|
|||
* using cgroup_migrate(), cgroup_migrate_finish() must be called on
|
||||
* @mgctx.
|
||||
*/
|
||||
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
|
||||
unsigned int sysctl_allow_memcg_migrate_ignore_blkio_bind = 1;
|
||||
#endif
|
||||
|
||||
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
|
||||
{
|
||||
struct css_set *src_cset, *tmp_cset;
|
||||
|
@ -2807,6 +2813,9 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
|
|||
mg_src_preload_node) {
|
||||
struct css_set *dst_cset;
|
||||
struct cgroup_subsys *ss;
|
||||
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
|
||||
struct cgroup_subsys_state *css;
|
||||
#endif
|
||||
int ssid;
|
||||
|
||||
dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
|
||||
|
@ -2815,6 +2824,21 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
|
|||
|
||||
WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
|
||||
|
||||
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
|
||||
css = dst_cset->subsys[memory_cgrp_id];
|
||||
if (rue_io_enabled() &&
|
||||
!sysctl_allow_memcg_migrate_ignore_blkio_bind && css) {
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
|
||||
|
||||
css = dst_cset->subsys[io_cgrp_id];
|
||||
if (css && memcg->bind_blkio && css != blkcg_root_css &&
|
||||
memcg->bind_blkio != css) {
|
||||
pr_err("memcg already bind blkio, disallow migrate");
|
||||
return -EPERM;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* If src cset equals dst, it's noop. Drop the src.
|
||||
* cgroup_migrate() will skip the cset too. Note that we
|
||||
|
|
|
@ -171,6 +171,10 @@ extern int sysctl_vm_ramdisk_swaptune;
|
|||
extern int sysctl_vm_swapcache_fastfree;
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
|
||||
extern unsigned int sysctl_allow_memcg_migrate_ignore_blkio_bind __read_mostly;
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_TKERNEL_SECURITY_MONITOR
|
||||
unsigned long connect_info_flag;
|
||||
unsigned long accept_info_flag;
|
||||
|
@ -2669,6 +2673,17 @@ static struct ctl_table kern_table[] = {
|
|||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE,
|
||||
},
|
||||
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
|
||||
{
|
||||
.procname = "allow_memcg_migrate_ignore_blkio_bind",
|
||||
.data = &sysctl_allow_memcg_migrate_ignore_blkio_bind,
|
||||
.maxlen = sizeof(unsigned int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE,
|
||||
},
|
||||
#endif
|
||||
{ }
|
||||
};
|
||||
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include <linux/writeback.h>
|
||||
#include <linux/device.h>
|
||||
#include <trace/events/writeback.h>
|
||||
#include <linux/rue.h>
|
||||
#include "internal.h"
|
||||
|
||||
struct backing_dev_info noop_backing_dev_info;
|
||||
|
@ -583,7 +584,15 @@ static int cgwb_create(struct backing_dev_info *bdi,
|
|||
int ret = 0;
|
||||
|
||||
memcg = mem_cgroup_from_css(memcg_css);
|
||||
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
|
||||
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
||||
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
|
||||
else {
|
||||
if (memcg->bind_blkio && rue_io_enabled())
|
||||
blkcg_css = memcg->bind_blkio;
|
||||
else
|
||||
blkcg_css = blkcg_root_css;
|
||||
css_get(blkcg_css);
|
||||
}
|
||||
memcg_cgwb_list = &memcg->cgwb_list;
|
||||
blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css);
|
||||
|
||||
|
@ -702,9 +711,19 @@ struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
|
|||
wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
|
||||
if (wb) {
|
||||
struct cgroup_subsys_state *blkcg_css;
|
||||
struct mem_cgroup *memcg;
|
||||
|
||||
/* see whether the blkcg association has changed */
|
||||
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
|
||||
memcg = mem_cgroup_from_css(memcg_css);
|
||||
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
||||
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
|
||||
else {
|
||||
if (memcg->bind_blkio && rue_io_enabled())
|
||||
blkcg_css = memcg->bind_blkio;
|
||||
else
|
||||
blkcg_css = blkcg_root_css;
|
||||
css_get(blkcg_css);
|
||||
}
|
||||
if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
|
||||
wb = NULL;
|
||||
css_put(blkcg_css);
|
||||
|
|
108
mm/memcontrol.c
108
mm/memcontrol.c
|
@ -65,6 +65,7 @@
|
|||
#include <linux/seq_buf.h>
|
||||
#include <linux/emm.h>
|
||||
#include <linux/sched/isolation.h>
|
||||
#include <linux/namei.h>
|
||||
#ifdef CONFIG_CGROUP_SLI
|
||||
#include <linux/sli.h>
|
||||
#endif
|
||||
|
@ -406,7 +407,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
|
|||
{
|
||||
struct mem_cgroup *memcg = folio_memcg(folio);
|
||||
|
||||
if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
|
||||
if (!memcg)
|
||||
memcg = root_mem_cgroup;
|
||||
|
||||
return &memcg->css;
|
||||
|
@ -5477,6 +5478,27 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
|
|||
READ_ONCE(memcg->memory.high));
|
||||
unsigned long used = page_counter_read(&memcg->memory);
|
||||
|
||||
/*
|
||||
* Create a cgroup hierarchy a/{b, c}, b and c have no limit set,
|
||||
* or the limit of b + c is greater than the limit of a. Then if
|
||||
* the task of b does buffer IO first, this will cause the
|
||||
* available memory of a to be greatly reduced. When the subsequent
|
||||
* task of c is started, the available memory of a is small,
|
||||
* resulting in the dirty page waterline of c being very small,
|
||||
* the IO of c is suppressed and cannot be delivered normally.
|
||||
* Since the file cache in b is recyclable, we can try to reclaim it,
|
||||
* so when calculating the headroom, the available memory of a
|
||||
* includes the file cache of a.
|
||||
*/
|
||||
if (memcg != mem_cgroup_from_css(wb->memcg_css)) {
|
||||
unsigned long file, dirty, writeback;
|
||||
|
||||
file = memcg_page_state(memcg, NR_FILE_PAGES);
|
||||
dirty = memcg_page_state(memcg, NR_FILE_DIRTY);
|
||||
writeback = memcg_page_state(memcg, NR_WRITEBACK);
|
||||
used -= file - dirty - writeback;
|
||||
}
|
||||
|
||||
*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
|
||||
memcg = parent;
|
||||
}
|
||||
|
@ -6200,6 +6222,78 @@ static int mem_cgroup_vmstat_read(struct seq_file *m, void *vv)
|
|||
return mem_cgroup_vmstat_read_comm(m, vv, memcg);
|
||||
}
|
||||
|
||||
/*
 * mem_cgroup_bind_blkio_write - bind this memcg to a blkio cgroup, or unbind.
 * @of:     open cgroup file; its css identifies the target memcg
 * @buf:    path of the blkio cgroup directory to bind to; an empty string
 *          (after whitespace stripping) removes the current binding
 * @nbytes: number of bytes written by userspace
 * @off:    file offset (unused)
 *
 * The new target is fully resolved before the current binding is touched,
 * so a failed write (bad path, not an online io cgroup directory, OOM)
 * leaves the existing binding intact.
 *
 * Returns @nbytes on success, -EPERM when rue_io_enabled() is false,
 * -ENOMEM when the path copy cannot be allocated, or the error reported by
 * kern_path() / css_tryget_online_from_dir().
 */
static ssize_t mem_cgroup_bind_blkio_write(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	struct cgroup_subsys_state *new_css = NULL;
	struct cgroup_subsys_state *old_css;
	char *new_path = NULL;
	char *old_path;
	struct path path;
	int ret;

	if (!rue_io_enabled())
		return -EPERM;

	buf = strstrip(buf);

	/*
	 * Resolve the requested blkio cgroup outside the mutex: allocation and
	 * path lookup may sleep for a long time and need no memcg state.
	 */
	if (*buf) {
		/* Exact-size copy: nothing is truncated, nothing is wasted. */
		new_path = kstrdup(buf, GFP_KERNEL);
		if (!new_path)
			return -ENOMEM;

		ret = kern_path(new_path, LOOKUP_FOLLOW, &path);
		if (ret)
			goto err_free;

		new_css = css_tryget_online_from_dir(path.dentry,
						     &io_cgrp_subsys);
		/* The css holds its own reference; the path is done with. */
		path_put(&path);
		if (IS_ERR(new_css)) {
			ret = PTR_ERR(new_css);
			goto err_free;
		}
	}

	mutex_lock(&memcg_max_mutex);

	old_css = memcg->bind_blkio;
	old_path = memcg->bind_blkio_path;

	if (old_css) {
		WARN_ON(!old_path);
		memcg->bind_blkio_path = NULL;
		memcg->bind_blkio = NULL;

		/*
		 * Drop the writeback state attached to this memcg and start
		 * from an empty list before installing the new binding.
		 * NOTE(review): presumably this releases the wbs created for
		 * the previous blkcg - confirm against wb_memcg_offline().
		 */
		wb_memcg_offline(memcg);
		INIT_LIST_HEAD(&memcg->cgwb_list);
	}

	/* Both are NULL for an unbind request. */
	memcg->bind_blkio_path = new_path;
	memcg->bind_blkio = new_css;

	mutex_unlock(&memcg_max_mutex);

	/* The old binding is no longer published; release it unlocked. */
	if (old_css)
		css_put(old_css);
	kfree(old_path);

	return nbytes;

err_free:
	kfree(new_path);
	return ret;
}
|
||||
|
||||
static int mem_cgroup_bind_blkio_show(struct seq_file *m, void *v)
|
||||
{
|
||||
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
|
||||
|
||||
if (memcg->bind_blkio_path)
|
||||
seq_printf(m, "%s\n", memcg->bind_blkio_path);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static u64 memory_current_read(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft);
|
||||
static int memory_low_show(struct seq_file *m, void *v);
|
||||
|
@ -7212,6 +7306,12 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
|
|||
static_branch_dec(&memcg_bpf_enabled_key);
|
||||
#endif
|
||||
|
||||
if (memcg->bind_blkio) {
|
||||
WARN_ON(!memcg->bind_blkio_path);
|
||||
kfree(memcg->bind_blkio_path);
|
||||
css_put(memcg->bind_blkio);
|
||||
}
|
||||
|
||||
vmpressure_cleanup(&memcg->vmpressure);
|
||||
cancel_work_sync(&memcg->high_work);
|
||||
cancel_work_sync(&memcg->async_work);
|
||||
|
@ -9679,6 +9779,12 @@ static struct cftype memsw_files[] = {
|
|||
.write = mem_cgroup_reset,
|
||||
.read_u64 = mem_cgroup_read_u64,
|
||||
},
|
||||
{
|
||||
.name = "bind_blkio",
|
||||
.flags = CFTYPE_NOT_ON_ROOT,
|
||||
.write = mem_cgroup_bind_blkio_write,
|
||||
.seq_show = mem_cgroup_bind_blkio_show,
|
||||
},
|
||||
{ }, /* terminate */
|
||||
};
|
||||
|
||||
|
|
Loading…
Reference in New Issue