rue/io: add buffered-IO isolation via blkio binding for cgroup v1

Add a bind_blkio interface that brings buffered-IO isolation, built on the
cgroup v2 writeback infrastructure, to cgroup v1, so the interface for direct
IO and buffered IO can be unified.

Add a sysctl switch that controls whether tasks may migrate into a cgroup
whose memcg is already bound to a blkio cgroup.

Signed-off-by: Haisu Wang <haisuwang@tencent.com>
Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Signed-off-by: Lenny Chen <lennychen@tencent.com>
This commit is contained in:
Haisu Wang 2021-04-27 20:10:19 +08:00
parent 1b1b938068
commit a12bb1a43d
6 changed files with 172 additions and 6 deletions

View File

@ -174,9 +174,7 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
{
struct backing_dev_info *bdi = inode_to_bdi(inode);
return cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
cgroup_subsys_on_dfl(io_cgrp_subsys) &&
(bdi->capabilities & BDI_CAP_WRITEBACK) &&
return (bdi->capabilities & BDI_CAP_WRITEBACK) &&
(inode->i_sb->s_iflags & SB_I_CGROUPWB);
}

View File

@ -412,6 +412,10 @@ struct mem_cgroup {
atomic_long_t unevictable_size;
#endif
/* attach a blkio with memcg for cgroup v1 */
struct cgroup_subsys_state *bind_blkio;
char *bind_blkio_path;
KABI_RESERVE(1);
KABI_RESERVE(2);
KABI_RESERVE(3);

View File

@ -60,6 +60,8 @@
#include <linux/sched/deadline.h>
#include <linux/psi.h>
#include <net/sock.h>
#include <linux/blk-cgroup.h>
#include <linux/rue.h>
#ifdef CONFIG_CGROUP_SLI
#include <linux/sli.h>
@ -2796,6 +2798,10 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
* using cgroup_migrate(), cgroup_migrate_finish() must be called on
* @mgctx.
*/
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
unsigned int sysctl_allow_memcg_migrate_ignore_blkio_bind = 1;
#endif
int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
{
struct css_set *src_cset, *tmp_cset;
@ -2807,6 +2813,9 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
mg_src_preload_node) {
struct css_set *dst_cset;
struct cgroup_subsys *ss;
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
struct cgroup_subsys_state *css;
#endif
int ssid;
dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp);
@ -2815,6 +2824,21 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset);
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
css = dst_cset->subsys[memory_cgrp_id];
if (rue_io_enabled() &&
!sysctl_allow_memcg_migrate_ignore_blkio_bind && css) {
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
css = dst_cset->subsys[io_cgrp_id];
if (css && memcg->bind_blkio && css != blkcg_root_css &&
memcg->bind_blkio != css) {
pr_err("memcg already bind blkio, disallow migrate");
return -EPERM;
}
}
#endif
/*
* If src cset equals dst, it's noop. Drop the src.
* cgroup_migrate() will skip the cset too. Note that we

View File

@ -171,6 +171,10 @@ extern int sysctl_vm_ramdisk_swaptune;
extern int sysctl_vm_swapcache_fastfree;
#endif
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
extern unsigned int sysctl_allow_memcg_migrate_ignore_blkio_bind __read_mostly;
#endif
#ifdef CONFIG_TKERNEL_SECURITY_MONITOR
unsigned long connect_info_flag;
unsigned long accept_info_flag;
@ -2669,6 +2673,17 @@ static struct ctl_table kern_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
{
.procname = "allow_memcg_migrate_ignore_blkio_bind",
.data = &sysctl_allow_memcg_migrate_ignore_blkio_bind,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
#endif
{ }
};

View File

@ -16,6 +16,7 @@
#include <linux/writeback.h>
#include <linux/device.h>
#include <trace/events/writeback.h>
#include <linux/rue.h>
#include "internal.h"
struct backing_dev_info noop_backing_dev_info;
@ -583,7 +584,15 @@ static int cgwb_create(struct backing_dev_info *bdi,
int ret = 0;
memcg = mem_cgroup_from_css(memcg_css);
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
else {
if (memcg->bind_blkio && rue_io_enabled())
blkcg_css = memcg->bind_blkio;
else
blkcg_css = blkcg_root_css;
css_get(blkcg_css);
}
memcg_cgwb_list = &memcg->cgwb_list;
blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css);
@ -702,9 +711,19 @@ struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
if (wb) {
struct cgroup_subsys_state *blkcg_css;
struct mem_cgroup *memcg;
/* see whether the blkcg association has changed */
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
memcg = mem_cgroup_from_css(memcg_css);
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
else {
if (memcg->bind_blkio && rue_io_enabled())
blkcg_css = memcg->bind_blkio;
else
blkcg_css = blkcg_root_css;
css_get(blkcg_css);
}
if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
wb = NULL;
css_put(blkcg_css);

View File

@ -65,6 +65,7 @@
#include <linux/seq_buf.h>
#include <linux/emm.h>
#include <linux/sched/isolation.h>
#include <linux/namei.h>
#ifdef CONFIG_CGROUP_SLI
#include <linux/sli.h>
#endif
@ -406,7 +407,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_folio(struct folio *folio)
{
struct mem_cgroup *memcg = folio_memcg(folio);
if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
if (!memcg)
memcg = root_mem_cgroup;
return &memcg->css;
@ -5477,6 +5478,27 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
READ_ONCE(memcg->memory.high));
unsigned long used = page_counter_read(&memcg->memory);
/*
* Create a cgroup hierarchy a/{b, c}, b and c have no limit set,
* or the limit of b + c is greater than the limit of a. Then if
* the task of b does buffer IO first, this will cause the
* available memory of a to be greatly reduced. When the subsequent
* task of c is started, the available memory of a is small,
* resulting in the dirty page waterline of c being very small,
* the IO of c is suppressed and cannot be delivered normally.
* Since the file cache in b is recyclable, we can try to reclaim it,
* so when calculating the headroom, the available memory of a
* includes the file cache of a.
*/
if (memcg != mem_cgroup_from_css(wb->memcg_css)) {
unsigned long file, dirty, writeback;
file = memcg_page_state(memcg, NR_FILE_PAGES);
dirty = memcg_page_state(memcg, NR_FILE_DIRTY);
writeback = memcg_page_state(memcg, NR_WRITEBACK);
used -= file - dirty - writeback;
}
*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
memcg = parent;
}
@ -6200,6 +6222,78 @@ static int mem_cgroup_vmstat_read(struct seq_file *m, void *vv)
return mem_cgroup_vmstat_read_comm(m, vv, memcg);
}
/*
 * mem_cgroup_bind_blkio_write - set or clear the blkio cgroup bound to a memcg.
 * @of:     kernfs open file for the "bind_blkio" control file
 * @buf:    user-supplied blkio cgroup path; an empty string clears the binding
 * @nbytes: number of bytes written by userspace
 * @off:    file offset (unused)
 *
 * Resolves @buf to an online css of the io (blkio) controller and stores it
 * in memcg->bind_blkio, together with a private copy of the path for later
 * display.  Any previous binding is dropped first, and the memcg's existing
 * writeback structures are retired so new ones are created against the new
 * blkcg.  Serialized by memcg_max_mutex.
 *
 * Returns @nbytes on success, a negative errno on failure.
 */
static ssize_t mem_cgroup_bind_blkio_write(struct kernfs_open_file *of,
					   char *buf, size_t nbytes, loff_t off)
{
	struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
	struct cgroup_subsys_state *css;
	struct path path;
	char *pbuf;
	int ret;

	/* Binding is only meaningful while RUE IO isolation is enabled. */
	if (!rue_io_enabled())
		return -EPERM;

	buf = strstrip(buf);

	/* alloc memory outside mutex */
	pbuf = kzalloc(PATH_MAX, GFP_KERNEL);
	if (!pbuf)
		return -ENOMEM;
	/*
	 * NOTE(review): strscpy() already NUL-terminates within the given
	 * size, so PATH_MAX (not PATH_MAX - 1) would be the conventional
	 * bound here — confirm the off-by-one is intentional.
	 */
	strscpy(pbuf, buf, PATH_MAX - 1);

	mutex_lock(&memcg_max_mutex);
	/*
	 * Drop any existing binding first: free the stored path, release the
	 * css reference, and retire writebacks created against the old blkcg
	 * so the cgwb machinery starts fresh for the new binding (or for the
	 * unbound state).
	 */
	if (memcg->bind_blkio) {
		WARN_ON(!memcg->bind_blkio_path);
		kfree(memcg->bind_blkio_path);
		memcg->bind_blkio_path = NULL;
		css_put(memcg->bind_blkio);
		memcg->bind_blkio = NULL;
		wb_memcg_offline(memcg);
		/* wb_memcg_offline() emptied the list; make it reusable. */
		INIT_LIST_HEAD(&memcg->cgwb_list);
	}

	/* An empty write means "unbind only" — done at this point. */
	if (!strnlen(buf, PATH_MAX)) {
		mutex_unlock(&memcg_max_mutex);
		kfree(pbuf);
		return nbytes;
	}

	/* Resolve the path and take a reference on an *online* io css. */
	ret = kern_path(pbuf, LOOKUP_FOLLOW, &path);
	if (ret)
		goto err;
	css = css_tryget_online_from_dir(path.dentry, &io_cgrp_subsys);
	if (IS_ERR(css)) {
		ret = PTR_ERR(css);
		path_put(&path);
		goto err;
	}
	path_put(&path);

	/* Ownership of pbuf transfers to memcg->bind_blkio_path here. */
	memcg->bind_blkio_path = pbuf;
	memcg->bind_blkio = css;

	mutex_unlock(&memcg_max_mutex);
	return nbytes;

err:
	kfree(pbuf);
	mutex_unlock(&memcg_max_mutex);
	return ret;
}
/*
 * mem_cgroup_bind_blkio_show - display the bound blkio cgroup path.
 * @m: seq_file backing the "bind_blkio" control file
 * @v: unused
 *
 * Prints the stored binding path followed by a newline; produces no
 * output when the memcg has no blkio binding.  Always returns 0.
 */
static int mem_cgroup_bind_blkio_show(struct seq_file *m, void *v)
{
	struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
	const char *bound = memcg->bind_blkio_path;

	if (bound)
		seq_printf(m, "%s\n", bound);

	return 0;
}
static u64 memory_current_read(struct cgroup_subsys_state *css,
struct cftype *cft);
static int memory_low_show(struct seq_file *m, void *v);
@ -7212,6 +7306,12 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
static_branch_dec(&memcg_bpf_enabled_key);
#endif
if (memcg->bind_blkio) {
WARN_ON(!memcg->bind_blkio_path);
kfree(memcg->bind_blkio_path);
css_put(memcg->bind_blkio);
}
vmpressure_cleanup(&memcg->vmpressure);
cancel_work_sync(&memcg->high_work);
cancel_work_sync(&memcg->async_work);
@ -9679,6 +9779,12 @@ static struct cftype memsw_files[] = {
.write = mem_cgroup_reset,
.read_u64 = mem_cgroup_read_u64,
},
{
.name = "bind_blkio",
.flags = CFTYPE_NOT_ON_ROOT,
.write = mem_cgroup_bind_blkio_write,
.seq_show = mem_cgroup_bind_blkio_show,
},
{ }, /* terminate */
};