rue/io: add buffer IO writeback throtl for cgroup v1

Add a buffered IO writeback throttle for cgroup v1 based on the dirty
throttle. Since the actual IO speed is not taken into account, this
approach may let dirty pages keep accumulating when IO performance is
the bottleneck, which degrades the isolation effect.
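Concretely, the throttle reuses the dirty-throttle feedback loop: each
cgroup samples how fast it dirties pages and rescales its per-cgroup
dirty_ratelimit by the ratio of the configured buffered_write_bps to
the observed dirty rate, capped at that bps. A condensed sketch of the
adjustment, taken from the blkcg_update_dirty_ratelimit() body this
patch moves out of mm/page-writeback.c (simplified; locking and the
bw_time_stamp sampling window are omitted):

/* Condensed sketch of the per-cgroup feedback loop in this patch. */
static void blkcg_dirty_ratelimit_sketch(struct blkcg *blkcg,
					 unsigned long dirtied,
					 unsigned long elapsed)
{
	unsigned long long bps = blkcg_buffered_write_bps(blkcg);
	unsigned long long ratelimit;
	unsigned long dirty_rate;

	/* pages dirtied per second since the previous sample */
	dirty_rate = (dirtied - blkcg->dirtied_stamp) * HZ / elapsed;

	/* scale the current limit by target-bps / observed-rate ... */
	ratelimit = blkcg->dirty_ratelimit;
	ratelimit *= div_u64(bps, dirty_rate + 1);
	/* ... but never above the configured bps */
	ratelimit = min(ratelimit, bps);
	ratelimit >>= PAGE_SHIFT;

	/* smooth: average the old and new estimates */
	blkcg->dirty_ratelimit = (blkcg->dirty_ratelimit + ratelimit) / 2 + 1;
}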

Note:
struct blkcg moved from block/blk-cgroup.h to
include/linux/blk-cgroup.h

Signed-off-by: Chunguang Xu <brookxu@tencent.com>
Signed-off-by: Lenny Chen <lennychen@tencent.com>
Signed-off-by: Haisu Wang <haisuwang@tencent.com>
Author: Haisu Wang
Date:   2023-09-16 02:50:46 +08:00
parent: 286c5f95c6
commit: 1860b51781
7 changed files with 137 additions and 123 deletions


@ -31,6 +31,7 @@
#include <linux/part_stat.h>
#include <linux/percpu.h>
#include <linux/percpu_counter.h>
#include <linux/rue.h>
#include "blk.h"
#include "blk-cgroup.h"
#include "blk-ioprio.h"
@ -54,6 +55,8 @@ EXPORT_SYMBOL_GPL(blkcg_root);
struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
EXPORT_SYMBOL_GPL(blkcg_root_css);
struct rue_io_module_ops rue_io_ops;
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */


@ -91,50 +91,6 @@ struct blkcg_gq {
struct rcu_head rcu_head;
};
struct blkcg {
struct cgroup_subsys_state css;
spinlock_t lock;
refcount_t online_pin;
struct radix_tree_root blkg_tree;
struct blkcg_gq __rcu *blkg_hint;
struct hlist_head blkg_list;
struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
struct list_head all_blkcgs_node;
/*
* List of updated percpu blkg_iostat_set's since the last flush.
*/
struct llist_head __percpu *lhead;
#ifdef CONFIG_BLK_CGROUP_FC_APPID
char fc_app_id[FC_APPID_LEN];
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head cgwb_list;
#endif
#ifdef CONFIG_BLK_CGROUP_DISKSTATS
unsigned int dkstats_on;
struct list_head dkstats_list;
struct blkcg_dkstats *dkstats_hint;
#endif
#ifdef CONFIG_BLK_DEV_THROTTLING_CGROUP_V1
struct percpu_counter nr_dirtied;
unsigned long bw_time_stamp;
unsigned long dirtied_stamp;
unsigned long dirty_ratelimit;
unsigned long long buffered_write_bps;
#endif
KABI_RESERVE(1);
KABI_RESERVE(2);
KABI_RESERVE(3);
KABI_RESERVE(4);
};
static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct blkcg, css) : NULL;
@ -566,33 +522,6 @@ static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio)
void blk_cgroup_bio_start(struct bio *bio);
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
#ifdef CONFIG_BLK_DEV_THROTTLING_CGROUP_V1
static inline uint64_t blkcg_buffered_write_bps(struct blkcg *blkcg)
{
return blkcg->buffered_write_bps;
}
static inline unsigned long blkcg_dirty_ratelimit(struct blkcg *blkcg)
{
return blkcg->dirty_ratelimit;
}
static inline struct blkcg *get_task_blkcg(struct task_struct *tsk)
{
struct cgroup_subsys_state *css;
rcu_read_lock();
do {
css = kthread_blkcg();
if (!css)
css = task_css(tsk, io_cgrp_id);
} while (!css_tryget(css));
rcu_read_unlock();
return container_of(css, struct blkcg, css);
}
#endif
#else /* CONFIG_BLK_CGROUP */
struct blkg_policy_data {


@ -2286,7 +2286,7 @@ again:
* Parent tg has already linked when throtl_hierarchy enabled.
* But if io_qos disabled, skip check parent and fire directly.
*/
if (!tg || (!sysctl_io_qos_enabled && throtl_hierarchy_enabled())) {
if (!tg || (!rue_io_enabled() && throtl_hierarchy_enabled())) {
bio_set_flag(bio, BIO_BPS_THROTTLED);
goto out_unlock;
}


@ -15,6 +15,8 @@
*/
#include <linux/types.h>
#include <linux/cgroup.h>
#include <linux/rue.h>
struct bio;
struct cgroup_subsys_state;
@ -33,6 +35,92 @@ void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css);
struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css);
struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio);
/*
 * Upstream commit dec223c92a4 ("blk-cgroup: move struct blkcg to
 * block/blk-cgroup.h") moved struct blkcg into block/blk-cgroup.h.
 * Move it back out here so the RUE module can use it.
 */
struct blkcg {
struct cgroup_subsys_state css;
spinlock_t lock;
refcount_t online_pin;
struct radix_tree_root blkg_tree;
struct blkcg_gq __rcu *blkg_hint;
struct hlist_head blkg_list;
struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
struct list_head all_blkcgs_node;
/*
* List of updated percpu blkg_iostat_set's since the last flush.
*/
struct llist_head __percpu *lhead;
#ifdef CONFIG_BLK_CGROUP_FC_APPID
char fc_app_id[FC_APPID_LEN];
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
struct list_head cgwb_list;
#endif
#ifdef CONFIG_BLK_CGROUP_DISKSTATS
unsigned int dkstats_on;
struct list_head dkstats_list;
struct blkcg_dkstats *dkstats_hint;
#endif
#ifdef CONFIG_BLK_DEV_THROTTLING_CGROUP_V1
struct percpu_counter nr_dirtied;
unsigned long bw_time_stamp;
unsigned long dirtied_stamp;
unsigned long dirty_ratelimit;
unsigned long long buffered_write_bps;
#endif
KABI_RESERVE(1);
KABI_RESERVE(2);
KABI_RESERVE(3);
KABI_RESERVE(4);
};
struct rue_io_module_ops {
void (*blkcg_update_bandwidth)(struct blkcg *blkcg);
KABI_RESERVE(1);
KABI_RESERVE(2);
KABI_RESERVE(3);
KABI_RESERVE(4);
};
extern struct rue_io_module_ops rue_io_ops;
#ifdef CONFIG_BLK_DEV_THROTTLING_CGROUP_V1
static inline uint64_t blkcg_buffered_write_bps(struct blkcg *blkcg)
{
return blkcg->buffered_write_bps;
}
static inline unsigned long blkcg_dirty_ratelimit(struct blkcg *blkcg)
{
return blkcg->dirty_ratelimit;
}
static inline struct blkcg *get_task_blkcg(struct task_struct *tsk)
{
struct cgroup_subsys_state *css;
rcu_read_lock();
do {
css = kthread_blkcg();
if (!css)
css = task_css(tsk, io_cgrp_id);
} while (!css_tryget(css));
rcu_read_unlock();
return container_of(css, struct blkcg, css);
}
#endif
#else /* CONFIG_BLK_CGROUP */
#define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))


@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/mutex.h>
#include <linux/blkdev.h>
struct rue_ops {
#ifdef CONFIG_CGROUP_NET_CLASSID
@ -14,6 +15,10 @@ struct rue_ops {
#ifdef CONFIG_MEMCG
struct rue_mem_ops *mem;
#endif
#ifdef CONFIG_BLK_CGROUP
struct rue_io_module_ops *io;
#endif
};
extern int sysctl_net_qos_enable;
@ -21,11 +26,16 @@ extern int sysctl_net_qos_enable;
extern int sysctl_vm_memory_qos;
extern struct rue_mem_ops mem_ops;
extern struct rue_io_module_ops io_ops;
extern struct rue_io_module_ops rue_io_ops;
extern bool rue_installed;
extern struct rue_ops *rue_mod_ops;
DECLARE_PER_CPU(long, nr_rue_calls);
extern struct mutex rue_mutex;
int rue_io_enabled(void);
int register_rue_ops(struct rue_ops *ops);
int try_unregister_rue_ops(void);
@ -37,6 +47,10 @@ int try_unregister_rue_ops(void);
#define RUE_MEM_FUNC(ops, func) ops->mem->func /* RUE MEM OPs */
#endif
#ifdef CONFIG_BLK_CGROUP
#define RUE_IO_FUNC(ops, func) ops->io->func /* RUE IO OPs */
#endif
#define RUE_FUNC(subsys, ops, func) RUE_##subsys##_FUNC(ops, func)
#define RUE_CALL_TYPE(subsys, func, retype, ...) \

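For reference, a minimal sketch of how an out-of-tree RUE module is
expected to plug into these hooks. The module, function and variable
names below (rue_sample_init, my_io_ops, etc.) are hypothetical; only
register_rue_ops(), try_unregister_rue_ops() and the ops structures
come from this patch, and the real callback body is the
blkcg_update_bandwidth() logic removed from mm/page-writeback.c:

#include <linux/module.h>
#include <linux/blk-cgroup.h>
#include <linux/rue.h>

/* Hypothetical module-side callback; the real implementation carries
 * the blkcg_update_bandwidth() logic removed from mm/page-writeback.c. */
static void my_blkcg_update_bandwidth(struct blkcg *blkcg)
{
	/* recompute blkcg->dirty_ratelimit from blkcg->buffered_write_bps */
}

static struct rue_io_module_ops my_io_ops = {
	.blkcg_update_bandwidth	= my_blkcg_update_bandwidth,
};

static struct rue_ops my_rue_ops = {
	/* .net and .mem omitted for brevity; check_patch_state() rejects
	 * the registration if an enabled subsystem's ops are missing. */
#ifdef CONFIG_BLK_CGROUP
	.io	= &my_io_ops,
#endif
};

static int __init rue_sample_init(void)
{
	return register_rue_ops(&my_rue_ops);
}

static void __exit rue_sample_exit(void)
{
	try_unregister_rue_ops();
}

module_init(rue_sample_init);
module_exit(rue_sample_exit);
MODULE_LICENSE("GPL");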

@ -4,6 +4,7 @@
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <linux/blk-cgroup.h>
#include <linux/rue.h>
bool rue_installed;
@ -51,6 +52,16 @@ static int check_mem_patch_state(struct rue_ops *ops, bool state)
return 0;
}
static int check_io_patch_state(struct rue_ops *ops, bool state)
{
#ifdef CONFIG_BLK_CGROUP
if (state && !ops->io)
return -EINVAL;
#endif
return 0;
}
static int check_patch_state(struct rue_ops *ops)
{
int ret = 0;
@ -64,6 +75,10 @@ static int check_patch_state(struct rue_ops *ops)
if (ret)
return ret;
ret = check_io_patch_state(ops, state);
if (ret)
return ret;
return 0;
}
@ -115,3 +130,12 @@ out:
return ret;
}
EXPORT_SYMBOL(try_unregister_rue_ops);
/**
 * rue_io_enabled - check whether the RUE IO feature is enabled
 */
int rue_io_enabled(void)
{
return sysctl_io_qos_enabled && READ_ONCE(rue_installed);
}
EXPORT_SYMBOL(rue_io_enabled);


@ -41,8 +41,9 @@
#include <linux/mm_inline.h>
#include <trace/events/writeback.h>
#include <linux/rue.h>
#include "internal.h"
#include "../block/blk-cgroup.h"
/*
* Sleep at most 200ms at a time in balance_dirty_pages().
@ -1642,55 +1643,6 @@ static long wb_min_pause(struct bdi_writeback *wb,
return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
}
#ifdef CONFIG_BLK_DEV_THROTTLING_CGROUP_V1
static void blkcg_update_dirty_ratelimit(struct blkcg *blkcg,
unsigned long dirtied,
unsigned long elapsed)
{
unsigned long long bps = blkcg_buffered_write_bps(blkcg);
unsigned long long ratelimit;
unsigned long dirty_rate;
dirty_rate = (dirtied - blkcg->dirtied_stamp) * HZ;
dirty_rate /= elapsed;
ratelimit = blkcg->dirty_ratelimit;
ratelimit *= div_u64(bps, dirty_rate + 1);
ratelimit = min(ratelimit, bps);
ratelimit >>= PAGE_SHIFT;
blkcg->dirty_ratelimit = (blkcg->dirty_ratelimit + ratelimit) / 2 + 1;
trace_blkcg_dirty_ratelimit(bps, dirty_rate, blkcg->dirty_ratelimit, ratelimit);
}
void blkcg_update_bandwidth(struct blkcg *blkcg)
{
unsigned long now = jiffies;
unsigned long dirtied;
unsigned long elapsed;
if (!blkcg)
return;
if (!spin_trylock(&blkcg->lock))
return;
elapsed = now - blkcg->bw_time_stamp;
dirtied = percpu_counter_read(&blkcg->nr_dirtied);
if (elapsed > MAX_PAUSE * 2)
goto snapshot;
if (elapsed <= MAX_PAUSE)
goto unlock;
blkcg_update_dirty_ratelimit(blkcg, dirtied, elapsed);
snapshot:
blkcg->dirtied_stamp = dirtied;
blkcg->bw_time_stamp = now;
unlock:
spin_unlock(&blkcg->lock);
}
#endif
static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
{
struct bdi_writeback *wb = dtc->wb;
@ -1732,6 +1684,10 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
}
}
#ifdef CONFIG_BLK_DEV_THROTTLING_CGROUP_V1
EXPORT_TRACEPOINT_SYMBOL_GPL(blkcg_dirty_ratelimit);
#endif
/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
@ -1941,7 +1897,7 @@ free_running:
if (blkcg_buffered_write_bps(blkcg) &&
task_ratelimit > blkcg_dirty_ratelimit(blkcg)) {
blkcg_bps:
blkcg_update_bandwidth(blkcg);
RUE_CALL_VOID(IO, blkcg_update_bandwidth, blkcg);
dirty_ratelimit = blkcg_dirty_ratelimit(blkcg);
task_ratelimit = dirty_ratelimit;
}