From ad3fc798800fb7ca04c1dfc439dba946818048d8 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 25 May 2021 17:46:16 +0800 Subject: [PATCH 01/12] md: revert io stats accounting The commit 41d2d848e5c0 ("md: improve io stats accounting") could cause double fault problem per the report [1], and also it is not correct to change ->bi_end_io if md don't own it, so let's revert it. And io stats accounting will be replemented in later commits. [1]. https://lore.kernel.org/linux-raid/3bf04253-3fad-434a-63a7-20214e38cf26@gmail.com/T/#t Fixes: 41d2d848e5c0 ("md: improve io stats accounting") Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 45 --------------------------------------------- drivers/md/md.h | 1 - 2 files changed, 46 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 49f897fbb89b..7ba00e4c862d 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -441,30 +441,6 @@ check_suspended: } EXPORT_SYMBOL(md_handle_request); -struct md_io { - struct mddev *mddev; - bio_end_io_t *orig_bi_end_io; - void *orig_bi_private; - struct block_device *orig_bi_bdev; - unsigned long start_time; -}; - -static void md_end_io(struct bio *bio) -{ - struct md_io *md_io = bio->bi_private; - struct mddev *mddev = md_io->mddev; - - bio_end_io_acct_remapped(bio, md_io->start_time, md_io->orig_bi_bdev); - - bio->bi_end_io = md_io->orig_bi_end_io; - bio->bi_private = md_io->orig_bi_private; - - mempool_free(md_io, &mddev->md_io_pool); - - if (bio->bi_end_io) - bio->bi_end_io(bio); -} - static blk_qc_t md_submit_bio(struct bio *bio) { const int rw = bio_data_dir(bio); @@ -489,21 +465,6 @@ static blk_qc_t md_submit_bio(struct bio *bio) return BLK_QC_T_NONE; } - if (bio->bi_end_io != md_end_io) { - struct md_io *md_io; - - md_io = mempool_alloc(&mddev->md_io_pool, GFP_NOIO); - md_io->mddev = mddev; - md_io->orig_bi_end_io = bio->bi_end_io; - md_io->orig_bi_private = bio->bi_private; - md_io->orig_bi_bdev = bio->bi_bdev; - - bio->bi_end_io = md_end_io; - bio->bi_private = md_io; - - md_io->start_time = bio_start_io_acct(bio); - } - /* bio could be mergeable after passing to underlayer */ bio->bi_opf &= ~REQ_NOMERGE; @@ -5608,7 +5569,6 @@ static void md_free(struct kobject *ko) bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); - mempool_exit(&mddev->md_io_pool); kfree(mddev); } @@ -5705,11 +5665,6 @@ static int md_alloc(dev_t dev, char *name) */ mddev->hold_active = UNTIL_STOP; - error = mempool_init_kmalloc_pool(&mddev->md_io_pool, BIO_POOL_SIZE, - sizeof(struct md_io)); - if (error) - goto abort; - error = -ENOMEM; mddev->queue = blk_alloc_queue(NUMA_NO_NODE); if (!mddev->queue) diff --git a/drivers/md/md.h b/drivers/md/md.h index fb7eab58cfd5..4da240ffe2c5 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -487,7 +487,6 @@ struct mddev { struct bio_set sync_set; /* for sync operations like * metadata and bitmap writes */ - mempool_t md_io_pool; /* Generic flush handling. * The last to finish preflush schedules a worker to submit From 10764815ff4728d2c57da677cd5d3dd6f446cf5f Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 25 May 2021 17:46:17 +0800 Subject: [PATCH 02/12] md: add io accounting for raid0 and raid5 We introduce a new bioset (io_acct_set) for raid0 and raid5 since they don't own clone infrastructure to accounting io. And the bioset is added to mddev instead of to raid0 and raid5 layer, because with this way, we can put common functions to md.h and reuse them in raid0 and raid5. 
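The mechanism behind io_acct_set is the bioset front_pad trick: the set is initialized with front_pad = offsetof(struct md_io_acct, bio_clone) (struct md_io_acct is described in the next paragraph), so every bio cloned from it sits at the tail of a per-I/O md_io_acct and the endio handler can step back to the wrapper with container_of(). A minimal userspace sketch of that pattern, using toy stand-ins rather than the real struct bio/bioset API:

/*
 * Simplified model of the front_pad/container_of layout behind io_acct_set.
 * toy_bio, toy_io_acct and toy_clone_bio are made-up stand-ins, not kernel APIs.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct toy_bio {
	unsigned long sector;
};

/* Mirrors struct md_io_acct: private fields first, the embedded bio last. */
struct toy_io_acct {
	struct toy_bio *orig_bio;
	unsigned long start_time;
	struct toy_bio bio_clone;	/* matches the front_pad passed to bioset_init() */
};

/* The "bioset" hands out a bio that lives at the tail of the bigger object. */
static struct toy_bio *toy_clone_bio(struct toy_bio *orig)
{
	struct toy_io_acct *acct = malloc(sizeof(*acct));

	if (!acct)
		return NULL;
	acct->orig_bio = orig;
	acct->start_time = 12345;	/* stands in for bio_start_io_acct() */
	acct->bio_clone = *orig;
	return &acct->bio_clone;
}

int main(void)
{
	struct toy_bio orig = { .sector = 2048 };
	struct toy_bio *clone = toy_clone_bio(&orig);
	struct toy_io_acct *acct;

	if (!clone)
		return 1;
	/* What md_end_io_acct() does: recover the wrapper from the cloned bio. */
	acct = container_of(clone, struct toy_io_acct, bio_clone);
	printf("start_time=%lu, original sector=%lu\n",
	       acct->start_time, acct->orig_bio->sector);
	free(acct);
	return 0;
}

md_account_bio()/md_end_io_acct() further down in this patch do the same walk on the real types, with bio_start_io_acct()/bio_end_io_acct() supplying the timestamps.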
Also struct md_io_acct is added accordingly which includes io start_time, the origin bio and cloned bio. Then we can call bio_{start,end}_io_acct to get related io status. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 51 +++++++++++++++++++++++++++++++++++++++++++--- drivers/md/md.h | 8 ++++++++ drivers/md/raid0.c | 3 +++ drivers/md/raid5.c | 9 ++++++++ 4 files changed, 68 insertions(+), 3 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 7ba00e4c862d..843e13666e3f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2340,7 +2340,8 @@ int md_integrity_register(struct mddev *mddev) bdev_get_integrity(reference->bdev)); pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); - if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) { + if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || + bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE)) { pr_err("md: failed to create integrity pool for %s\n", mdname(mddev)); return -EINVAL; @@ -5569,6 +5570,7 @@ static void md_free(struct kobject *ko) bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); + bioset_exit(&mddev->io_acct_set); kfree(mddev); } @@ -5862,7 +5864,13 @@ int md_run(struct mddev *mddev) if (!bioset_initialized(&mddev->sync_set)) { err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (err) - return err; + goto exit_bio_set; + } + if (!bioset_initialized(&mddev->io_acct_set)) { + err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, + offsetof(struct md_io_acct, bio_clone), 0); + if (err) + goto exit_sync_set; } spin_lock(&pers_lock); @@ -5990,6 +5998,7 @@ int md_run(struct mddev *mddev) blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); else blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); + blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); } if (pers->sync_request) { if (mddev->kobj.sd && @@ -6039,8 +6048,11 @@ bitmap_abort: module_put(pers->owner); md_bitmap_destroy(mddev); abort: - bioset_exit(&mddev->bio_set); + bioset_exit(&mddev->io_acct_set); +exit_sync_set: bioset_exit(&mddev->sync_set); +exit_bio_set: + bioset_exit(&mddev->bio_set); return err; } EXPORT_SYMBOL_GPL(md_run); @@ -6264,6 +6276,7 @@ void md_stop(struct mddev *mddev) __md_stop(mddev); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); + bioset_exit(&mddev->io_acct_set); } EXPORT_SYMBOL_GPL(md_stop); @@ -8568,6 +8581,38 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, } EXPORT_SYMBOL_GPL(md_submit_discard_bio); +static void md_end_io_acct(struct bio *bio) +{ + struct md_io_acct *md_io_acct = bio->bi_private; + struct bio *orig_bio = md_io_acct->orig_bio; + + orig_bio->bi_status = bio->bi_status; + + bio_end_io_acct(orig_bio, md_io_acct->start_time); + bio_put(bio); + bio_endio(orig_bio); +} + +/* used by personalities (raid0 and raid5) to account io stats */ +void md_account_bio(struct mddev *mddev, struct bio **bio) +{ + struct md_io_acct *md_io_acct; + struct bio *clone; + + if (!blk_queue_io_stat((*bio)->bi_bdev->bd_disk->queue)) + return; + + clone = bio_clone_fast(*bio, GFP_NOIO, &mddev->io_acct_set); + md_io_acct = container_of(clone, struct md_io_acct, bio_clone); + md_io_acct->orig_bio = *bio; + md_io_acct->start_time = bio_start_io_acct(*bio); + + clone->bi_end_io = md_end_io_acct; + clone->bi_private = md_io_acct; + *bio = clone; +} +EXPORT_SYMBOL_GPL(md_account_bio); + /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes * may proceed without blocking. 
It is important to call this before diff --git a/drivers/md/md.h b/drivers/md/md.h index 4da240ffe2c5..4191f22acce4 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -487,6 +487,7 @@ struct mddev { struct bio_set sync_set; /* for sync operations like * metadata and bitmap writes */ + struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */ /* Generic flush handling. * The last to finish preflush schedules a worker to submit @@ -683,6 +684,12 @@ struct md_thread { void *private; }; +struct md_io_acct { + struct bio *orig_bio; + unsigned long start_time; + struct bio bio_clone; +}; + #define THREAD_WAKEUP 0 static inline void safe_put_page(struct page *p) @@ -714,6 +721,7 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size); +void md_account_bio(struct mddev *mddev, struct bio **bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index e5d7411cba9b..62c8b6adac70 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -546,6 +546,9 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) bio = split; } + if (bio->bi_pool != &mddev->bio_set) + md_account_bio(mddev, &bio); + orig_sector = sector; zone = find_zone(mddev->private, §or); switch (conf->layout) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 841e1c1aa5e6..58e9dbc0f683 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5468,6 +5468,7 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) sector_t sector = raid_bio->bi_iter.bi_sector; unsigned chunk_sects = mddev->chunk_sectors; unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); + struct r5conf *conf = mddev->private; if (sectors < bio_sectors(raid_bio)) { struct r5conf *conf = mddev->private; @@ -5477,6 +5478,9 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) raid_bio = split; } + if (raid_bio->bi_pool != &conf->bio_split) + md_account_bio(mddev, &raid_bio); + if (!raid5_read_one_chunk(mddev, raid_bio)) return raid_bio; @@ -5756,6 +5760,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) DEFINE_WAIT(w); bool do_prepare; bool do_flush = false; + bool do_clone = false; if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = log_handle_flush_request(conf, bi); @@ -5784,6 +5789,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) if (rw == READ && mddev->degraded == 0 && mddev->reshape_position == MaxSector) { bi = chunk_aligned_read(mddev, bi); + do_clone = true; if (!bi) return true; } @@ -5798,6 +5804,9 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) last_sector = bio_end_sector(bi); bi->bi_next = NULL; + if (!do_clone) + md_account_bio(mddev, &bi); + prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { int previous; From c82aa1b76787c34fd02374e519b6f52cdeb2f54b Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 25 May 2021 17:46:18 +0800 Subject: [PATCH 03/12] md/raid5: move checking badblock before clone bio in raid5_read_one_chunk We don't need to clone bio if the relevant region has badblock. 
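Put differently, the badblock lookup is cheap and can fail the request on its own, so checking it before bio_clone_fast() means there is no freshly cloned bio to bio_put() on the error path. A stand-alone sketch of that ordering (made-up names, nothing md-specific):

/* Toy illustration of "reject on bad blocks before paying for the clone". */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_req {
	long sector;
	int nsectors;
};

/* Stands in for is_badblock() consulting the rdev bad-block table. */
static bool range_has_badblock(long sector, int nsectors)
{
	return sector < 0 || nsectors <= 0;
}

static struct toy_req *clone_req(const struct toy_req *orig)
{
	struct toy_req *c = malloc(sizeof(*c));

	if (c)
		*c = *orig;
	return c;
}

/* After the patch: check first, clone only when the read can proceed. */
static struct toy_req *try_aligned_read(const struct toy_req *orig)
{
	if (range_has_badblock(orig->sector, orig->nsectors))
		return NULL;		/* nothing allocated, nothing to undo */
	return clone_req(orig);
}

int main(void)
{
	struct toy_req r = { .sector = 2048, .nsectors = 8 };
	struct toy_req *c = try_aligned_read(&r);

	printf("aligned read %s\n", c ? "cloned" : "skipped");
	free(c);
	return 0;
}

The check itself is unchanged apart from passing bio_sectors(raid_bio) instead of bio_sectors(align_bio); the two are equal because align_bio is a clone of raid_bio.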
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 58e9dbc0f683..5a05277f4be7 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5427,6 +5427,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) atomic_inc(&rdev->nr_pending); rcu_read_unlock(); + if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad, + &bad_sectors)) { + bio_put(raid_bio); + rdev_dec_pending(rdev, mddev); + return 0; + } + align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set); bio_set_dev(align_bio, rdev->bdev); align_bio->bi_end_io = raid5_align_endio; @@ -5435,13 +5442,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) raid_bio->bi_next = (void *)rdev; - if (is_badblock(rdev, sector, bio_sectors(align_bio), &first_bad, - &bad_sectors)) { - bio_put(align_bio); - rdev_dec_pending(rdev, mddev); - return 0; - } - /* No reshape active, so we can trust rdev->data_offset */ align_bio->bi_iter.bi_sector += rdev->data_offset; From 1147f58e1010b8688bac1fd3bbab753b1379291d Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 25 May 2021 17:46:19 +0800 Subject: [PATCH 04/12] md/raid5: avoid redundant bio clone in raid5_read_one_chunk After enable io accounting, chunk read bio could be cloned twice which is not good. To avoid such inefficiency, let's clone align_bio from io_acct_set too, then we need only call md_account_bio in make_request unconditionally. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5a05277f4be7..f83623ac8c34 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5364,11 +5364,13 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf, */ static void raid5_align_endio(struct bio *bi) { - struct bio* raid_bi = bi->bi_private; + struct md_io_acct *md_io_acct = bi->bi_private; + struct bio *raid_bi = md_io_acct->orig_bio; struct mddev *mddev; struct r5conf *conf; struct md_rdev *rdev; blk_status_t error = bi->bi_status; + unsigned long start_time = md_io_acct->start_time; bio_put(bi); @@ -5380,6 +5382,8 @@ static void raid5_align_endio(struct bio *bi) rdev_dec_pending(rdev, conf->mddev); if (!error) { + if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue)) + bio_end_io_acct(raid_bi, start_time); bio_endio(raid_bi); if (atomic_dec_and_test(&conf->active_aligned_reads)) wake_up(&conf->wait_for_quiescent); @@ -5398,6 +5402,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) struct md_rdev *rdev; sector_t sector, end_sector, first_bad; int bad_sectors, dd_idx; + struct md_io_acct *md_io_acct; if (!in_chunk_boundary(mddev, raid_bio)) { pr_debug("%s: non aligned\n", __func__); @@ -5434,14 +5439,18 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) return 0; } - align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->bio_set); + align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->io_acct_set); + md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone); + raid_bio->bi_next = (void *)rdev; + if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue)) + md_io_acct->start_time = bio_start_io_acct(raid_bio); + md_io_acct->orig_bio = raid_bio; + bio_set_dev(align_bio, rdev->bdev); align_bio->bi_end_io = raid5_align_endio; - 
align_bio->bi_private = raid_bio; + align_bio->bi_private = md_io_acct; align_bio->bi_iter.bi_sector = sector; - raid_bio->bi_next = (void *)rdev; - /* No reshape active, so we can trust rdev->data_offset */ align_bio->bi_iter.bi_sector += rdev->data_offset; @@ -5468,7 +5477,6 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) sector_t sector = raid_bio->bi_iter.bi_sector; unsigned chunk_sects = mddev->chunk_sectors; unsigned sectors = chunk_sects - (sector & (chunk_sects-1)); - struct r5conf *conf = mddev->private; if (sectors < bio_sectors(raid_bio)) { struct r5conf *conf = mddev->private; @@ -5478,9 +5486,6 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio) raid_bio = split; } - if (raid_bio->bi_pool != &conf->bio_split) - md_account_bio(mddev, &raid_bio); - if (!raid5_read_one_chunk(mddev, raid_bio)) return raid_bio; @@ -5760,7 +5765,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) DEFINE_WAIT(w); bool do_prepare; bool do_flush = false; - bool do_clone = false; if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret = log_handle_flush_request(conf, bi); @@ -5789,7 +5793,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) if (rw == READ && mddev->degraded == 0 && mddev->reshape_position == MaxSector) { bi = chunk_aligned_read(mddev, bi); - do_clone = true; if (!bi) return true; } @@ -5804,9 +5807,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) last_sector = bio_end_sector(bi); bi->bi_next = NULL; - if (!do_clone) - md_account_bio(mddev, &bi); - + md_account_bio(mddev, &bi); prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { int previous; From 9b8ae7b938235229ccb112c4e887ff1bcc232836 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 25 May 2021 17:46:20 +0800 Subject: [PATCH 05/12] md/raid1: rename print_msg with r1bio_existed The caller of raid1_read_request could pass NULL or a valid pointer for "struct r1bio *r1_bio", so it actually means whether r1_bio is existed or not. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid1.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index ced076ba560e..696da6b8b7ed 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1210,7 +1210,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, const unsigned long do_sync = (bio->bi_opf & REQ_SYNC); int max_sectors; int rdisk; - bool print_msg = !!r1_bio; + bool r1bio_existed = !!r1_bio; char b[BDEVNAME_SIZE]; /* @@ -1220,7 +1220,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, */ gfp_t gfp = r1_bio ? 
(GFP_NOIO | __GFP_HIGH) : GFP_NOIO; - if (print_msg) { + if (r1bio_existed) { /* Need to get the block device name carefully */ struct md_rdev *rdev; rcu_read_lock(); @@ -1252,7 +1252,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, if (rdisk < 0) { /* couldn't find anywhere to read from */ - if (print_msg) { + if (r1bio_existed) { pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n", mdname(mddev), b, @@ -1263,7 +1263,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, } mirror = conf->mirrors + rdisk; - if (print_msg) + if (r1bio_existed) pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n", mdname(mddev), (unsigned long long)r1_bio->sector, From a0159832e51e3af03b89ecc5d6b9db451e529b5f Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 25 May 2021 17:46:21 +0800 Subject: [PATCH 06/12] md/raid1: enable io accounting For raid1, we record the start time between split bio and clone bio, and finish the accounting in the final endio. Also introduce start_time in r1bio accordingly. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid1.c | 7 +++++++ drivers/md/raid1.h | 1 + 2 files changed, 8 insertions(+) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 696da6b8b7ed..51f2547c2007 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -300,6 +300,8 @@ static void call_bio_endio(struct r1bio *r1_bio) if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) bio->bi_status = BLK_STS_IOERR; + if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + bio_end_io_acct(bio, r1_bio->start_time); bio_endio(bio); } @@ -1292,6 +1294,9 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, r1_bio->read_disk = rdisk; + if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + r1_bio->start_time = bio_start_io_acct(bio); + read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set); r1_bio->bios[rdisk] = read_bio; @@ -1461,6 +1466,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, r1_bio->sectors = max_sectors; } + if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + r1_bio->start_time = bio_start_io_acct(bio); atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index b7eb09e8c025..ccf10e59b116 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -158,6 +158,7 @@ struct r1bio { sector_t sector; int sectors; unsigned long state; + unsigned long start_time; struct mddev *mddev; /* * original bio going to /dev/mdx From 528bc2cf2fccef2c2c17263f9932094bf81fee5a Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 25 May 2021 17:46:22 +0800 Subject: [PATCH 07/12] md/raid10: enable io accounting For raid10, we record the start time between split bio and clone bio, and finish the accounting in the final endio. Also introduce start_time in r10bio accordingly. 
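Both the raid1 and raid10 conversions use the same bracket: take one timestamp on the original bio before it is split and cloned per mirror, stash it in the per-request structure (r1bio/r10bio), and account the elapsed time exactly once when the last sub-I/O finishes and the original bio is ended. A userspace sketch of that shape, with toy names standing in for the r1bio/r10bio fields and for jiffies:

/*
 * Toy model of "start accounting once, finish it on the last completion".
 * toy_rbio, toy_jiffies and toy_end_subio are made-up stand-ins.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>

struct toy_rbio {
	unsigned long start_time;	/* models r1_bio/r10_bio->start_time */
	atomic_int remaining;		/* models the per-mirror "remaining" count */
};

static unsigned long toy_jiffies(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long)(ts.tv_sec * 1000 + ts.tv_nsec / 1000000);
}

/* Called from each sub-I/O completion; only the last caller accounts. */
static void toy_end_subio(struct toy_rbio *rb)
{
	if (atomic_fetch_sub(&rb->remaining, 1) == 1)
		printf("accounted %lu ms against the original bio\n",
		       toy_jiffies() - rb->start_time);
}

int main(void)
{
	struct toy_rbio rb;
	int mirrors = 2;

	rb.start_time = toy_jiffies();		/* bio_start_io_acct() on the original bio */
	atomic_init(&rb.remaining, mirrors);
	for (int i = 0; i < mirrors; i++)	/* pretend the per-mirror clones complete */
		toy_end_subio(&rb);
	return 0;
}

In the real hunks both ends of the bracket are guarded by blk_queue_io_stat(), so arrays with per-queue iostats disabled skip the bookkeeping entirely.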
Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid10.c | 6 ++++++ drivers/md/raid10.h | 1 + 2 files changed, 7 insertions(+) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 13f5e6b2a73d..16977e8e075d 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -297,6 +297,8 @@ static void raid_end_bio_io(struct r10bio *r10_bio) if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) bio->bi_status = BLK_STS_IOERR; + if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + bio_end_io_acct(bio, r10_bio->start_time); bio_endio(bio); /* * Wake up any possible resync thread that waits for the device @@ -1184,6 +1186,8 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, } slot = r10_bio->read_slot; + if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + r10_bio->start_time = bio_start_io_acct(bio); read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set); r10_bio->devs[slot].bio = read_bio; @@ -1483,6 +1487,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->master_bio = bio; } + if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue)) + r10_bio->start_time = bio_start_io_acct(bio); atomic_set(&r10_bio->remaining, 1); md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 1461fd55311b..c34bb196790e 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -124,6 +124,7 @@ struct r10bio { sector_t sector; /* virtual sector number */ int sectors; unsigned long state; + unsigned long start_time; struct mddev *mddev; /* * original bio going to /dev/mdx From 608f52e30aae7dc8da836e5b7b112d50a2d00e43 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 25 May 2021 17:46:23 +0800 Subject: [PATCH 08/12] md: mark some personalities as deprecated Mark the three personalities (linear, fault and multipath) as deprecated because: 1. people can use dm multipath or nvme multipath. 2. linear is already deprecated in MODULE_ALIAS. 3. no one actively using fault. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/Kconfig | 6 +++--- drivers/md/md-faulty.c | 2 +- drivers/md/md-linear.c | 2 +- drivers/md/md-multipath.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index f2014385d48b..0602e82a9516 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -47,7 +47,7 @@ config MD_AUTODETECT If unsure, say Y. config MD_LINEAR - tristate "Linear (append) mode" + tristate "Linear (append) mode (deprecated)" depends on BLK_DEV_MD help If you say Y here, then your multiple devices driver will be able to @@ -158,7 +158,7 @@ config MD_RAID456 If unsure, say Y. config MD_MULTIPATH - tristate "Multipath I/O support" + tristate "Multipath I/O support (deprecated)" depends on BLK_DEV_MD help MD_MULTIPATH provides a simple multi-path personality for use @@ -169,7 +169,7 @@ config MD_MULTIPATH If unsure, say N. 
config MD_FAULTY - tristate "Faulty test module for MD" + tristate "Faulty test module for MD (deprecated)" depends on BLK_DEV_MD help The "faulty" module allows for a block device that occasionally returns diff --git a/drivers/md/md-faulty.c b/drivers/md/md-faulty.c index fda4cb3f936f..c0dc6f2ef4a3 100644 --- a/drivers/md/md-faulty.c +++ b/drivers/md/md-faulty.c @@ -357,7 +357,7 @@ static void raid_exit(void) module_init(raid_init); module_exit(raid_exit); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Fault injection personality for MD"); +MODULE_DESCRIPTION("Fault injection personality for MD (deprecated)"); MODULE_ALIAS("md-personality-10"); /* faulty */ MODULE_ALIAS("md-faulty"); MODULE_ALIAS("md-level--5"); diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 63ed8329a98d..1ff51647a682 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -312,7 +312,7 @@ static void linear_exit (void) module_init(linear_init); module_exit(linear_exit); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Linear device concatenation personality for MD"); +MODULE_DESCRIPTION("Linear device concatenation personality for MD (deprecated)"); MODULE_ALIAS("md-personality-1"); /* LINEAR - deprecated*/ MODULE_ALIAS("md-linear"); MODULE_ALIAS("md-level--1"); diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c index 776bbe542db5..e7d6486f090f 100644 --- a/drivers/md/md-multipath.c +++ b/drivers/md/md-multipath.c @@ -471,7 +471,7 @@ static void __exit multipath_exit (void) module_init(multipath_init); module_exit(multipath_exit); MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("simple multi-path personality for MD"); +MODULE_DESCRIPTION("simple multi-path personality for MD (deprecated)"); MODULE_ALIAS("md-personality-7"); /* MULTIPATH */ MODULE_ALIAS("md-multipath"); MODULE_ALIAS("md-level--4"); From c32dc04059c79ddb4f7cff94ad5de6e92ea2218d Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Sat, 29 May 2021 12:30:49 +0200 Subject: [PATCH 09/12] md: Constify attribute_group structs The attribute_group structs are never modified, they're only passed to sysfs_create_group() and sysfs_remove_group(). Make them const to allow the compiler to put them in read-only memory. Signed-off-by: Rikard Falkeborn Signed-off-by: Song Liu --- drivers/md/md-bitmap.c | 2 +- drivers/md/md.c | 6 +++--- drivers/md/md.h | 4 ++-- drivers/md/raid5.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index ea3130e11680..e29c6298ef5c 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2616,7 +2616,7 @@ static struct attribute *md_bitmap_attrs[] = { &max_backlog_used.attr, NULL }; -struct attribute_group md_bitmap_group = { +const struct attribute_group md_bitmap_group = { .name = "bitmap", .attrs = md_bitmap_attrs, }; diff --git a/drivers/md/md.c b/drivers/md/md.c index 843e13666e3f..32abcfb8bcad 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -785,7 +785,7 @@ out_free_new: return ERR_PTR(error); } -static struct attribute_group md_redundancy_group; +static const struct attribute_group md_redundancy_group; void mddev_unlock(struct mddev *mddev) { @@ -802,7 +802,7 @@ void mddev_unlock(struct mddev *mddev) * test it under the same mutex to ensure its correct value * is seen. 
*/ - struct attribute_group *to_remove = mddev->to_remove; + const struct attribute_group *to_remove = mddev->to_remove; mddev->to_remove = NULL; mddev->sysfs_active = 1; mutex_unlock(&mddev->reconfig_mutex); @@ -5500,7 +5500,7 @@ static struct attribute *md_redundancy_attrs[] = { &md_degraded.attr, NULL, }; -static struct attribute_group md_redundancy_group = { +static const struct attribute_group md_redundancy_group = { .name = NULL, .attrs = md_redundancy_attrs, }; diff --git a/drivers/md/md.h b/drivers/md/md.h index 4191f22acce4..b9b7d2f992f3 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -481,7 +481,7 @@ struct mddev { atomic_t max_corr_read_errors; /* max read retries */ struct list_head all_mddevs; - struct attribute_group *to_remove; + const struct attribute_group *to_remove; struct bio_set bio_set; struct bio_set sync_set; /* for sync operations like @@ -613,7 +613,7 @@ struct md_sysfs_entry { ssize_t (*show)(struct mddev *, char *); ssize_t (*store)(struct mddev *, const char *, size_t); }; -extern struct attribute_group md_bitmap_group; +extern const struct attribute_group md_bitmap_group; static inline struct kernfs_node *sysfs_get_dirent_safe(struct kernfs_node *sd, char *name) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f83623ac8c34..0ee9aa0113f3 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6940,7 +6940,7 @@ static struct attribute *raid5_attrs[] = { &ppl_write_hint.attr, NULL, }; -static struct attribute_group raid5_attrs_group = { +static const struct attribute_group raid5_attrs_group = { .name = NULL, .attrs = raid5_attrs, }; From daee2024715ddf430a069c0c4eab8417146934cf Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 3 Jun 2021 17:21:06 +0800 Subject: [PATCH 10/12] md: check level before create and exit io_acct_set The bio_set (io_acct_set) is used by personalities to clone bio and trace the timestamp of bio. Some personalities such as raid1/10 don't need the bio_set, so add check to not create it unconditionally. Also update the comment for md_account_bio to make it more clear. 
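The level test that implements this is repeated at each io_acct_set create/exit site in the hunks below (md_integrity_register(), md_run() and its error path, md_free() and md_stop()). Purely to spell the rule out, not as code from the patch, the condition reduces to:

/* Hypothetical helper, not part of the patch: which levels use io_acct_set. */
#include <stdbool.h>
#include <stdio.h>

/*
 * raid1 and raid10 clone bios themselves and keep start_time in their own
 * r1bio/r10bio, so they never allocate from io_acct_set.
 */
static bool level_needs_io_acct_set(int level)
{
	return level != 1 && level != 10;
}

int main(void)
{
	const int levels[] = { 0, 1, 4, 5, 6, 10 };

	for (unsigned int i = 0; i < sizeof(levels) / sizeof(levels[0]); i++)
		printf("raid%d: io_acct_set %s\n", levels[i],
		       level_needs_io_acct_set(levels[i]) ? "created" : "skipped");
	return 0;
}

raid0 and the raid456 levels keep using the bioset through md_account_bio(); raid1/raid10 never touch it because patches 06 and 07 track start_time in r1bio/r10bio instead.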
Suggested-by: Christoph Hellwig Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 32abcfb8bcad..56b606184c87 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2341,7 +2341,8 @@ int md_integrity_register(struct mddev *mddev) pr_debug("md: data integrity enabled on %s\n", mdname(mddev)); if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || - bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE)) { + (mddev->level != 1 && mddev->level != 10 && + bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) { pr_err("md: failed to create integrity pool for %s\n", mdname(mddev)); return -EINVAL; @@ -5570,7 +5571,8 @@ static void md_free(struct kobject *ko) bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); - bioset_exit(&mddev->io_acct_set); + if (mddev->level != 1 && mddev->level != 10) + bioset_exit(&mddev->io_acct_set); kfree(mddev); } @@ -5866,7 +5868,8 @@ int md_run(struct mddev *mddev) if (err) goto exit_bio_set; } - if (!bioset_initialized(&mddev->io_acct_set)) { + if (mddev->level != 1 && mddev->level != 10 && + !bioset_initialized(&mddev->io_acct_set)) { err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE, offsetof(struct md_io_acct, bio_clone), 0); if (err) @@ -6048,7 +6051,8 @@ bitmap_abort: module_put(pers->owner); md_bitmap_destroy(mddev); abort: - bioset_exit(&mddev->io_acct_set); + if (mddev->level != 1 && mddev->level != 10) + bioset_exit(&mddev->io_acct_set); exit_sync_set: bioset_exit(&mddev->sync_set); exit_bio_set: @@ -6276,7 +6280,8 @@ void md_stop(struct mddev *mddev) __md_stop(mddev); bioset_exit(&mddev->bio_set); bioset_exit(&mddev->sync_set); - bioset_exit(&mddev->io_acct_set); + if (mddev->level != 1 && mddev->level != 10) + bioset_exit(&mddev->io_acct_set); } EXPORT_SYMBOL_GPL(md_stop); @@ -8593,7 +8598,10 @@ static void md_end_io_acct(struct bio *bio) bio_endio(orig_bio); } -/* used by personalities (raid0 and raid5) to account io stats */ +/* + * Used by personalities that don't already clone the bio and thus can't + * easily add the timestamp to their extended bio structure. + */ void md_account_bio(struct mddev *mddev, struct bio **bio) { struct md_io_acct *md_io_acct; From de3ea66e9d23a34eef5e17f960d6473f78a1c54b Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Thu, 3 Jun 2021 17:21:07 +0800 Subject: [PATCH 11/12] md: add comments in md_integrity_register Given it is not obvious for the error handling, let's try to add some comments here to make it clear. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/md.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 56b606184c87..2c69905dd5c0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2343,6 +2343,12 @@ int md_integrity_register(struct mddev *mddev) if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) || (mddev->level != 1 && mddev->level != 10 && bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) { + /* + * No need to handle the failure of bioset_integrity_create, + * because the function is called by md_run() -> pers->run(), + * md_run calls bioset_exit -> bioset_integrity_free in case + * of failure case. 
+ */ pr_err("md: failed to create integrity pool for %s\n", mdname(mddev)); return -EINVAL; From 97ae27252f4962d0fcc38ee1d9f913d817a2024e Mon Sep 17 00:00:00 2001 From: Gal Ofri Date: Mon, 7 Jun 2021 14:07:03 +0300 Subject: [PATCH 12/12] md/raid5: avoid device_lock in read_one_chunk() There is a lock contention on device_lock in read_one_chunk(). device_lock is taken to sync conf->active_aligned_reads and conf->quiesce. read_one_chunk() takes the lock, then waits for quiesce=0 (resumed) before incrementing active_aligned_reads. raid5_quiesce() takes the lock, sets quiesce=2 (in-progress), then waits for active_aligned_reads to be zero before setting quiesce=1 (suspended). Introduce a fast (lockless) path in read_one_chunk(): activate aligned read without taking device_lock. In case quiesce starts while activating the aligned-read in fast path, deactivate it and revert to old behavior (take device_lock and wait for quiesce to finish). Add smp store/load in raid5_quiesce()/read_one_chunk() respectively to gaurantee that read_one_chunk() does not miss an ongoing quiesce. My setups: 1. 8 local nvme drives (each up to 250k iops). 2. 8 ram disks (brd). Each setup with raid6 (6+2), 1024 io threads on a 96 cpu-cores (48 per socket) system. Record both iops and cpu spent on this contention with rand-read-4k. Record bw with sequential-read-128k. Note: in most cases cpu is still busy but due to "new" bottlenecks. nvme: | iops | cpu | bw ----------------------------------------------- without patch | 1.6M | ~50% | 5.5GB/s with patch | 2M (throttled) | 0% | 16GB/s (throttled) ram (brd): | iops | cpu | bw ----------------------------------------------- without patch | 2M | ~80% | 24GB/s with patch | 4M | 0% | 55GB/s CC: Song Liu CC: Neil Brown Reviewed-by: NeilBrown Signed-off-by: Gal Ofri Signed-off-by: Song Liu --- drivers/md/raid5.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0ee9aa0113f3..e248532bb70a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5403,6 +5403,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) sector_t sector, end_sector, first_bad; int bad_sectors, dd_idx; struct md_io_acct *md_io_acct; + bool did_inc; if (!in_chunk_boundary(mddev, raid_bio)) { pr_debug("%s: non aligned\n", __func__); @@ -5454,11 +5455,24 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) /* No reshape active, so we can trust rdev->data_offset */ align_bio->bi_iter.bi_sector += rdev->data_offset; - spin_lock_irq(&conf->device_lock); - wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0, - conf->device_lock); - atomic_inc(&conf->active_aligned_reads); - spin_unlock_irq(&conf->device_lock); + did_inc = false; + if (conf->quiesce == 0) { + atomic_inc(&conf->active_aligned_reads); + did_inc = true; + } + /* need a memory barrier to detect the race with raid5_quiesce() */ + if (!did_inc || smp_load_acquire(&conf->quiesce) != 0) { + /* quiesce is in progress, so we need to undo io activation and wait + * for it to finish + */ + if (did_inc && atomic_dec_and_test(&conf->active_aligned_reads)) + wake_up(&conf->wait_for_quiescent); + spin_lock_irq(&conf->device_lock); + wait_event_lock_irq(conf->wait_for_quiescent, conf->quiesce == 0, + conf->device_lock); + atomic_inc(&conf->active_aligned_reads); + spin_unlock_irq(&conf->device_lock); + } if (mddev->gendisk) trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk), @@ -8346,7 +8360,10 @@ 
static void raid5_quiesce(struct mddev *mddev, int quiesce) * active stripes can drain */ r5c_flush_cache(conf, INT_MAX); - conf->quiesce = 2; + /* need a memory barrier to make sure read_one_chunk() sees + * quiesce started and reverts to slow (locked) path. + */ + smp_store_release(&conf->quiesce, 2); wait_event_cmd(conf->wait_for_quiescent, atomic_read(&conf->active_stripes) == 0 && atomic_read(&conf->active_aligned_reads) == 0,