From 125bfc7cd750e68c99f1d446e2c22abea08c237f Mon Sep 17 00:00:00 2001
From: Li Nan <linan122@huawei.com>
Date: Fri, 9 Jun 2023 17:43:20 +0800
Subject: [PATCH 1/5] md/raid10: fix the condition to call bio_end_io_acct()

/sys/block/[device]/queue/iostats is used to control whether to count io
stat. Write 0 to it will clear queue_flags QUEUE_FLAG_IO_STAT which means
iostats is disabled. If we disable iostats and later endable it, the io
issued during this period will be counted incorrectly, inflight will be
decreased to -1.

  //T1 set iostats
  echo 0 > /sys/block/md0/queue/iostats
   clear QUEUE_FLAG_IO_STAT

			//T2 issue io
			if (QUEUE_FLAG_IO_STAT) -> false
			 bio_start_io_acct
			  inflight++

  echo 1 > /sys/block/md0/queue/iostats
   set QUEUE_FLAG_IO_STAT

					//T3 io end
					if (QUEUE_FLAG_IO_STAT) -> true
					 bio_end_io_acct
					  inflight--	-> -1

Also, if iostats is enabled while issuing io but disabled while io end,
inflight will never be decreased.

Fix it by checking start_time when io end. If start_time is not 0, call
bio_end_io_acct().

Fixes: 528bc2cf2fcc ("md/raid10: enable io accounting")
Signed-off-by: Li Nan <linan122@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230609094320.2397604-1-linan666@huaweicloud.com
---
 drivers/md/raid10.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index d0de8c9fb3cf..79067769e44b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -325,7 +325,7 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
 		bio->bi_status = BLK_STS_IOERR;
 
-	if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
+	if (r10_bio->start_time)
 		bio_end_io_acct(bio, r10_bio->start_time);
 	bio_endio(bio);
 	/*

From b5a99602b74bbfa655be509c615181dd95b0719e Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Fri, 16 Jun 2023 09:21:36 +0800
Subject: [PATCH 2/5] md/raid1-10: fix casting from randomized structure in
 raid1_submit_write()

Following build error triggered while build with clang version 17.0.0
with W=1(this can't be reporduced with gcc 13.1.0):

drivers/md/raid1-10.c:117:25: error: casting from randomized structure
pointer type 'struct block_device *' to 'struct md_rdev *'
     117 |         struct md_rdev *rdev = (struct md_rdev *)bio->bi_bdev;
         |                                ^

Fix this by casting 'bio->bi_bdev' to 'void *', as it used to be.

Reported-by: kernel test robot <lkp@intel.com>
Closes: https://lore.kernel.org/oe-kbuild-all/202306142042.fmjfmTF8-lkp@intel.com/
Fixes: 8295efbe68c0 ("md/raid1-10: factor out a helper to submit normal write")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230616012136.3047071-1-yukuai1@huaweicloud.com
---
 drivers/md/raid1-10.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 169ebe296f2d..3f22edec70e7 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -116,7 +116,7 @@ static void md_bio_reset_resync_pages(struct bio *bio, struct resync_pages *rp,
 
 static inline void raid1_submit_write(struct bio *bio)
 {
-	struct md_rdev *rdev = (struct md_rdev *)bio->bi_bdev;
+	struct md_rdev *rdev = (void *)bio->bi_bdev;
 
 	bio->bi_next = NULL;
 	bio_set_dev(bio, rdev->bdev);

From a1d7671910965ca9f8f0377e7e3bfd1179fba4d8 Mon Sep 17 00:00:00 2001
From: Song Liu <song@kernel.org>
Date: Fri, 16 Jun 2023 22:24:04 -0700
Subject: [PATCH 3/5] md: use mddev->external to select holder in export_rdev()

mdadm test "10ddf-create-fail-rebuild" triggers warnings like the following

[  215.526357] ------------[ cut here ]------------
[  215.527243] WARNING: CPU: 18 PID: 1264 at block/bdev.c:617 blkdev_put+0x269/0x350
[  215.528334] Modules linked in:
[  215.528806] CPU: 18 PID: 1264 Comm: mdmon Not tainted 6.4.0-rc2+ #768
[  215.529863] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
[  215.531464] RIP: 0010:blkdev_put+0x269/0x350
[  215.532167] Code: ff ff 49 8d 7d 10 e8 56 bf b8 ff 4d 8b 65 10 49 8d bc
24 58 05 00 00 e8 05 be b8 ff 41 83 ac 24 58 05 00 00 01 e9 44 ff ff ff
<0f> 0b e9 52 fe ff ff 0f 0b e9 6b fe ff ff1
[  215.534780] RSP: 0018:ffffc900040bfbf0 EFLAGS: 00010283
[  215.535635] RAX: ffff888174001000 RBX: ffff88810b1c3b00 RCX: ffffffff819a4061
[  215.536645] RDX: dffffc0000000000 RSI: dffffc0000000000 RDI: ffff88810b1c3ba0
[  215.537657] RBP: ffff88810dbde800 R08: fffffbfff0fca983 R09: fffffbfff0fca983
[  215.538674] R10: ffffc900040bfbf0 R11: fffffbfff0fca982 R12: ffff88810b1c3b38
[  215.539687] R13: ffff88810b1c3b10 R14: ffff88810dbdecb8 R15: ffff88810b1c3b00
[  215.540833] FS:  00007f2aabdff700(0000) GS:ffff888dfb400000(0000) knlGS:0000000000000000
[  215.541961] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  215.542775] CR2: 00007fa19a85d934 CR3: 000000010c076006 CR4: 0000000000370ee0
[  215.543814] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  215.544840] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  215.545885] Call Trace:
[  215.546257]  <TASK>
[  215.546608]  export_rdev.isra.63+0x71/0xe0
[  215.547338]  mddev_unlock+0x1b1/0x2d0
[  215.547898]  array_state_store+0x28d/0x450
[  215.548519]  md_attr_store+0xd7/0x150
[  215.549059]  ? __pfx_sysfs_kf_write+0x10/0x10
[  215.549702]  kernfs_fop_write_iter+0x1b9/0x260
[  215.550351]  vfs_write+0x491/0x760
[  215.550863]  ? __pfx_vfs_write+0x10/0x10
[  215.551445]  ? __fget_files+0x156/0x230
[  215.552053]  ksys_write+0xc0/0x160
[  215.552570]  ? __pfx_ksys_write+0x10/0x10
[  215.553141]  ? ktime_get_coarse_real_ts64+0xec/0x100
[  215.553878]  do_syscall_64+0x3a/0x90
[  215.554403]  entry_SYSCALL_64_after_hwframe+0x72/0xdc
[  215.555125] RIP: 0033:0x7f2aade11847
[  215.555696] Code: c3 66 90 41 54 49 89 d4 55 48 89 f5 53 89 fb 48 83 ec
10 e8 1b fd ff ff 4c 89 e2 48 89 ee 89 df 41 89 c0 b8 01 00 00 00 0f 05
<48> 3d 00 f0 ff ff 77 35 44 89 c7 48 89 448
[  215.558398] RSP: 002b:00007f2aabdfeba0 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
[  215.559516] RAX: ffffffffffffffda RBX: 0000000000000010 RCX: 00007f2aade11847
[  215.560515] RDX: 0000000000000005 RSI: 0000000000438b8b RDI: 0000000000000010
[  215.561512] RBP: 0000000000438b8b R08: 0000000000000000 R09: 00007f2aaecf0060
[  215.562511] R10: 000000000e3ba40b R11: 0000000000000293 R12: 0000000000000005
[  215.563647] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000c70750
[  215.564693]  </TASK>
[  215.565029] irq event stamp: 15979
[  215.565584] hardirqs last  enabled at (15991): [<ffffffff811a7432>] __up_console_sem+0x52/0x60
[  215.566806] hardirqs last disabled at (16000): [<ffffffff811a7417>] __up_console_sem+0x37/0x60
[  215.568022] softirqs last  enabled at (15716): [<ffffffff8277a2db>] __do_softirq+0x3eb/0x531
[  215.569239] softirqs last disabled at (15711): [<ffffffff810d8f45>] irq_exit_rcu+0x115/0x160
[  215.570434] ---[ end trace 0000000000000000 ]---

This means export_rdev() calls blkdev_put with a different holder than the
one used by blkdev_get_by_dev(). This is because mddev->major_version == -2
is not a good check for external metadata. Fix this by using
mddev->external instead.

Also, do not clear mddev->external in md_clean(), as the flag might be used
later in export_rdev().

Fixes: 2736e8eeb0cc ("block: use the holder as indication for exclusive opens")
Cc: Christoph Hellwig <hch@lst.de>
Cc: Jens Axboe <axboe@kernel.dk>
Signed-off-by: Song Liu <song@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Link: https://lore.kernel.org/r/20230617052405.305871-1-song@kernel.org
---
 drivers/md/md.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index cf3733c90c47..8e7cc2e69bc9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2458,7 +2458,7 @@ static void export_rdev(struct md_rdev *rdev, struct mddev *mddev)
 	if (test_bit(AutoDetected, &rdev->flags))
 		md_autodetect_dev(rdev->bdev->bd_dev);
 #endif
-	blkdev_put(rdev->bdev, mddev->major_version == -2 ? &claim_rdev : rdev);
+	blkdev_put(rdev->bdev, mddev->external ? &claim_rdev : rdev);
 	rdev->bdev = NULL;
 	kobject_put(&rdev->kobj);
 }
@@ -6140,7 +6140,7 @@ static void md_clean(struct mddev *mddev)
 	mddev->resync_min = 0;
 	mddev->resync_max = MaxSector;
 	mddev->reshape_position = MaxSector;
-	mddev->external = 0;
+	/* we still need mddev->external in export_rdev, do not clear it yet */
 	mddev->persistent = 0;
 	mddev->level = LEVEL_NONE;
 	mddev->clevel[0] = 0;

From 4934b6401a812f9fe368e7d2d091cd1d120ea262 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Wed, 21 Jun 2023 22:29:33 +0800
Subject: [PATCH 4/5] md: fix 'delete_mutex' deadlock

Commit 3ce94ce5d05a ("md: fix duplicate filename for rdev") introduce a
new lock 'delete_mutex', and trigger a new deadlock:

t1: remove rdev			t2: sysfs writer

rdev_attr_store			rdev_attr_store
 mddev_lock
 state_store
 md_kick_rdev_from_array
  lock delete_mutex
  list_add mddev->deleting
  unlock delete_mutex
 mddev_unlock
				 mddev_lock
				 ...
  lock delete_mutex
  kobject_del
  // wait for sysfs writers to be done
				 mddev_unlock
				 lock delete_mutex
				 // wait for delete_mutex, deadlock

'delete_mutex' is used to protect the list 'mddev->deleting', turns out
that this list can be protected by 'reconfig_mutex' directly, and this
lock can be removed.

Fix this problem by removing the lock, and use 'reconfig_mutex' to
protect the list. mddev_unlock() will move this list to a local list to
be handled after 'reconfig_mutex' is dropped.

Fixes: 3ce94ce5d05a ("md: fix duplicate filename for rdev")
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230621142933.1395629-1-yukuai1@huaweicloud.com
---
 drivers/md/md.c | 28 +++++++++-------------------
 drivers/md/md.h |  4 +---
 2 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8e7cc2e69bc9..2e38ef421d69 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -643,7 +643,6 @@ void mddev_init(struct mddev *mddev)
 {
 	mutex_init(&mddev->open_mutex);
 	mutex_init(&mddev->reconfig_mutex);
-	mutex_init(&mddev->delete_mutex);
 	mutex_init(&mddev->bitmap_info.mutex);
 	INIT_LIST_HEAD(&mddev->disks);
 	INIT_LIST_HEAD(&mddev->all_mddevs);
@@ -749,26 +748,15 @@ static void mddev_free(struct mddev *mddev)
 
 static const struct attribute_group md_redundancy_group;
 
-static void md_free_rdev(struct mddev *mddev)
+void mddev_unlock(struct mddev *mddev)
 {
 	struct md_rdev *rdev;
 	struct md_rdev *tmp;
+	LIST_HEAD(delete);
 
-	mutex_lock(&mddev->delete_mutex);
-	if (list_empty(&mddev->deleting))
-		goto out;
+	if (!list_empty(&mddev->deleting))
+		list_splice_init(&mddev->deleting, &delete);
 
-	list_for_each_entry_safe(rdev, tmp, &mddev->deleting, same_set) {
-		list_del_init(&rdev->same_set);
-		kobject_del(&rdev->kobj);
-		export_rdev(rdev, mddev);
-	}
-out:
-	mutex_unlock(&mddev->delete_mutex);
-}
-
-void mddev_unlock(struct mddev *mddev)
-{
 	if (mddev->to_remove) {
 		/* These cannot be removed under reconfig_mutex as
 		 * an access to the files will try to take reconfig_mutex
@@ -808,7 +796,11 @@ void mddev_unlock(struct mddev *mddev)
 	} else
 		mutex_unlock(&mddev->reconfig_mutex);
 
-	md_free_rdev(mddev);
+	list_for_each_entry_safe(rdev, tmp, &delete, same_set) {
+		list_del_init(&rdev->same_set);
+		kobject_del(&rdev->kobj);
+		export_rdev(rdev, mddev);
+	}
 
 	md_wakeup_thread(mddev->thread);
 	wake_up(&mddev->sb_wait);
@@ -2488,9 +2480,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
 	 * reconfig_mutex is held, hence it can't be called under
 	 * reconfig_mutex and it's delayed to mddev_unlock().
 	 */
-	mutex_lock(&mddev->delete_mutex);
 	list_add(&rdev->same_set, &mddev->deleting);
-	mutex_unlock(&mddev->delete_mutex);
 }
 
 static void export_array(struct mddev *mddev)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index bfd2306bc750..1aef86bf3fc3 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -531,11 +531,9 @@ struct mddev {
 
 	/*
 	 * Temporarily store rdev that will be finally removed when
-	 * reconfig_mutex is unlocked.
+	 * reconfig_mutex is unlocked, protected by reconfig_mutex.
 	 */
 	struct list_head		deleting;
-	/* Protect the deleting list */
-	struct mutex			delete_mutex;
 
 	bool	has_superblocks:1;
 	bool	fail_last_dev:1;

From a8d5fdd4d2702d0c7ec125bd3bbce3fc589afa67 Mon Sep 17 00:00:00 2001
From: Yu Kuai <yukuai3@huawei.com>
Date: Wed, 21 Jun 2023 18:57:28 +0800
Subject: [PATCH 5/5] raid10: avoid spin_lock from fastpath from
 raid10_unplug()

Commit 0c0be98bbe67 ("md/raid10: prevent unnecessary calls to wake_up()
in fast path") missed one place, for example, with:

	fio -direct=1 -rw=write/randwrite -iodepth=1 ...

Plug and unplug are called for each io, then wake_up() from raid10_unplug()
will cause lock contention as well.

Avoid this contention by using wake_up_barrier() instead of wake_up(),
where spin_lock is not held if waitqueue is empty.

Fio test script:

[global]
name=random reads and writes
ioengine=libaio
direct=1
readwrite=randrw
rwmixread=70
iodepth=64
buffered=0
filename=/dev/md0
size=1G
runtime=30
time_based
randrepeat=0
norandommap
refill_buffers
ramp_time=10
bs=4k
numjobs=400
group_reporting=1
[job1]

Test result with ramdisk raid10(By Ali):

	Before this patch	With this patch
READ	IOPS=2033k		IOPS=3642k
WRITE	IOPS=871k		IOPS=1561K

By the way, in this scenario, blk_plug_cb() will be allocated and freed
for each io, this seems need to be optimized as well.

Reported-and-tested-by: Ali Gholami Rudi <aligrudi@gmail.com>
Closes: https://lore.kernel.org/all/20231606122233@laper.mirepesht/
Signed-off-by: Yu Kuai <yukuai3@huawei.com>
Signed-off-by: Song Liu <song@kernel.org>
Link: https://lore.kernel.org/r/20230621105728.1268542-1-yukuai1@huaweicloud.com
---
 drivers/md/raid10.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 79067769e44b..5051149e27bb 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1118,7 +1118,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 		spin_lock_irq(&conf->device_lock);
 		bio_list_merge(&conf->pending_bio_list, &plug->pending);
 		spin_unlock_irq(&conf->device_lock);
-		wake_up(&conf->wait_barrier);
+		wake_up_barrier(conf);
 		md_wakeup_thread(mddev->thread);
 		kfree(plug);
 		return;
@@ -1127,7 +1127,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
 	/* we aren't scheduling, so we can do the write-out directly. */
 	bio = bio_list_get(&plug->pending);
 	raid1_prepare_flush_writes(mddev->bitmap);
-	wake_up(&conf->wait_barrier);
+	wake_up_barrier(conf);
 
 	while (bio) { /* submit pending writes */
 		struct bio *next = bio->bi_next;