From 21e0958ec9684e76e32f822c5e611a7d7ea0a5ba Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Sat, 4 Apr 2020 23:57:07 +0200
Subject: [PATCH 01/13] md: add checkings before flush md_misc_wq

Coly reported a possible circular locking dependency with LOCKDEP
enabled; the relevant info from the detailed report [1] is quoted below.

[ 1607.673903] Chain exists of:
[ 1607.673903]   kn->count#256 --> (wq_completion)md_misc --> (work_completion)(&rdev->del_work)
[ 1607.673903]
[ 1607.827946]  Possible unsafe locking scenario:
[ 1607.827946]
[ 1607.898780]        CPU0                    CPU1
[ 1607.952980]        ----                    ----
[ 1608.007173]   lock((work_completion)(&rdev->del_work));
[ 1608.069690]                                lock((wq_completion)md_misc);
[ 1608.149887]                                lock((work_completion)(&rdev->del_work));
[ 1608.242563]   lock(kn->count#256);
[ 1608.283238]
[ 1608.283238]  *** DEADLOCK ***
[ 1608.283238]
[ 1608.354078] 2 locks held by kworker/5:0/843:
[ 1608.405152]  #0: ffff8889eecc9948 ((wq_completion)md_misc){+.+.}, at: process_one_work+0x42b/0xb30
[ 1608.512399]  #1: ffff888a1d3b7e10 ((work_completion)(&rdev->del_work)){+.+.}, at: process_one_work+0x42b/0xb30
[ 1608.632130]

Since both works (rdev->del_work and mddev->del_work) are queued on
md_misc_wq, the workqueue's lockdep_map is held while either of them
runs, and both of them take the kernfs lock by calling kobject_del.
Meanwhile, when new_dev_store or array_state_store is triggered by a
write to the related sysfs node, the write operation holds the kernfs
lock but also needs the same lockdep_map, because both paths eventually
call flush_workqueue(md_misc_wq).

To suppress the lockdep warning, only flush the workqueue when the
related work is actually pending. Several works are attached to
md_misc_wq, so determine which work must be checked in each path (the
general pattern is sketched after this list):

1. For __md_stop_writes, the flush is there to ensure the sync thread
   has been started if it was starting, so check whether mddev->del_work
   is pending, since md_start_sync is attached to mddev->del_work.
2. __md_stop flushes md_misc_wq to ensure event_work is done, so
   checking event_work is enough. Assume the raid_{ctr,dtr} -> md_stop
   -> __md_stop path doesn't need the kernfs lock.
3. Both new_dev_store (which holds the kernfs lock) and the ADD_NEW_DISK
   ioctl (which holds bdev->bd_mutex) call flush_workqueue to ensure
   md_delayed_delete has completed; this case is handled in the next
   patch.
4. md_open flushes the workqueue to ensure the previous md has
   disappeared, but it holds bdev->bd_mutex while doing so, so it is
   better to check mddev->del_work there as well to avoid a potential
   lock issue; this is done in another patch.
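For reference, the shape of the fix is the conditional-flush pattern
sketched below. This is a minimal illustration in kernel C, not code
from the patch itself; the helper name flush_if_pending() is made up.

    #include <linux/workqueue.h>

    /*
     * Only flush the shared workqueue when the one work item we must
     * wait for is actually pending.  An unconditional flush_workqueue()
     * makes lockdep record a dependency on every work item ever queued
     * on that workqueue, which is what produces the false-positive
     * report quoted above.
     */
    static void flush_if_pending(struct work_struct *work,
                                 struct workqueue_struct *wq)
    {
            if (work_pending(work))
                    flush_workqueue(wq);
    }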
[1]: https://marc.info/?l=linux-raid&m=158518958031584&w=2

Cc: Coly Li
Reported-by: Coly Li
Signed-off-by: Guoqing Jiang
Signed-off-by: Song Liu
---
 drivers/md/md.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 271e8a587354..76c32ca6eee8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -6147,7 +6147,8 @@ static void md_clean(struct mddev *mddev)
 static void __md_stop_writes(struct mddev *mddev)
 {
 	set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
-	flush_workqueue(md_misc_wq);
+	if (work_pending(&mddev->del_work))
+		flush_workqueue(md_misc_wq);
 	if (mddev->sync_thread) {
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 		md_reap_sync_thread(mddev);
@@ -6200,7 +6201,8 @@ static void __md_stop(struct mddev *mddev)
 	md_bitmap_destroy(mddev);
 	mddev_detach(mddev);
 	/* Ensure ->event_work is done */
-	flush_workqueue(md_misc_wq);
+	if (mddev->event_work.func)
+		flush_workqueue(md_misc_wq);
 	spin_lock(&mddev->lock);
 	mddev->pers = NULL;
 	spin_unlock(&mddev->lock);

From cc1ffe61c026e2318024bfa8722e8f689fd2a7f4 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Sat, 4 Apr 2020 23:57:08 +0200
Subject: [PATCH 02/13] md: add new workqueue for delete rdev

The purpose of the flush_workqueue call in new_dev_store is to ensure
md_delayed_delete() has completed, so the check should be whether
rdev->del_work is pending. But to suppress the lockdep warning we would
have to check mddev->del_work, even though md_delayed_delete is attached
to rdev->del_work, which does not match the purpose of the flush.

So introduce a new workqueue to avoid this awkward situation, together
with a new function flush_rdev_wq that flushes the new workqueue after
checking whether any rdev has pending work.

Like new_dev_store, the ADD_NEW_DISK ioctl flushes the workqueue for the
same purpose while holding bdev->bd_mutex, so apply the same change to
the ioctl to avoid a similar lock issue.

Also, md_delayed_delete actually deletes an rdev, so rename the function
to rdev_delayed_delete.

Signed-off-by: Guoqing Jiang
Signed-off-by: Song Liu
---
 drivers/md/md.c | 37 +++++++++++++++++++++++++++++--------
 1 file changed, 29 insertions(+), 8 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 76c32ca6eee8..a970fdd0ce8c 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -89,6 +89,7 @@ static struct module *md_cluster_mod;
 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
 static struct workqueue_struct *md_wq;
 static struct workqueue_struct *md_misc_wq;
+static struct workqueue_struct *md_rdev_misc_wq;
 
 static int remove_and_add_spares(struct mddev *mddev,
 				 struct md_rdev *this);
@@ -2454,7 +2455,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
 	return err;
 }
 
-static void md_delayed_delete(struct work_struct *ws)
+static void rdev_delayed_delete(struct work_struct *ws)
 {
 	struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
 	kobject_del(&rdev->kobj);
@@ -2479,9 +2480,9 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
 	 * to delay it due to rcu usage.
 	 */
 	synchronize_rcu();
-	INIT_WORK(&rdev->del_work, md_delayed_delete);
+	INIT_WORK(&rdev->del_work, rdev_delayed_delete);
 	kobject_get(&rdev->kobj);
-	queue_work(md_misc_wq, &rdev->del_work);
+	queue_work(md_rdev_misc_wq, &rdev->del_work);
 }
 
 /*
@@ -4514,6 +4515,20 @@ null_show(struct mddev *mddev, char *page)
 	return -EINVAL;
 }
 
+/* need to ensure rdev_delayed_delete() has completed */
+static void flush_rdev_wq(struct mddev *mddev)
+{
+	struct md_rdev *rdev;
+
+	rcu_read_lock();
+	rdev_for_each_rcu(rdev, mddev)
+		if (work_pending(&rdev->del_work)) {
+			flush_workqueue(md_rdev_misc_wq);
+			break;
+		}
+	rcu_read_unlock();
+}
+
 static ssize_t
 new_dev_store(struct mddev *mddev, const char *buf, size_t len)
 {
@@ -4541,8 +4556,7 @@ new_dev_store(struct mddev *mddev, const char *buf, size_t len)
 	    minor != MINOR(dev))
 		return -EOVERFLOW;
 
-	flush_workqueue(md_misc_wq);
-
+	flush_rdev_wq(mddev);
 	err = mddev_lock(mddev);
 	if (err)
 		return err;
@@ -4780,7 +4794,8 @@ action_store(struct mddev *mddev, const char *page, size_t len)
 		clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
 		if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
 		    mddev_lock(mddev) == 0) {
-			flush_workqueue(md_misc_wq);
+			if (work_pending(&mddev->del_work))
+				flush_workqueue(md_misc_wq);
 			if (mddev->sync_thread) {
 				set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 				md_reap_sync_thread(mddev);
@@ -7498,8 +7513,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 	}
 
 	if (cmd == ADD_NEW_DISK)
-		/* need to ensure md_delayed_delete() has completed */
-		flush_workqueue(md_misc_wq);
+		flush_rdev_wq(mddev);
 
 	if (cmd == HOT_REMOVE_DISK)
 		/* need to ensure recovery thread has run */
@@ -9471,6 +9485,10 @@ static int __init md_init(void)
 	if (!md_misc_wq)
 		goto err_misc_wq;
 
+	md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0);
+	if (!md_rdev_misc_wq)
+		goto err_rdev_misc_wq;
+
 	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
 		goto err_md;
 
@@ -9492,6 +9510,8 @@ static int __init md_init(void)
 err_mdp:
 	unregister_blkdev(MD_MAJOR, "md");
 err_md:
+	destroy_workqueue(md_rdev_misc_wq);
+err_rdev_misc_wq:
 	destroy_workqueue(md_misc_wq);
 err_misc_wq:
 	destroy_workqueue(md_wq);
@@ -9778,6 +9798,7 @@ static __exit void md_exit(void)
 		 * destroy_workqueue() below will wait for that to complete.
 		 */
 	}
+	destroy_workqueue(md_rdev_misc_wq);
 	destroy_workqueue(md_misc_wq);
 	destroy_workqueue(md_wq);
 }

From f6766ff6afff70e2aaf39e1511e16d471de7c3ae Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Sat, 4 Apr 2020 23:57:09 +0200
Subject: [PATCH 03/13] md: don't flush workqueue unconditionally in md_open

We need to check mddev->del_work before flushing the workqueue, since
the purpose of the flush is to ensure the previous md has disappeared.
Otherwise a similar deadlock appears when LOCKDEP is enabled, because
md_open holds bdev->bd_mutex before flushing the workqueue.

kernel: [  154.522645] ======================================================
kernel: [  154.522647] WARNING: possible circular locking dependency detected
kernel: [  154.522650] 5.6.0-rc7-lp151.27-default #25 Tainted: G           O
kernel: [  154.522651] ------------------------------------------------------
kernel: [  154.522653] mdadm/2482 is trying to acquire lock:
kernel: [  154.522655] ffff888078529128 ((wq_completion)md_misc){+.+.}, at: flush_workqueue+0x84/0x4b0
kernel: [  154.522673]
kernel: [  154.522673] but task is already holding lock:
kernel: [  154.522675] ffff88804efa9338 (&bdev->bd_mutex){+.+.}, at: __blkdev_get+0x79/0x590
kernel: [  154.522691]
kernel: [  154.522691] which lock already depends on the new lock.
kernel: [  154.522691]
kernel: [  154.522694]
kernel: [  154.522694] the existing dependency chain (in reverse order) is:
kernel: [  154.522696]
kernel: [  154.522696] -> #4 (&bdev->bd_mutex){+.+.}:
kernel: [  154.522704]        __mutex_lock+0x87/0x950
kernel: [  154.522706]        __blkdev_get+0x79/0x590
kernel: [  154.522708]        blkdev_get+0x65/0x140
kernel: [  154.522709]        blkdev_get_by_dev+0x2f/0x40
kernel: [  154.522716]        lock_rdev+0x3d/0x90 [md_mod]
kernel: [  154.522719]        md_import_device+0xd6/0x1b0 [md_mod]
kernel: [  154.522723]        new_dev_store+0x15e/0x210 [md_mod]
kernel: [  154.522728]        md_attr_store+0x7a/0xc0 [md_mod]
kernel: [  154.522732]        kernfs_fop_write+0x117/0x1b0
kernel: [  154.522735]        vfs_write+0xad/0x1a0
kernel: [  154.522737]        ksys_write+0xa4/0xe0
kernel: [  154.522745]        do_syscall_64+0x64/0x2b0
kernel: [  154.522748]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
kernel: [  154.522749]
kernel: [  154.522749] -> #3 (&mddev->reconfig_mutex){+.+.}:
kernel: [  154.522752]        __mutex_lock+0x87/0x950
kernel: [  154.522756]        new_dev_store+0xc9/0x210 [md_mod]
kernel: [  154.522759]        md_attr_store+0x7a/0xc0 [md_mod]
kernel: [  154.522761]        kernfs_fop_write+0x117/0x1b0
kernel: [  154.522763]        vfs_write+0xad/0x1a0
kernel: [  154.522765]        ksys_write+0xa4/0xe0
kernel: [  154.522767]        do_syscall_64+0x64/0x2b0
kernel: [  154.522769]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
kernel: [  154.522770]
kernel: [  154.522770] -> #2 (kn->count#253){++++}:
kernel: [  154.522775]        __kernfs_remove+0x253/0x2c0
kernel: [  154.522778]        kernfs_remove+0x1f/0x30
kernel: [  154.522780]        kobject_del+0x28/0x60
kernel: [  154.522783]        mddev_delayed_delete+0x24/0x30 [md_mod]
kernel: [  154.522786]        process_one_work+0x2a7/0x5f0
kernel: [  154.522788]        worker_thread+0x2d/0x3d0
kernel: [  154.522793]        kthread+0x117/0x130
kernel: [  154.522795]        ret_from_fork+0x3a/0x50
kernel: [  154.522796]
kernel: [  154.522796] -> #1 ((work_completion)(&mddev->del_work)){+.+.}:
kernel: [  154.522800]        process_one_work+0x27e/0x5f0
kernel: [  154.522802]        worker_thread+0x2d/0x3d0
kernel: [  154.522804]        kthread+0x117/0x130
kernel: [  154.522806]        ret_from_fork+0x3a/0x50
kernel: [  154.522807]
kernel: [  154.522807] -> #0 ((wq_completion)md_misc){+.+.}:
kernel: [  154.522813]        __lock_acquire+0x1392/0x1690
kernel: [  154.522816]        lock_acquire+0xb4/0x1a0
kernel: [  154.522818]        flush_workqueue+0xab/0x4b0
kernel: [  154.522821]        md_open+0xb6/0xc0 [md_mod]
kernel: [  154.522823]        __blkdev_get+0xea/0x590
kernel: [  154.522825]        blkdev_get+0x65/0x140
kernel: [  154.522828]        do_dentry_open+0x1d1/0x380
kernel: [  154.522831]        path_openat+0x567/0xcc0
kernel: [  154.522834]        do_filp_open+0x9b/0x110
kernel: [  154.522836]        do_sys_openat2+0x201/0x2a0
kernel: [  154.522838]        do_sys_open+0x57/0x80
kernel: [  154.522840]        do_syscall_64+0x64/0x2b0
kernel: [  154.522842]        entry_SYSCALL_64_after_hwframe+0x49/0xbe
kernel: [  154.522844]
kernel: [  154.522844] other info that might help us debug this:
kernel: [  154.522844]
kernel: [  154.522846] Chain exists of:
kernel: [  154.522846]   (wq_completion)md_misc --> &mddev->reconfig_mutex --> &bdev->bd_mutex
kernel: [  154.522846]
kernel: [  154.522850]  Possible unsafe locking scenario:
kernel: [  154.522850]
kernel: [  154.522852]        CPU0                    CPU1
kernel: [  154.522853]        ----                    ----
kernel: [  154.522854]   lock(&bdev->bd_mutex);
kernel: [  154.522856]                                lock(&mddev->reconfig_mutex);
kernel: [  154.522858]                                lock(&bdev->bd_mutex);
kernel: [  154.522860]   lock((wq_completion)md_misc);
kernel: [  154.522861]
kernel: [  154.522861]  *** DEADLOCK ***
kernel: [  154.522861]
kernel: [  154.522864] 1 lock held by mdadm/2482:
kernel: [  154.522865]  #0:
ffff88804efa9338 (&bdev->bd_mutex){+.+.}, at: __blkdev_get+0x79/0x590
kernel: [  154.522868]
kernel: [  154.522868] stack backtrace:
kernel: [  154.522873] CPU: 1 PID: 2482 Comm: mdadm Tainted: G           O      5.6.0-rc7-lp151.27-default #25
kernel: [  154.522875] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014
kernel: [  154.522878] Call Trace:
kernel: [  154.522881]  dump_stack+0x8f/0xcb
kernel: [  154.522884]  check_noncircular+0x194/0x1b0
kernel: [  154.522888]  ? __lock_acquire+0x1392/0x1690
kernel: [  154.522890]  __lock_acquire+0x1392/0x1690
kernel: [  154.522893]  lock_acquire+0xb4/0x1a0
kernel: [  154.522895]  ? flush_workqueue+0x84/0x4b0
kernel: [  154.522898]  flush_workqueue+0xab/0x4b0
kernel: [  154.522900]  ? flush_workqueue+0x84/0x4b0
kernel: [  154.522905]  ? md_open+0xb6/0xc0 [md_mod]
kernel: [  154.522908]  md_open+0xb6/0xc0 [md_mod]
kernel: [  154.522910]  __blkdev_get+0xea/0x590
kernel: [  154.522912]  ? bd_acquire+0xc0/0xc0
kernel: [  154.522914]  blkdev_get+0x65/0x140
kernel: [  154.522916]  ? bd_acquire+0xc0/0xc0
kernel: [  154.522918]  do_dentry_open+0x1d1/0x380
kernel: [  154.522921]  path_openat+0x567/0xcc0
kernel: [  154.522923]  ? __lock_acquire+0x380/0x1690
kernel: [  154.522926]  do_filp_open+0x9b/0x110
kernel: [  154.522929]  ? __alloc_fd+0xe5/0x1f0
kernel: [  154.522935]  ? kmem_cache_alloc+0x28c/0x630
kernel: [  154.522939]  ? do_sys_openat2+0x201/0x2a0
kernel: [  154.522941]  do_sys_openat2+0x201/0x2a0
kernel: [  154.522944]  do_sys_open+0x57/0x80
kernel: [  154.522946]  do_syscall_64+0x64/0x2b0
kernel: [  154.522948]  entry_SYSCALL_64_after_hwframe+0x49/0xbe
kernel: [  154.522951] RIP: 0033:0x7f98d279d9ae

md_alloc also flushes the same workqueue, but the situation there is
different: none of the paths that call md_alloc hold bdev->bd_mutex,
and the flush is necessary to avoid a race condition, so leave it as
it is.

Signed-off-by: Guoqing Jiang
Signed-off-by: Song Liu
---
 drivers/md/md.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index a970fdd0ce8c..f5dfa503fb6f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7768,7 +7768,8 @@ static int md_open(struct block_device *bdev, fmode_t mode)
 		 */
 		mddev_put(mddev);
 		/* Wait until bdev->bd_disk is definitely gone */
-		flush_workqueue(md_misc_wq);
+		if (work_pending(&mddev->del_work))
+			flush_workqueue(md_misc_wq);
 		/* Then retry the open from the top */
 		return -ERESTARTSYS;
 	}

From 78b990cf2822d1150a419c13be48e74aefa83f27 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Sat, 4 Apr 2020 23:57:10 +0200
Subject: [PATCH 04/13] md: flush md_rdev_misc_wq for HOT_ADD_DISK case

Since rdev->kobj is removed asynchronously, it is possible that
rdev->kobj still exists when the same rdev is added again after having
been removed. But the path md_ioctl (HOT_ADD_DISK) -> hot_add_disk ->
bind_rdev_to_array missed this case; the race window is sketched below.
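An illustrative interleaving, assuming the rdev_delayed_delete() and
md_rdev_misc_wq names introduced in patch 02 (this timeline is not from
a report, it only sketches the window being closed):

    /*
     *   CPU0: HOT_REMOVE_DISK              CPU1: HOT_ADD_DISK
     *   ---------------------              ------------------
     *   unbind_rdev_from_array()
     *     queue_work(md_rdev_misc_wq,
     *                &rdev->del_work)
     *                                      hot_add_disk()
     *                                        bind_rdev_to_array()
     *                                          kobject_add()  <- old
     *   rdev_delayed_delete()                     kobj not yet deleted
     *     kobject_del(&rdev->kobj)
     *
     * Calling flush_rdev_wq() before the add waits for any pending
     * rdev_delayed_delete() and closes the window.
     */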
Signed-off-by: Guoqing Jiang
Signed-off-by: Song Liu
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index f5dfa503fb6f..95b72e35b355 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7512,7 +7512,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
 	}
 
-	if (cmd == ADD_NEW_DISK)
+	if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
 		flush_rdev_wq(mddev);
 
 	if (cmd == HOT_REMOVE_DISK)

From 3f79cc22348fe7d285bd140209394c9e381e082c Mon Sep 17 00:00:00 2001
From: Guoqing Jiang
Date: Sat, 4 Apr 2020 23:57:11 +0200
Subject: [PATCH 05/13] md: remove the extra line for ->hot_add_disk

It is not necessary to add a line break for these calls since they
don't exceed 80 characters, and the split also makes it unintuitive to
distinguish ->hot_add_disk() from hot_add_disk().

Signed-off-by: Guoqing Jiang
Signed-off-by: Song Liu
---
 drivers/md/md.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 95b72e35b355..8ff690ac6247 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3192,8 +3192,7 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
 			rdev->saved_raid_disk = -1;
 		clear_bit(In_sync, &rdev->flags);
 		clear_bit(Bitmap_sync, &rdev->flags);
-		err = rdev->mddev->pers->
-			hot_add_disk(rdev->mddev, rdev);
+		err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev);
 		if (err) {
 			rdev->raid_disk = -1;
 			return err;
@@ -9057,8 +9056,7 @@ static int remove_and_add_spares(struct mddev *mddev,
 			rdev->recovery_offset = 0;
 		}
-		if (mddev->pers->
-		    hot_add_disk(mddev, rdev) == 0) {
+		if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
 			if (sysfs_link_rdev(mddev, rdev))
 				/* failure here is OK */;
 			if (!test_bit(Journal, &rdev->flags))

From 78f57ef9d50a75326da73d352d7c27828495229a Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Thu, 9 Apr 2020 22:17:20 +0800
Subject: [PATCH 06/13] md: use memalloc scope APIs in mddev_suspend()/mddev_resume()

In raid5.c:resize_chunks(), scribble_alloc() is called with the
GFP_NOIO flag, which is then passed to kvmalloc_array() inside
scribble_alloc().

The problem is that kvmalloc_array() eventually calls kvmalloc_node(),
which does not accept flags that are not GFP_KERNEL compatible, like
GFP_NOIO; kmalloc_node() is then called instead to allocate physically
contiguous pages. When system memory is under heavy pressure and the
requested size is large, there is a high probability that allocating
contiguous pages will fail.

But simply calling kvmalloc_array() with the GFP_KERNEL flag is also
problematic. In the code path where scribble_alloc() is called, the
raid array is suspended; if kvmalloc_node() triggers memory reclaim
I/Os and such I/Os go back to the suspended raid array, a deadlock will
happen.

What is desired here is to allocate non-physically (a.k.a. virtually)
contiguous pages and avoid memory reclaim I/Os. Michal Hocko suggested
using the memalloc scope APIs to restrict memory reclaim I/O in the
allocating context, specifically calling memalloc_noio_save() when
suspending the raid array and memalloc_noio_restore() when resuming it.

This patch adds the memalloc scope APIs in mddev_suspend() and
mddev_resume() to restrict memory reclaim I/Os while the raid array is
suspended. The benefit of adding the memalloc scope APIs in the unified
entry points mddev_suspend()/mddev_resume() is that, no matter which md
raid array type (personality) is used, we can be sure a deadlock caused
by recursive memory reclaim I/O won't happen in the suspending context.
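To make the mechanism concrete: inside a memalloc_noio scope the page
allocator masks __GFP_IO/__GFP_FS from every allocation made by the
current task, so even a GFP_KERNEL request behaves like GFP_NOIO. A
minimal sketch follows; the helper name is illustrative and this is not
the md code itself:

    #include <linux/sched/mm.h>
    #include <linux/slab.h>

    static void *alloc_while_array_suspended(size_t size)
    {
            unsigned int noio_flag = memalloc_noio_save();
            void *p;

            /*
             * Effectively GFP_NOIO here: reclaim triggered by this
             * allocation cannot issue I/O that might land back on the
             * suspended array.
             */
            p = kmalloc(size, GFP_KERNEL);

            memalloc_noio_restore(noio_flag);
            return p;
    }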
Please note that the memalloc scope APIs only take effect in the raid
array suspending context; if memory is allocated from a newly created
kthread after the raid array is suspended, the recursive memory reclaim
I/Os won't be restricted. The mddev_suspend()/mddev_resume() entries are
used for the critical section where the raid metadata is being modified;
creating a kthread to allocate memory inside that critical section would
be odd and very probably buggy.

Fixes: b330e6a49dc3 ("md: convert to kvmalloc")
Suggested-by: Michal Hocko
Signed-off-by: Coly Li
Signed-off-by: Song Liu
---
 drivers/md/md.c | 4 ++++
 drivers/md/md.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 8ff690ac6247..1b3316c79b7b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -528,11 +528,15 @@ void mddev_suspend(struct mddev *mddev)
 	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
 	del_timer_sync(&mddev->safemode_timer);
+	/* restrict memory reclaim I/O while the raid array is suspended */
+	mddev->noio_flag = memalloc_noio_save();
 }
 EXPORT_SYMBOL_GPL(mddev_suspend);
 
 void mddev_resume(struct mddev *mddev)
 {
+	/* entered the memalloc scope from mddev_suspend() */
+	memalloc_noio_restore(mddev->noio_flag);
 	lockdep_assert_held(&mddev->reconfig_mutex);
 	if (--mddev->suspended)
 		return;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index acd681939112..612814d07d35 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -497,6 +497,7 @@ struct mddev {
 	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
 	struct md_cluster_info		*cluster_info;
 	unsigned int			good_device_nr;	/* good device num within cluster raid */
+	unsigned int			noio_flag; /* for memalloc scope API */
 
 	bool	has_superblocks:1;
 	bool	fail_last_dev:1;

From ba54d4d4d2844c234f1b4692bd8c9e0f833c8a54 Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Thu, 9 Apr 2020 22:17:21 +0800
Subject: [PATCH 07/13] raid5: remove gfp flags from scribble_alloc()

Calling scribble_alloc() from resize_chunks() with the GFP_NOIO flag
does not have the expected behavior: kvmalloc_array() inside
scribble_alloc(), when it receives the GFP_NOIO flag, eventually calls
kmalloc_node() to allocate physically contiguous pages.

Now that the memalloc scope APIs in mddev_suspend()/mddev_resume()
prevent memory reclaim I/Os in the raid array suspend context, calling
kvmalloc_array() with the GFP_KERNEL flag avoids the recursive-I/O
deadlock as expected.

This patch removes the now-useless gfp flags from the parameter list of
scribble_alloc() and calls kvmalloc_array() with the GFP_KERNEL flag,
so the incorrect GFP_NOIO flag no longer exists.

Fixes: b330e6a49dc3 ("md: convert to kvmalloc")
Suggested-by: Michal Hocko
Signed-off-by: Coly Li
Signed-off-by: Song Liu
---
 drivers/md/raid5.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index ba00e9877f02..190dd70db514 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2228,14 +2228,19 @@ static int grow_stripes(struct r5conf *conf, int num)
  * of the P and Q blocks.
  */
 static int scribble_alloc(struct raid5_percpu *percpu,
-			  int num, int cnt, gfp_t flags)
+			  int num, int cnt)
 {
 	size_t obj_size =
 		sizeof(struct page *) * (num+2) +
 		sizeof(addr_conv_t) * (num+2);
 	void *scribble;
 
-	scribble = kvmalloc_array(cnt, obj_size, flags);
+	/*
+	 * If this is called from the raid array suspend context, it is
+	 * in the memalloc noio context as well, so there is no potential
+	 * for recursive memory reclaim I/Os with the GFP_KERNEL flag.
+	 */
+	scribble = kvmalloc_array(cnt, obj_size, GFP_KERNEL);
 	if (!scribble)
 		return -ENOMEM;
 
@@ -2267,8 +2272,7 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
 		percpu = per_cpu_ptr(conf->percpu, cpu);
 		err = scribble_alloc(percpu, new_disks,
-				     new_sectors / STRIPE_SECTORS,
-				     GFP_NOIO);
+				     new_sectors / STRIPE_SECTORS);
 		if (err)
 			break;
 	}
@@ -6759,8 +6763,7 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu
 			     conf->previous_raid_disks),
 		    max(conf->chunk_sectors,
 			conf->prev_chunk_sectors)
-		   / STRIPE_SECTORS,
-		   GFP_KERNEL)) {
+		   / STRIPE_SECTORS)) {
 		free_scratch_buffer(conf, percpu);
 		return -ENOMEM;
 	}

From 7f8a30e5d253793d660f30b7475d7bb41efd40b8 Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Thu, 9 Apr 2020 22:17:22 +0800
Subject: [PATCH 08/13] raid5: update code comment of scribble_alloc()

The code comment of scribble_alloc() has been outdated for a while.
This patch updates the comments in the function header for the new
parameter list.

Suggested-by: Song Liu
Signed-off-by: Coly Li
Signed-off-by: Song Liu
---
 drivers/md/raid5.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 190dd70db514..ab8067f9ce8c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2215,10 +2215,13 @@ static int grow_stripes(struct r5conf *conf, int num)
 }
 
 /**
- * scribble_len - return the required size of the scribble region
+ * scribble_alloc - allocate percpu scribble buffer for required size
+ *		    of the scribble region
+ * @percpu - from for_each_present_cpu() of the caller
  * @num - total number of disks in the array
+ * @cnt - scribble objs count for required size of the scribble region
  *
- * The size must be enough to contain:
+ * The scribble buffer size must be enough to contain:
  * 1/ a struct page pointer for each device in the array +2
  * 2/ room to convert each entry in (1) to its corresponding dma
  *    (dma_map_page()) or page (page_address()) address.

From 3024ba2d6c5573b4797e62006d98f17f47f7d103 Mon Sep 17 00:00:00 2001
From: Coly Li
Date: Thu, 9 Apr 2020 22:17:23 +0800
Subject: [PATCH 09/13] md: remove redundant memalloc scope API usage

In mddev_create_serial_pool(), the memalloc scope APIs
memalloc_noio_save() and memalloc_noio_restore() are used around the
memory allocation done by mempool_create_kmalloc_pool(). After adding
the memalloc scope APIs to the raid array suspend context, it is no
longer necessary to call them explicitly around
mempool_create_kmalloc_pool().

This patch removes the redundant memalloc scope APIs from
mddev_create_serial_pool().
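The removal is safe partly because the scope APIs nest: an inner
save/restore pair inside an outer scope is merely redundant, not
harmful. A minimal sketch of that nesting, with illustrative names
only:

    #include <linux/sched/mm.h>

    static void nested_noio_scope_example(void)
    {
            unsigned int outer, inner;

            outer = memalloc_noio_save();   /* e.g. taken in mddev_suspend()  */

            inner = memalloc_noio_save();   /* redundant: PF_MEMALLOC_NOIO is */
                                            /* already set on this task       */
            memalloc_noio_restore(inner);   /* restores the still-set flag    */

            memalloc_noio_restore(outer);   /* e.g. dropped in mddev_resume() */
    }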
Signed-off-by: Coly Li
Cc: Guoqing Jiang
Signed-off-by: Song Liu
---
 drivers/md/md.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 1b3316c79b7b..7b2ac5f23260 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -228,13 +228,13 @@ void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
 		goto abort;
 
 	if (mddev->serial_info_pool == NULL) {
-		unsigned int noio_flag;
-
-		noio_flag = memalloc_noio_save();
+		/*
+		 * already in the memalloc noio context via
+		 * mddev_suspend()
+		 */
 		mddev->serial_info_pool =
 			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
 						sizeof(struct serial_info));
-		memalloc_noio_restore(noio_flag);
 		if (!mddev->serial_info_pool) {
 			rdevs_uninit_serial(mddev);
 			pr_err("can't alloc memory pool for serialization\n");

From c91114c2b89d5bea132d34b40568053f8bfa963d Mon Sep 17 00:00:00 2001
From: David Jeffery
Date: Mon, 27 Jan 2020 10:26:19 -0500
Subject: [PATCH 10/13] md/raid1: release pending accounting for an I/O only
 after write-behind is also finished

When using RAID1 and write-behind, md can deadlock when errors occur.
With write-behind, r1bio structs can be accounted by raid1 as queued
but not counted as pending. The pending count is dropped when the
original bio is returned complete, but write-behind for the r1bio may
still be active.

This breaks the accounting used in some conditions to know when the
raid1 md device has reached an idle state. It can result in calls to
freeze_array deadlocking: freeze_array will never complete because a
negative "unqueued" value is calculated when the queued count is larger
than the pending count.

To properly account for write-behind, move the call to allow_barrier
from call_bio_endio to raid_end_bio_io. When using write-behind, md can
call call_bio_endio before all write-behind I/O is complete. Using
raid_end_bio_io as the point to call allow_barrier releases the pending
count at a point where all I/O for an r1bio, even write-behind, is
done.

Signed-off-by: David Jeffery
Signed-off-by: Song Liu
---
 drivers/md/raid1.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cd810e195086..dcd27f3da84e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -296,22 +296,17 @@ static void reschedule_retry(struct r1bio *r1_bio)
 static void call_bio_endio(struct r1bio *r1_bio)
 {
 	struct bio *bio = r1_bio->master_bio;
-	struct r1conf *conf = r1_bio->mddev->private;
 
 	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
 		bio->bi_status = BLK_STS_IOERR;
 
 	bio_endio(bio);
-	/*
-	 * Wake up any possible resync thread that waits for the device
-	 * to go idle.
-	 */
-	allow_barrier(conf, r1_bio->sector);
 }
 
 static void raid_end_bio_io(struct r1bio *r1_bio)
 {
 	struct bio *bio = r1_bio->master_bio;
+	struct r1conf *conf = r1_bio->mddev->private;
 
 	/* if nobody has done the final endio yet, do it now */
 	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
@@ -322,6 +317,12 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
 		call_bio_endio(r1_bio);
 	}
+	/*
+	 * Wake up any possible resync thread that waits for the device
+	 * to go idle. All I/Os, even write-behind writes, are done.
+	 */
+	allow_barrier(conf, r1_bio->sector);
+
 	free_r1bio(r1_bio);
 }

From e4fc5a74293fbe8a1b8598b8a3c53592a5d70398 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Fri, 8 May 2020 18:15:14 +0200
Subject: [PATCH 11/13] md: stop using ->queuedata

Pointer to mddev is already available in private_data.
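In other words, the lookup moves from the request queue to the gendisk;
a side-by-side sketch follows (illustrative helpers, not the full
md_make_request(); assumes struct mddev from drivers/md/md.h is
visible):

    #include <linux/bio.h>
    #include <linux/blkdev.h>

    /* Before: the mddev was recovered through the request queue's
     * driver cookie, which md_alloc() had to set up explicitly with
     * mddev->queue->queuedata = mddev. */
    static struct mddev *mddev_from_queue(struct request_queue *q)
    {
            return q->queuedata;
    }

    /* After: the mddev comes from the gendisk, where md_alloc()
     * already stores it as disk->private_data, so the extra queuedata
     * assignment can be dropped. */
    static struct mddev *mddev_from_bio(struct bio *bio)
    {
            return bio->bi_disk->private_data;
    }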
Signed-off-by: Christoph Hellwig
Signed-off-by: Song Liu
---
 drivers/md/md.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7b2ac5f23260..79e36cb4e8b0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -467,7 +467,7 @@ static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
 {
 	const int rw = bio_data_dir(bio);
 	const int sgrp = op_stat_group(bio_op(bio));
-	struct mddev *mddev = q->queuedata;
+	struct mddev *mddev = bio->bi_disk->private_data;
 	unsigned int sectors;
 
 	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
@@ -5644,7 +5644,6 @@ static int md_alloc(dev_t dev, char *name)
 	mddev->queue = blk_alloc_queue(md_make_request, NUMA_NO_NODE);
 	if (!mddev->queue)
 		goto abort;
-	mddev->queue->queuedata = mddev;
 
 	blk_set_stacking_limits(&mddev->queue->limits);

From 3f99980c8f70bc56584450c0a69973eef7b65913 Mon Sep 17 00:00:00 2001
From: Xiongfeng Wang
Date: Mon, 11 May 2020 16:23:25 +0800
Subject: [PATCH 12/13] md: add a newline when printing parameter 'start_ro'
 by sysfs

Add a missing newline when printing the module parameter 'start_ro'
via sysfs.

Signed-off-by: Xiongfeng Wang
Signed-off-by: Song Liu
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 79e36cb4e8b0..f567f536b529 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -9810,7 +9810,7 @@ module_exit(md_exit)
 
 static int get_ro(char *buffer, const struct kernel_param *kp)
 {
-	return sprintf(buffer, "%d", start_readonly);
+	return sprintf(buffer, "%d\n", start_readonly);
 }
 
 static int set_ro(const char *val, const struct kernel_param *kp)
 {

From 358369f03ac94637c9fd9d8f94a2dfde86b9f25f Mon Sep 17 00:00:00 2001
From: "Gustavo A. R. Silva"
Date: Thu, 7 May 2020 14:22:10 -0500
Subject: [PATCH 13/13] md/raid1: Replace zero-length array with flexible-array

The current codebase makes use of the zero-length array language
extension to the C90 standard, but the preferred mechanism to declare
variable-length types such as these is a flexible array member[1][2],
introduced in C99:

struct foo {
        int stuff;
        struct boo array[];
};

By making use of the mechanism above, we will get a compiler warning in
case the flexible array does not occur last in the structure, which
will help us prevent some kinds of undefined-behavior bugs from being
inadvertently introduced[3] to the codebase from now on.

Also, notice that dynamic memory allocations won't be affected by this
change:

"Flexible array members have incomplete type, and so the sizeof
operator may not be applied. As a quirk of the original implementation
of zero-length arrays, sizeof evaluates to zero."[1]

sizeof(flexible-array-member) triggers a warning because flexible array
members have incomplete type[1]. There are some instances of code in
which the sizeof operator is being incorrectly/erroneously applied to
zero-length arrays, and the result is zero. Such instances may be
hiding some bugs. So this work (flexible-array member conversions) will
also help to get completely rid of those sorts of issues.

This issue was found with the help of Coccinelle.

[1] https://gcc.gnu.org/onlinedocs/gcc/Zero-Length.html
[2] https://github.com/KSPP/linux/issues/21
[3] commit 76497732932f ("cxgb3/l2t: Fix undefined behaviour")

Signed-off-by: Gustavo A. R. Silva
Signed-off-by: Song Liu
---
 drivers/md/md-linear.h | 2 +-
 drivers/md/raid1.h     | 2 +-
 drivers/md/raid10.h    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md-linear.h b/drivers/md/md-linear.h
index 8381d651d4ed..24e97db50ebb 100644
--- a/drivers/md/md-linear.h
+++ b/drivers/md/md-linear.h
@@ -12,6 +12,6 @@ struct linear_conf
 	struct rcu_head		rcu;
 	sector_t		array_sectors;
 	int			raid_disks; /* a copy of mddev->raid_disks */
-	struct dev_info		disks[0];
+	struct dev_info		disks[];
 };
 #endif
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index e7ccad898736..b7eb09e8c025 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -180,7 +180,7 @@ struct r1bio {
 	 * if the IO is in WRITE direction, then multiple bios are used.
 	 * We choose the number when they are allocated.
 	 */
-	struct bio		*bios[0];
+	struct bio		*bios[];
 	/* DO NOT PUT ANY NEW FIELDS HERE - bios array is contiguously alloced*/
 };
 
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index d3eaaf3eb1bc..79cd2b7d3128 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -153,7 +153,7 @@ struct r10bio {
 	};
 	sector_t	addr;
 	int		devnum;
-	} devs[0];
+	} devs[];
 };
 
 /* bits for r10bio.state */
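As a usage note, allocating a structure that ends in a flexible array
member is idiomatically done with struct_size() from linux/overflow.h.
A minimal sketch with an illustrative struct (not the md code):

    #include <linux/bio.h>
    #include <linux/overflow.h>
    #include <linux/slab.h>

    struct fam_example {
            int             cnt;
            struct bio      *bios[];        /* flexible array member, must be last */
    };

    /*
     * struct_size(ex, bios, n) computes sizeof(*ex) + n * sizeof(ex->bios[0])
     * with overflow checking, which is the idiomatic way to size an
     * allocation for a struct ending in a flexible array member.
     */
    static struct fam_example *fam_alloc(int n)
    {
            struct fam_example *ex;

            ex = kzalloc(struct_size(ex, bios, n), GFP_KERNEL);
            if (ex)
                    ex->cnt = n;
            return ex;
    }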