From e3e39c72b99f93bbd0420d38c858e7c4a061bb63 Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Fri, 21 Aug 2020 11:54:44 -0300 Subject: [PATCH 1/7] btrfs: block-group: fix free-space bitmap threshold [BUG] After commit 9afc66498a0b ("btrfs: block-group: refactor how we read one block group item"), cache->length is being assigned after calling btrfs_create_block_group_cache. This causes a problem since set_free_space_tree_thresholds calculates the free-space threshold to decide if the free-space tree should convert from extents to bitmaps. The current code calls set_free_space_tree_thresholds with cache->length being 0, which then makes cache->bitmap_high_thresh zero. This implies the system will always use bitmap instead of extents, which is not desired if the block group is not fragmented. This behavior can be seen by a test that expects to repair systems with FREE_SPACE_EXTENT and FREE_SPACE_BITMAP, but the current code only created FREE_SPACE_BITMAP. [FIX] Call set_free_space_tree_thresholds after setting cache->length. There is now a WARN_ON in set_free_space_tree_thresholds to help preventing the same mistake to happen again in the future. Link: https://github.com/kdave/btrfs-progs/issues/251 Fixes: 9afc66498a0b ("btrfs: block-group: refactor how we read one block group item") CC: stable@vger.kernel.org # 5.8+ Reviewed-by: Qu Wenruo Reviewed-by: Filipe Manana Signed-off-by: Marcos Paulo de Souza Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 4 +++- fs/btrfs/free-space-tree.c | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 613920c17ac1..ea8aaf36647e 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1798,7 +1798,6 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( cache->fs_info = fs_info; cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); - set_free_space_tree_thresholds(cache); cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED; @@ -1912,6 +1911,8 @@ static int read_one_block_group(struct btrfs_fs_info *info, if (ret < 0) goto error; + set_free_space_tree_thresholds(cache); + if (need_clear) { /* * When we mount with old space cache, we need to @@ -2132,6 +2133,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, return -ENOMEM; cache->length = size; + set_free_space_tree_thresholds(cache); cache->used = bytes_used; cache->flags = type; cache->last_byte_to_unpin = (u64)-1; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 8b1f5c8897b7..6b9faf3b0e96 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -22,6 +22,10 @@ void set_free_space_tree_thresholds(struct btrfs_block_group *cache) size_t bitmap_size; u64 num_bitmaps, total_bitmap_size; + if (WARN_ON(cache->length == 0)) + btrfs_warn(cache->fs_info, "block group %llu length is zero", + cache->start); + /* * We convert to bitmaps when the disk space required for using extents * exceeds that required for using bitmaps. From 9771a5cf937129307d9f58922d60484d58ababe7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 10 Aug 2020 11:42:26 -0400 Subject: [PATCH 2/7] btrfs: drop path before adding new uuid tree entry With the conversion of the tree locks to rwsem I got the following lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 5.8.0-rc7-00167-g0d7ba0c5b375-dirty #925 Not tainted ------------------------------------------------------ btrfs-uuid/7955 is trying to acquire lock: ffff88bfbafec0f8 (btrfs-root-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180 but task is already holding lock: ffff88bfbafef2a8 (btrfs-uuid-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (btrfs-uuid-00){++++}-{3:3}: down_read_nested+0x3e/0x140 __btrfs_tree_read_lock+0x39/0x180 __btrfs_read_lock_root_node+0x3a/0x50 btrfs_search_slot+0x4bd/0x990 btrfs_uuid_tree_add+0x89/0x2d0 btrfs_uuid_scan_kthread+0x330/0x390 kthread+0x133/0x150 ret_from_fork+0x1f/0x30 -> #0 (btrfs-root-00){++++}-{3:3}: __lock_acquire+0x1272/0x2310 lock_acquire+0x9e/0x360 down_read_nested+0x3e/0x140 __btrfs_tree_read_lock+0x39/0x180 __btrfs_read_lock_root_node+0x3a/0x50 btrfs_search_slot+0x4bd/0x990 btrfs_find_root+0x45/0x1b0 btrfs_read_tree_root+0x61/0x100 btrfs_get_root_ref.part.50+0x143/0x630 btrfs_uuid_tree_iterate+0x207/0x314 btrfs_uuid_rescan_kthread+0x12/0x50 kthread+0x133/0x150 ret_from_fork+0x1f/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-uuid-00); lock(btrfs-root-00); lock(btrfs-uuid-00); lock(btrfs-root-00); *** DEADLOCK *** 1 lock held by btrfs-uuid/7955: #0: ffff88bfbafef2a8 (btrfs-uuid-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180 stack backtrace: CPU: 73 PID: 7955 Comm: btrfs-uuid Kdump: loaded Not tainted 5.8.0-rc7-00167-g0d7ba0c5b375-dirty #925 Hardware name: Quanta Tioga Pass Single Side 01-0030993006/Tioga Pass Single Side, BIOS F08_3A18 12/20/2018 Call Trace: dump_stack+0x78/0xa0 check_noncircular+0x165/0x180 __lock_acquire+0x1272/0x2310 lock_acquire+0x9e/0x360 ? __btrfs_tree_read_lock+0x39/0x180 ? btrfs_root_node+0x1c/0x1d0 down_read_nested+0x3e/0x140 ? __btrfs_tree_read_lock+0x39/0x180 __btrfs_tree_read_lock+0x39/0x180 __btrfs_read_lock_root_node+0x3a/0x50 btrfs_search_slot+0x4bd/0x990 btrfs_find_root+0x45/0x1b0 btrfs_read_tree_root+0x61/0x100 btrfs_get_root_ref.part.50+0x143/0x630 btrfs_uuid_tree_iterate+0x207/0x314 ? btree_readpage+0x20/0x20 btrfs_uuid_rescan_kthread+0x12/0x50 kthread+0x133/0x150 ? kthread_create_on_node+0x60/0x60 ret_from_fork+0x1f/0x30 This problem exists because we have two different rescan threads, btrfs_uuid_scan_kthread which creates the uuid tree, and btrfs_uuid_tree_iterate that goes through and updates or deletes any out of date roots. The problem is they both do things in different order. btrfs_uuid_scan_kthread() reads the tree_root, and then inserts entries into the uuid_root. btrfs_uuid_tree_iterate() scans the uuid_root, but then does a btrfs_get_fs_root() which can read from the tree_root. It's actually easy enough to not be holding the path in btrfs_uuid_scan_kthread() when we add a uuid entry, as we already drop it further down and re-start the search when we loop. So simply move the path release before we add our entry to the uuid tree. This also fixes a problem where we're holding a path open after we do btrfs_end_transaction(), which has it's own problems. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ee96c5869f57..214856c4ccb1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4462,6 +4462,7 @@ int btrfs_uuid_scan_kthread(void *data) goto skip; } update_tree: + btrfs_release_path(path); if (!btrfs_is_empty_uuid(root_item.uuid)) { ret = btrfs_uuid_tree_add(trans, root_item.uuid, BTRFS_UUID_KEY_SUBVOL, @@ -4486,6 +4487,7 @@ update_tree: } skip: + btrfs_release_path(path); if (trans) { ret = btrfs_end_transaction(trans); trans = NULL; @@ -4493,7 +4495,6 @@ skip: break; } - btrfs_release_path(path); if (key.offset < (u64)-1) { key.offset++; } else if (key.type < BTRFS_ROOT_ITEM_KEY) { From a48b73eca4ceb9b8a4b97f290a065335dbcd8a04 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 10 Aug 2020 11:42:27 -0400 Subject: [PATCH 3/7] btrfs: fix potential deadlock in the search ioctl With the conversion of the tree locks to rwsem I got the following lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 5.8.0-rc7-00165-g04ec4da5f45f-dirty #922 Not tainted ------------------------------------------------------ compsize/11122 is trying to acquire lock: ffff889fabca8768 (&mm->mmap_lock#2){++++}-{3:3}, at: __might_fault+0x3e/0x90 but task is already holding lock: ffff889fe720fe40 (btrfs-fs-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (btrfs-fs-00){++++}-{3:3}: down_write_nested+0x3b/0x70 __btrfs_tree_lock+0x24/0x120 btrfs_search_slot+0x756/0x990 btrfs_lookup_inode+0x3a/0xb4 __btrfs_update_delayed_inode+0x93/0x270 btrfs_async_run_delayed_root+0x168/0x230 btrfs_work_helper+0xd4/0x570 process_one_work+0x2ad/0x5f0 worker_thread+0x3a/0x3d0 kthread+0x133/0x150 ret_from_fork+0x1f/0x30 -> #1 (&delayed_node->mutex){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 btrfs_delayed_update_inode+0x50/0x440 btrfs_update_inode+0x8a/0xf0 btrfs_dirty_inode+0x5b/0xd0 touch_atime+0xa1/0xd0 btrfs_file_mmap+0x3f/0x60 mmap_region+0x3a4/0x640 do_mmap+0x376/0x580 vm_mmap_pgoff+0xd5/0x120 ksys_mmap_pgoff+0x193/0x230 do_syscall_64+0x50/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&mm->mmap_lock#2){++++}-{3:3}: __lock_acquire+0x1272/0x2310 lock_acquire+0x9e/0x360 __might_fault+0x68/0x90 _copy_to_user+0x1e/0x80 copy_to_sk.isra.32+0x121/0x300 search_ioctl+0x106/0x200 btrfs_ioctl_tree_search_v2+0x7b/0xf0 btrfs_ioctl+0x106f/0x30a0 ksys_ioctl+0x83/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x50/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 other info that might help us debug this: Chain exists of: &mm->mmap_lock#2 --> &delayed_node->mutex --> btrfs-fs-00 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(btrfs-fs-00); lock(&delayed_node->mutex); lock(btrfs-fs-00); lock(&mm->mmap_lock#2); *** DEADLOCK *** 1 lock held by compsize/11122: #0: ffff889fe720fe40 (btrfs-fs-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x39/0x180 stack backtrace: CPU: 17 PID: 11122 Comm: compsize Kdump: loaded Not tainted 5.8.0-rc7-00165-g04ec4da5f45f-dirty #922 Hardware name: Quanta Tioga Pass Single Side 01-0030993006/Tioga Pass Single Side, BIOS F08_3A18 12/20/2018 Call Trace: dump_stack+0x78/0xa0 check_noncircular+0x165/0x180 __lock_acquire+0x1272/0x2310 lock_acquire+0x9e/0x360 ? __might_fault+0x3e/0x90 ? find_held_lock+0x72/0x90 __might_fault+0x68/0x90 ? __might_fault+0x3e/0x90 _copy_to_user+0x1e/0x80 copy_to_sk.isra.32+0x121/0x300 ? btrfs_search_forward+0x2a6/0x360 search_ioctl+0x106/0x200 btrfs_ioctl_tree_search_v2+0x7b/0xf0 btrfs_ioctl+0x106f/0x30a0 ? __do_sys_newfstat+0x5a/0x70 ? ksys_ioctl+0x83/0xc0 ksys_ioctl+0x83/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x50/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The problem is we're doing a copy_to_user() while holding tree locks, which can deadlock if we have to do a page fault for the copy_to_user(). This exists even without my locking changes, so it needs to be fixed. Rework the search ioctl to do the pre-fault and then copy_to_user_nofault for the copying. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 ++++---- fs/btrfs/extent_io.h | 6 +++--- fs/btrfs/ioctl.c | 27 ++++++++++++++++++++------- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 617ea38e6fd7..c15ab6c1897f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5653,9 +5653,9 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, } } -int read_extent_buffer_to_user(const struct extent_buffer *eb, - void __user *dstv, - unsigned long start, unsigned long len) +int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, + void __user *dstv, + unsigned long start, unsigned long len) { size_t cur; size_t offset; @@ -5675,7 +5675,7 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb, cur = min(len, (PAGE_SIZE - offset)); kaddr = page_address(page); - if (copy_to_user(dst, kaddr + offset, cur)) { + if (copy_to_user_nofault(dst, kaddr + offset, cur)) { ret = -EFAULT; break; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 00a88f2eb5ab..30794ae58498 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -241,9 +241,9 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, void read_extent_buffer(const struct extent_buffer *eb, void *dst, unsigned long start, unsigned long len); -int read_extent_buffer_to_user(const struct extent_buffer *eb, - void __user *dst, unsigned long start, - unsigned long len); +int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, + void __user *dst, unsigned long start, + unsigned long len); void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *src); void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, const void *src); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bd3511c5ca81..ac45f022b495 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2086,9 +2086,14 @@ static noinline int copy_to_sk(struct btrfs_path *path, sh.len = item_len; sh.transid = found_transid; - /* copy search result header */ - if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) { - ret = -EFAULT; + /* + * Copy search result header. If we fault then loop again so we + * can fault in the pages and -EFAULT there if there's a + * problem. Otherwise we'll fault and then copy the buffer in + * properly this next time through + */ + if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) { + ret = 0; goto out; } @@ -2096,10 +2101,14 @@ static noinline int copy_to_sk(struct btrfs_path *path, if (item_len) { char __user *up = ubuf + *sk_offset; - /* copy the item */ - if (read_extent_buffer_to_user(leaf, up, - item_off, item_len)) { - ret = -EFAULT; + /* + * Copy the item, same behavior as above, but reset the + * * sk_offset so we copy the full thing again. + */ + if (read_extent_buffer_to_user_nofault(leaf, up, + item_off, item_len)) { + ret = 0; + *sk_offset -= sizeof(sh); goto out; } @@ -2184,6 +2193,10 @@ static noinline int search_ioctl(struct inode *inode, key.offset = sk->min_offset; while (1) { + ret = fault_in_pages_writeable(ubuf, *buf_size - sk_offset); + if (ret) + break; + ret = btrfs_search_forward(root, &key, path, sk->min_transid); if (ret != 0) { if (ret > 0) From e89c4a9c8e6ce3a84cab4f342687d3fbbb1234eb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 10 Aug 2020 11:42:29 -0400 Subject: [PATCH 4/7] btrfs: allocate scrub workqueues outside of locks I got the following lockdep splat while testing: ====================================================== WARNING: possible circular locking dependency detected 5.8.0-rc7-00172-g021118712e59 #932 Not tainted ------------------------------------------------------ btrfs/229626 is trying to acquire lock: ffffffff828513f0 (cpu_hotplug_lock){++++}-{0:0}, at: alloc_workqueue+0x378/0x450 but task is already holding lock: ffff889dd3889518 (&fs_info->scrub_lock){+.+.}-{3:3}, at: btrfs_scrub_dev+0x11c/0x630 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #7 (&fs_info->scrub_lock){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 btrfs_scrub_dev+0x11c/0x630 btrfs_dev_replace_by_ioctl.cold.21+0x10a/0x1d4 btrfs_ioctl+0x2799/0x30a0 ksys_ioctl+0x83/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x50/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #6 (&fs_devs->device_list_mutex){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 btrfs_run_dev_stats+0x49/0x480 commit_cowonly_roots+0xb5/0x2a0 btrfs_commit_transaction+0x516/0xa60 sync_filesystem+0x6b/0x90 generic_shutdown_super+0x22/0x100 kill_anon_super+0xe/0x30 btrfs_kill_super+0x12/0x20 deactivate_locked_super+0x29/0x60 cleanup_mnt+0xb8/0x140 task_work_run+0x6d/0xb0 __prepare_exit_to_usermode+0x1cc/0x1e0 do_syscall_64+0x5c/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #5 (&fs_info->tree_log_mutex){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 btrfs_commit_transaction+0x4bb/0xa60 sync_filesystem+0x6b/0x90 generic_shutdown_super+0x22/0x100 kill_anon_super+0xe/0x30 btrfs_kill_super+0x12/0x20 deactivate_locked_super+0x29/0x60 cleanup_mnt+0xb8/0x140 task_work_run+0x6d/0xb0 __prepare_exit_to_usermode+0x1cc/0x1e0 do_syscall_64+0x5c/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #4 (&fs_info->reloc_mutex){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 btrfs_record_root_in_trans+0x43/0x70 start_transaction+0xd1/0x5d0 btrfs_dirty_inode+0x42/0xd0 touch_atime+0xa1/0xd0 btrfs_file_mmap+0x3f/0x60 mmap_region+0x3a4/0x640 do_mmap+0x376/0x580 vm_mmap_pgoff+0xd5/0x120 ksys_mmap_pgoff+0x193/0x230 do_syscall_64+0x50/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #3 (&mm->mmap_lock#2){++++}-{3:3}: __might_fault+0x68/0x90 _copy_to_user+0x1e/0x80 perf_read+0x141/0x2c0 vfs_read+0xad/0x1b0 ksys_read+0x5f/0xe0 do_syscall_64+0x50/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #2 (&cpuctx_mutex){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 perf_event_init_cpu+0x88/0x150 perf_event_init+0x1db/0x20b start_kernel+0x3ae/0x53c secondary_startup_64+0xa4/0xb0 -> #1 (pmus_lock){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 perf_event_init_cpu+0x4f/0x150 cpuhp_invoke_callback+0xb1/0x900 _cpu_up.constprop.26+0x9f/0x130 cpu_up+0x7b/0xc0 bringup_nonboot_cpus+0x4f/0x60 smp_init+0x26/0x71 kernel_init_freeable+0x110/0x258 kernel_init+0xa/0x103 ret_from_fork+0x1f/0x30 -> #0 (cpu_hotplug_lock){++++}-{0:0}: __lock_acquire+0x1272/0x2310 lock_acquire+0x9e/0x360 cpus_read_lock+0x39/0xb0 alloc_workqueue+0x378/0x450 __btrfs_alloc_workqueue+0x15d/0x200 btrfs_alloc_workqueue+0x51/0x160 scrub_workers_get+0x5a/0x170 btrfs_scrub_dev+0x18c/0x630 btrfs_dev_replace_by_ioctl.cold.21+0x10a/0x1d4 btrfs_ioctl+0x2799/0x30a0 ksys_ioctl+0x83/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x50/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 other info that might help us debug this: Chain exists of: cpu_hotplug_lock --> &fs_devs->device_list_mutex --> &fs_info->scrub_lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&fs_info->scrub_lock); lock(&fs_devs->device_list_mutex); lock(&fs_info->scrub_lock); lock(cpu_hotplug_lock); *** DEADLOCK *** 2 locks held by btrfs/229626: #0: ffff88bfe8bb86e0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: btrfs_scrub_dev+0xbd/0x630 #1: ffff889dd3889518 (&fs_info->scrub_lock){+.+.}-{3:3}, at: btrfs_scrub_dev+0x11c/0x630 stack backtrace: CPU: 15 PID: 229626 Comm: btrfs Kdump: loaded Not tainted 5.8.0-rc7-00172-g021118712e59 #932 Hardware name: Quanta Tioga Pass Single Side 01-0030993006/Tioga Pass Single Side, BIOS F08_3A18 12/20/2018 Call Trace: dump_stack+0x78/0xa0 check_noncircular+0x165/0x180 __lock_acquire+0x1272/0x2310 lock_acquire+0x9e/0x360 ? alloc_workqueue+0x378/0x450 cpus_read_lock+0x39/0xb0 ? alloc_workqueue+0x378/0x450 alloc_workqueue+0x378/0x450 ? rcu_read_lock_sched_held+0x52/0x80 __btrfs_alloc_workqueue+0x15d/0x200 btrfs_alloc_workqueue+0x51/0x160 scrub_workers_get+0x5a/0x170 btrfs_scrub_dev+0x18c/0x630 ? start_transaction+0xd1/0x5d0 btrfs_dev_replace_by_ioctl.cold.21+0x10a/0x1d4 btrfs_ioctl+0x2799/0x30a0 ? do_sigaction+0x102/0x250 ? lockdep_hardirqs_on_prepare+0xca/0x160 ? _raw_spin_unlock_irq+0x24/0x30 ? trace_hardirqs_on+0x1c/0xe0 ? _raw_spin_unlock_irq+0x24/0x30 ? do_sigaction+0x102/0x250 ? ksys_ioctl+0x83/0xc0 ksys_ioctl+0x83/0xc0 __x64_sys_ioctl+0x16/0x20 do_syscall_64+0x50/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This happens because we're allocating the scrub workqueues under the scrub and device list mutex, which brings in a whole host of other dependencies. Because the work queue allocation is done with GFP_KERNEL, it can trigger reclaim, which can lead to a transaction commit, which in turns needs the device_list_mutex, it can lead to a deadlock. A different problem for which this fix is a solution. Fix this by moving the actual allocation outside of the scrub lock, and then only take the lock once we're ready to actually assign them to the fs_info. We'll now have to cleanup the workqueues in a few more places, so I've added a helper to do the refcount dance to safely free the workqueues. CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 126 +++++++++++++++++++++++++++-------------------- 1 file changed, 72 insertions(+), 54 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 5a6cb9db512e..354ab9985a34 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3716,50 +3716,84 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, return 0; } +static void scrub_workers_put(struct btrfs_fs_info *fs_info) +{ + if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, + &fs_info->scrub_lock)) { + struct btrfs_workqueue *scrub_workers = NULL; + struct btrfs_workqueue *scrub_wr_comp = NULL; + struct btrfs_workqueue *scrub_parity = NULL; + + scrub_workers = fs_info->scrub_workers; + scrub_wr_comp = fs_info->scrub_wr_completion_workers; + scrub_parity = fs_info->scrub_parity_workers; + + fs_info->scrub_workers = NULL; + fs_info->scrub_wr_completion_workers = NULL; + fs_info->scrub_parity_workers = NULL; + mutex_unlock(&fs_info->scrub_lock); + + btrfs_destroy_workqueue(scrub_workers); + btrfs_destroy_workqueue(scrub_wr_comp); + btrfs_destroy_workqueue(scrub_parity); + } +} + /* * get a reference count on fs_info->scrub_workers. start worker if necessary */ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { + struct btrfs_workqueue *scrub_workers = NULL; + struct btrfs_workqueue *scrub_wr_comp = NULL; + struct btrfs_workqueue *scrub_parity = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; + int ret = -ENOMEM; - lockdep_assert_held(&fs_info->scrub_lock); + if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) + return 0; + scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags, + is_dev_replace ? 1 : max_active, 4); + if (!scrub_workers) + goto fail_scrub_workers; + + scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, + max_active, 2); + if (!scrub_wr_comp) + goto fail_scrub_wr_completion_workers; + + scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags, + max_active, 2); + if (!scrub_parity) + goto fail_scrub_parity_workers; + + mutex_lock(&fs_info->scrub_lock); if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) { - ASSERT(fs_info->scrub_workers == NULL); - fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", - flags, is_dev_replace ? 1 : max_active, 4); - if (!fs_info->scrub_workers) - goto fail_scrub_workers; - - ASSERT(fs_info->scrub_wr_completion_workers == NULL); - fs_info->scrub_wr_completion_workers = - btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, - max_active, 2); - if (!fs_info->scrub_wr_completion_workers) - goto fail_scrub_wr_completion_workers; - - ASSERT(fs_info->scrub_parity_workers == NULL); - fs_info->scrub_parity_workers = - btrfs_alloc_workqueue(fs_info, "scrubparity", flags, - max_active, 2); - if (!fs_info->scrub_parity_workers) - goto fail_scrub_parity_workers; - + ASSERT(fs_info->scrub_workers == NULL && + fs_info->scrub_wr_completion_workers == NULL && + fs_info->scrub_parity_workers == NULL); + fs_info->scrub_workers = scrub_workers; + fs_info->scrub_wr_completion_workers = scrub_wr_comp; + fs_info->scrub_parity_workers = scrub_parity; refcount_set(&fs_info->scrub_workers_refcnt, 1); - } else { - refcount_inc(&fs_info->scrub_workers_refcnt); + mutex_unlock(&fs_info->scrub_lock); + return 0; } - return 0; + /* Other thread raced in and created the workers for us */ + refcount_inc(&fs_info->scrub_workers_refcnt); + mutex_unlock(&fs_info->scrub_lock); + ret = 0; + btrfs_destroy_workqueue(scrub_parity); fail_scrub_parity_workers: - btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); + btrfs_destroy_workqueue(scrub_wr_comp); fail_scrub_wr_completion_workers: - btrfs_destroy_workqueue(fs_info->scrub_workers); + btrfs_destroy_workqueue(scrub_workers); fail_scrub_workers: - return -ENOMEM; + return ret; } int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, @@ -3770,9 +3804,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, int ret; struct btrfs_device *dev; unsigned int nofs_flag; - struct btrfs_workqueue *scrub_workers = NULL; - struct btrfs_workqueue *scrub_wr_comp = NULL; - struct btrfs_workqueue *scrub_parity = NULL; if (btrfs_fs_closing(fs_info)) return -EAGAIN; @@ -3819,13 +3850,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (IS_ERR(sctx)) return PTR_ERR(sctx); + ret = scrub_workers_get(fs_info, is_dev_replace); + if (ret) + goto out_free_ctx; + mutex_lock(&fs_info->fs_devices->device_list_mutex); dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true); if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) && !is_dev_replace)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); ret = -ENODEV; - goto out_free_ctx; + goto out; } if (!is_dev_replace && !readonly && @@ -3834,7 +3869,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable", rcu_str_deref(dev->name)); ret = -EROFS; - goto out_free_ctx; + goto out; } mutex_lock(&fs_info->scrub_lock); @@ -3843,7 +3878,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); ret = -EIO; - goto out_free_ctx; + goto out; } down_read(&fs_info->dev_replace.rwsem); @@ -3854,17 +3889,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->scrub_lock); mutex_unlock(&fs_info->fs_devices->device_list_mutex); ret = -EINPROGRESS; - goto out_free_ctx; + goto out; } up_read(&fs_info->dev_replace.rwsem); - ret = scrub_workers_get(fs_info, is_dev_replace); - if (ret) { - mutex_unlock(&fs_info->scrub_lock); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); - goto out_free_ctx; - } - sctx->readonly = readonly; dev->scrub_ctx = sctx; mutex_unlock(&fs_info->fs_devices->device_list_mutex); @@ -3917,24 +3945,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_lock(&fs_info->scrub_lock); dev->scrub_ctx = NULL; - if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) { - scrub_workers = fs_info->scrub_workers; - scrub_wr_comp = fs_info->scrub_wr_completion_workers; - scrub_parity = fs_info->scrub_parity_workers; - - fs_info->scrub_workers = NULL; - fs_info->scrub_wr_completion_workers = NULL; - fs_info->scrub_parity_workers = NULL; - } mutex_unlock(&fs_info->scrub_lock); - btrfs_destroy_workqueue(scrub_workers); - btrfs_destroy_workqueue(scrub_wr_comp); - btrfs_destroy_workqueue(scrub_parity); + scrub_workers_put(fs_info); scrub_put_ctx(sctx); return ret; - +out: + scrub_workers_put(fs_info); out_free_ctx: scrub_free_ctx(sctx); From ad24466588ab7d7c879053c5afd919b0c555fec0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 10 Aug 2020 11:42:30 -0400 Subject: [PATCH 5/7] btrfs: set the correct lockdep class for new nodes When flipping over to the rw_semaphore I noticed I'd get a lockdep splat in replace_path(), which is weird because we're swapping the reloc root with the actual target root. Turns out this is because we're using the root->root_key.objectid as the root id for the newly allocated tree block when setting the lockdep class, however we need to be using the actual owner of this new block, which is saved in owner. The affected path is through btrfs_copy_root as all other callers of btrfs_alloc_tree_block (which calls init_new_buffer) have root_objectid == root->root_key.objectid . CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Filipe Manana Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 5871ef78edba..e9eedc053fc5 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4527,7 +4527,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, return ERR_PTR(-EUCLEAN); } - btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level); + btrfs_set_buffer_lockdep_class(owner, buf, level); btrfs_tree_lock(buf); btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); From d3beaa253fd6fa40b8b18a216398e6e5376a9d21 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 10 Aug 2020 11:42:31 -0400 Subject: [PATCH 6/7] btrfs: set the lockdep class for log tree extent buffers These are special extent buffers that get rewound in order to lookup the state of the tree at a specific point in time. As such they do not go through the normal initialization paths that set their lockdep class, so handle them appropriately when they are created and before they are locked. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cd1cd673bc0b..cd392da69b81 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1297,6 +1297,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, btrfs_tree_read_unlock_blocking(eb); free_extent_buffer(eb); + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), + eb_rewin, btrfs_header_level(eb_rewin)); btrfs_tree_read_lock(eb_rewin); __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); WARN_ON(btrfs_header_nritems(eb_rewin) > @@ -1370,7 +1372,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (!eb) return NULL; - btrfs_tree_read_lock(eb); if (old_root) { btrfs_set_header_bytenr(eb, eb->start); btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); @@ -1378,6 +1379,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_set_header_level(eb, old_root->level); btrfs_set_header_generation(eb, old_generation); } + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, + btrfs_header_level(eb)); + btrfs_tree_read_lock(eb); if (tm) __tree_mod_log_rewind(fs_info, eb, time_seq, tm); else From f96d6960abbc52e26ad124e69e6815283d3e1674 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 25 Aug 2020 21:42:51 +0800 Subject: [PATCH 7/7] btrfs: tree-checker: fix the error message for transid error The error message for inode transid is the same as for inode generation, which makes us unable to detect the real problem. Reported-by: Tyler Richmond Fixes: 496245cac57e ("btrfs: tree-checker: Verify inode item") CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Marcos Paulo de Souza Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 517b44300a05..7b1fee630f97 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -984,7 +984,7 @@ static int check_inode_item(struct extent_buffer *leaf, /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) { inode_item_err(leaf, slot, - "invalid inode generation: has %llu expect [0, %llu]", + "invalid inode transid: has %llu expect [0, %llu]", btrfs_inode_transid(leaf, iitem), super_gen + 1); return -EUCLEAN; }