From 83ae4133ac9410ac6a57136e464d498dc66200cf Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 30 Sep 2022 16:45:09 -0400 Subject: [PATCH 001/249] btrfs: add a cached_state to try_lock_extent With nowait becoming more pervasive throughout our codebase go ahead and add a cached_state to try_lock_extent(). This allows us to be faster about clearing the locked area if we have contention, and then gives us the same optimization for unlock if we are able to lock the range. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 7 ++++--- fs/btrfs/extent-io-tree.h | 3 ++- fs/btrfs/extent_io.c | 3 ++- fs/btrfs/file.c | 3 ++- fs/btrfs/inode.c | 3 ++- fs/btrfs/ordered-data.c | 2 +- fs/btrfs/relocation.c | 2 +- 7 files changed, 14 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 83cb0378096f..1b0a45b51f4c 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1615,17 +1615,18 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, changeset); } -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) { int err; u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - NULL, NULL, GFP_NOFS); + cached, NULL, GFP_NOFS); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, - EXTENT_LOCKED, NULL); + EXTENT_LOCKED, cached); return 0; } return 1; diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index a855f40dd61d..786be8f38f0b 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -106,7 +106,8 @@ void extent_io_tree_release(struct extent_io_tree *tree); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached); -int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached); int __init extent_state_init_cachep(void); void __cold extent_state_free_cachep(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4dcf22e051ff..c7e94a0e60d5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4959,7 +4959,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; if (wait == WAIT_NONE) { - if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1)) + if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1, + NULL)) return -EAGAIN; } else { ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index d01631d47806..98107466572b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1302,7 +1302,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, struct btrfs_ordered_extent *ordered; if (nowait) { - if (!try_lock_extent(&inode->io_tree, start_pos, last_pos)) { + if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, + cached_state)) { for (i = 0; i < num_pages; i++) { unlock_page(pages[i]); put_page(pages[i]); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0e516aefbf51..2ba2d8b9cefc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7255,7 +7255,8 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, while (1) { 
if (nowait) { - if (!try_lock_extent(io_tree, lockstart, lockend)) + if (!try_lock_extent(io_tree, lockstart, lockend, + cached_state)) return -EAGAIN; } else { lock_extent(io_tree, lockstart, lockend, cached_state); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e54f8280031f..b648c9d4ea0f 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1073,7 +1073,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end) { struct btrfs_ordered_extent *ordered; - if (!try_lock_extent(&inode->io_tree, start, end)) + if (!try_lock_extent(&inode->io_tree, start, end, NULL)) return false; ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 666a37a0ee89..e81a21082e58 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1120,7 +1120,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); end--; ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end); + key.offset, end, NULL); if (!ret) continue; From 632ddfa2131f0fea1831bc1f4b28c68faa779156 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 30 Sep 2022 16:45:10 -0400 Subject: [PATCH 002/249] btrfs: use cached_state for btrfs_check_nocow_lock Now that try_lock_extent() takes a cached_state, plumb the cached_state through btrfs_try_lock_ordered_range() and then use a cached_state in btrfs_check_nocow_lock everywhere to avoid extra tree searches on the extent_io_tree. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 9 ++++++--- fs/btrfs/ordered-data.c | 7 ++++--- fs/btrfs/ordered-data.h | 3 ++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 98107466572b..493cae66e5e6 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1373,6 +1373,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; + struct extent_state *cached_state = NULL; u64 lockstart, lockend; u64 num_bytes; int ret; @@ -1389,12 +1390,14 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, num_bytes = lockend - lockstart + 1; if (nowait) { - if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend)) { + if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend, + &cached_state)) { btrfs_drew_write_unlock(&root->snapshot_lock); return -EAGAIN; } } else { - btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL); + btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, + &cached_state); } ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, NULL, NULL, NULL, nowait, false); @@ -1403,7 +1406,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, else *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); - unlock_extent(&inode->io_tree, lockstart, lockend, NULL); + unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); return ret; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index b648c9d4ea0f..de2b716d3e7b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -1069,11 +1069,12 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, * Return true if btrfs_lock_ordered_range does not return any extents, * otherwise false. 
*/ -bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end) +bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state) { struct btrfs_ordered_extent *ordered; - if (!try_lock_extent(&inode->io_tree, start, end, NULL)) + if (!try_lock_extent(&inode->io_tree, start, end, cached_state)) return false; ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); @@ -1081,7 +1082,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end) return true; btrfs_put_ordered_extent(ordered); - unlock_extent(&inode->io_tree, start, end, NULL); + unlock_extent(&inode->io_tree, start, end, cached_state); return false; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f59f2dbdb25e..89f82b78f590 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -206,7 +206,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached_state); -bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end); +bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state); int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, u64 post); int __init ordered_data_init(void); From 9c5c9604631ae5fcdc5124c79d01d75f80b5ffd4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 30 Sep 2022 16:45:11 -0400 Subject: [PATCH 003/249] btrfs: use a cached_state everywhere in relocation All of the relocation code avoids using the cached state, despite everywhere using the normal lock_extent() // do something unlock_extent() pattern. Fix this by plumbing a cached state throughout all of these functions in order to allow for fewer tree searches.
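For illustration only (a sketch of the pattern, not code taken from this series), the cached variant looks roughly like this; the extent_state found while locking is handed back on unlock so the second tree search can be skipped:

	struct extent_state *cached_state = NULL;

	lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
	/* ... operate on the locked range ... */
	unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);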
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e81a21082e58..9dcf9b39c798 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1113,6 +1113,8 @@ int replace_file_extents(struct btrfs_trans_handle *trans, inode = find_next_inode(root, key.objectid); } if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { + struct extent_state *cached_state = NULL; + end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); WARN_ON(!IS_ALIGNED(key.offset, @@ -1120,14 +1122,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans, WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize)); end--; ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, NULL); + key.offset, end, + &cached_state); if (!ret) continue; btrfs_drop_extent_map_range(BTRFS_I(inode), key.offset, end, true); unlock_extent(&BTRFS_I(inode)->io_tree, - key.offset, end, NULL); + key.offset, end, &cached_state); } } @@ -1516,6 +1519,8 @@ static int invalidate_extent_cache(struct btrfs_root *root, objectid = min_key->objectid; while (1) { + struct extent_state *cached_state = NULL; + cond_resched(); iput(inode); @@ -1566,9 +1571,9 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for read_folio to complete */ - lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); } return 0; } @@ -2863,19 +2868,21 @@ static noinline_for_stack int prealloc_file_extent_cluster( btrfs_inode_lock(&inode->vfs_inode, 0); for (nr = 0; nr < cluster->nr; nr++) { + struct extent_state *cached_state = NULL; + start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) end = cluster->boundary[nr + 1] - 1 - offset; else end = cluster->end - offset; - lock_extent(&inode->io_tree, start, end, NULL); + lock_extent(&inode->io_tree, start, end, &cached_state); num_bytes = end + 1 - start; ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); cur_offset = end + 1; - unlock_extent(&inode->io_tree, start, end, NULL); + unlock_extent(&inode->io_tree, start, end, &cached_state); if (ret) break; } @@ -2891,6 +2898,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod u64 start, u64 end, u64 block_start) { struct extent_map *em; + struct extent_state *cached_state = NULL; int ret = 0; em = alloc_extent_map(); @@ -2903,9 +2911,9 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod em->block_start = block_start; set_bit(EXTENT_FLAG_PINNED, &em->flags); - lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false); - unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL); + unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); free_extent_map(em); return ret; @@ -2983,6 +2991,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, */ cur = max(page_start, cluster->boundary[*cluster_nr] - offset); while (cur <= page_end) { + struct 
extent_state *cached_state = NULL; u64 extent_start = cluster->boundary[*cluster_nr] - offset; u64 extent_end = get_cluster_boundary_end(cluster, *cluster_nr) - offset; @@ -2998,13 +3007,15 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, goto release_page; /* Mark the range delalloc and dirty for later writeback */ - lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL); + lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, + &cached_state); ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, - clamped_end, 0, NULL); + clamped_end, 0, &cached_state); if (ret) { - clear_extent_bits(&BTRFS_I(inode)->io_tree, - clamped_start, clamped_end, - EXTENT_LOCKED | EXTENT_BOUNDARY); + clear_extent_bit(&BTRFS_I(inode)->io_tree, + clamped_start, clamped_end, + EXTENT_LOCKED | EXTENT_BOUNDARY, + &cached_state); btrfs_delalloc_release_metadata(BTRFS_I(inode), clamped_len, true); btrfs_delalloc_release_extents(BTRFS_I(inode), @@ -3031,7 +3042,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, boundary_start, boundary_end, EXTENT_BOUNDARY); } - unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL); + unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, + &cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); cur += clamped_len; From 123a7f008c9e2b25b451c116620f1f6c77bd6b2b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 30 Sep 2022 16:45:12 -0400 Subject: [PATCH 004/249] btrfs: cache the failed state when locking extents Currently if we fail to lock a range we'll return the start of the range that we failed to lock. We'll then search down to this range and wait on any extent states in this range. However we can avoid this search altogether if we simply cache the extent_state that had the contention. We can pass this into wait_extent_bit() and start from that extent_state without doing the search. In the most optimistic case we can avoid all searches, more likely we'll avoid the initial search and have to perform the search after we wait on the failed state, or worst case we must search both times which is what currently happens. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 52 +++++++++++++++++++++++++++++---------- fs/btrfs/extent-io-tree.h | 3 ++- fs/btrfs/extent_io.c | 3 ++- 3 files changed, 43 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 1b0a45b51f4c..a630c771d25c 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -714,7 +714,8 @@ static void wait_on_state(struct extent_io_tree *tree, * The range [start, end] is inclusive. * The tree lock is taken by this function */ -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state) { struct extent_state *state; @@ -722,6 +723,16 @@ void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) spin_lock(&tree->lock); again: + /* + * Maintain cached_state, as we may not remove it from the tree if there + * are more bits than the bits we're waiting on set on this state. 
+ */ + if (cached_state && *cached_state) { + state = *cached_state; + if (extent_state_in_tree(state) && + state->start <= start && start < state->end) + goto process_node; + } while (1) { /* * This search will find all the extents that end after our @@ -752,6 +763,12 @@ process_node: } } out: + /* This state is no longer useful, clear it and free it up. */ + if (cached_state && *cached_state) { + state = *cached_state; + *cached_state = NULL; + free_extent_state(state); + } spin_unlock(&tree->lock); } @@ -939,13 +956,17 @@ out: * sleeping, so the gfp mask is used to indicate what is allowed. * * If any of the exclusive bits are set, this will fail with -EEXIST if some - * part of the range already has the desired bits set. The start of the - * existing range is returned in failed_start in this case. + * part of the range already has the desired bits set. The extent_state of the + * existing range is returned in failed_state in this case, and the start of the + * existing range is returned in failed_start. failed_state is used as an + * optimization for wait_extent_bit, failed_start must be used as the source of + * truth as failed_state may have changed since we returned. * * [start, end] is inclusive This takes the tree lock. */ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, u64 *failed_start, + struct extent_state **failed_state, struct extent_state **cached_state, struct extent_changeset *changeset, gfp_t mask) { @@ -964,7 +985,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (exclusive_bits) ASSERT(failed_start); else - ASSERT(failed_start == NULL); + ASSERT(failed_start == NULL && failed_state == NULL); again: if (!prealloc && gfpflags_allow_blocking(mask)) { /* @@ -1012,6 +1033,7 @@ hit_next: if (state->start == start && state->end <= end) { if (state->state & exclusive_bits) { *failed_start = state->start; + cache_state(state, failed_state); err = -EEXIST; goto out; } @@ -1047,6 +1069,7 @@ hit_next: if (state->start < start) { if (state->state & exclusive_bits) { *failed_start = start; + cache_state(state, failed_state); err = -EEXIST; goto out; } @@ -1125,6 +1148,7 @@ hit_next: if (state->start <= end && state->end > end) { if (state->state & exclusive_bits) { *failed_start = start; + cache_state(state, failed_state); err = -EEXIST; goto out; } @@ -1162,8 +1186,8 @@ out: int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, struct extent_state **cached_state, gfp_t mask) { - return __set_extent_bit(tree, start, end, bits, NULL, cached_state, - NULL, mask); + return __set_extent_bit(tree, start, end, bits, NULL, NULL, + cached_state, NULL, mask); } /* @@ -1598,8 +1622,8 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCKED)); - return __set_extent_bit(tree, start, end, bits, NULL, NULL, changeset, - GFP_NOFS); + return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, + changeset, GFP_NOFS); } int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, @@ -1622,7 +1646,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - cached, NULL, GFP_NOFS); + NULL, cached, NULL, GFP_NOFS); if (err == -EEXIST) { if (failed_start > start) clear_extent_bit(tree, start, failed_start - 1, @@ -1639,20 +1663,22 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, int lock_extent(struct 
extent_io_tree *tree, u64 start, u64 end, struct extent_state **cached_state) { + struct extent_state *failed_state = NULL; int err; u64 failed_start; err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start, - cached_state, NULL, GFP_NOFS); + &failed_state, cached_state, NULL, GFP_NOFS); while (err == -EEXIST) { if (failed_start != start) clear_extent_bit(tree, start, failed_start - 1, EXTENT_LOCKED, cached_state); - wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED, + &failed_state); err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, - &failed_start, cached_state, NULL, - GFP_NOFS); + &failed_start, &failed_state, + cached_state, NULL, GFP_NOFS); } return err; } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 786be8f38f0b..c71aa29f719d 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -235,6 +235,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); -void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits); +void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state); #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c7e94a0e60d5..836851233618 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5001,7 +5001,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, if (ret || wait != WAIT_COMPLETE) return ret; - wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED); + wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, + EXTENT_LOCKED, NULL); if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) ret = -EIO; return ret; From e5e886bad9e9e87b767ade3884faec1cfdec9b43 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 30 Sep 2022 16:45:13 -0400 Subject: [PATCH 005/249] btrfs: add cached_state to read_extent_buffer_subpage We don't use a cached state here at all, which generally makes sense as async reads are going to unlock at endio time. However for blocking reads we will call wait_extent_bit() for our range. Since the lock_extent() stuff will return the cached_state for the start of the range this is a helpful optimization to have for this case, we'll have the exact state we want to wait on. Add a cached state here and simply throw it away if we're a non-blocking read, otherwise we'll get a small improvement by eliminating some tree searches. 
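A minimal sketch of the resulting flow, paraphrasing the diff that follows (error handling omitted):

	struct extent_state *cached_state = NULL;

	lock_extent(io_tree, eb->start, eb->start + eb->len - 1, &cached_state);
	submit_one_bio(&bio_ctrl);
	if (wait != WAIT_COMPLETE) {
		/* Async read: endio unlocks the range, the cache is useless. */
		free_extent_state(cached_state);
		return ret;
	}
	/* Blocking read: wait on the exact extent_state we already hold. */
	wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1,
			EXTENT_LOCKED, &cached_state);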
Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 836851233618..7891d375eb43 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4949,6 +4949,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; struct page *page = eb->pages[0]; + struct extent_state *cached_state = NULL; struct btrfs_bio_ctrl bio_ctrl = { .mirror_num = mirror_num, }; @@ -4960,10 +4961,11 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, if (wait == WAIT_NONE) { if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1, - NULL)) + &cached_state)) return -EAGAIN; } else { - ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); + ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state); if (ret < 0) return ret; } @@ -4973,7 +4975,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, PageUptodate(page) || btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) { set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, NULL); + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + &cached_state); return ret; } @@ -4998,11 +5001,13 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, atomic_dec(&eb->io_pages); } submit_one_bio(&bio_ctrl); - if (ret || wait != WAIT_COMPLETE) + if (ret || wait != WAIT_COMPLETE) { + free_extent_state(cached_state); return ret; + } wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, - EXTENT_LOCKED, NULL); + EXTENT_LOCKED, &cached_state); if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) ret = -EIO; return ret; From c1b078545e64da7a2b67b74b9d12813b5dd2a529 Mon Sep 17 00:00:00 2001 From: Peng Hao Date: Fri, 7 Oct 2022 18:33:35 +0200 Subject: [PATCH 006/249] btrfs: simplify cleanup after error in btrfs_create_tree Since leaf is already NULL and no other branch jumps to fail_unlock, the fail_unlock label is useless and can be removed. Signed-off-by: Peng Hao Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d99bf7c64611..3460eaa9e4aa 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1197,7 +1197,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; - goto fail_unlock; + goto fail; } root->node = leaf; @@ -1232,9 +1232,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, return root; -fail_unlock: - if (leaf) - btrfs_tree_unlock(leaf); fail: btrfs_put_root(root); From d60d956eb41f0945c2438af11ed412109c558f4f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:25 -0400 Subject: [PATCH 007/249] btrfs: remove unused set/clear_pending_info helpers The last users of these helpers were removed in 5297199a8bca ("btrfs: remove inode number cache feature") so delete these helpers. They existed for mount options that take effect only after a transaction commit and thus could not be applied immediately. We don't have such options anymore, and if we ever do, the patch can be reverted.
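For context, the btrfs_set_pending()/btrfs_clear_pending() macros that remain (visible as context in the hunk below) simply flip BTRFS_PENDING_* bits in fs_info->pending_changes, which the commit path applies later. An illustrative use, assuming the existing BTRFS_PENDING_COMMIT bit:

	/* Queue a change to be picked up by the next transaction commit. */
	btrfs_set_pending(fs_info, COMMIT);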
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9e6d48ff4597..3be1db2c558f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1661,30 +1661,6 @@ do { \ #define btrfs_clear_pending(info, opt) \ clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) -/* - * Helpers for setting pending mount option changes. - * - * Expects corresponding macros - * BTRFS_PENDING_SET_ and CLEAR_ + short mount option name - */ -#define btrfs_set_pending_and_info(info, opt, fmt, args...) \ -do { \ - if (!btrfs_raw_test_opt((info)->mount_opt, opt)) { \ - btrfs_info((info), fmt, ##args); \ - btrfs_set_pending((info), SET_##opt); \ - btrfs_clear_pending((info), CLEAR_##opt); \ - } \ -} while(0) - -#define btrfs_clear_pending_and_info(info, opt, fmt, args...) \ -do { \ - if (btrfs_raw_test_opt((info)->mount_opt, opt)) { \ - btrfs_info((info), fmt, ##args); \ - btrfs_set_pending((info), CLEAR_##opt); \ - btrfs_clear_pending((info), SET_##opt); \ - } \ -} while(0) - /* * Inode flags */ From ea206640a600c808d30d8d62c44b6982c12a143b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:26 -0400 Subject: [PATCH 008/249] btrfs: remove unused BTRFS_TOTAL_BYTES_PINNED_BATCH This hasn't been used since 138a12d86574 ("btrfs: rip out btrfs_space_info::total_bytes_pinned") so it is safe to remove. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3be1db2c558f..8bd49328efab 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -92,14 +92,6 @@ struct reloc_control; #define BTRFS_DIRTY_METADATA_THRESH SZ_32M -/* - * Use large batch size to reduce overhead of metadata updates. On the reader - * side, we only read it when we are close to ENOSPC and the read overhead is - * mostly related to the number of CPUs, so it is OK to use arbitrary large - * value here. - */ -#define BTRFS_TOTAL_BYTES_PINNED_BATCH SZ_128M - #define BTRFS_MAX_EXTENT_SIZE SZ_128M /* From 4ce76e8e783693c72956e817f3a6dbbaa055411a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:27 -0400 Subject: [PATCH 009/249] btrfs: remove unused BTRFS_IOPRIO_READA The last user of this definition was removed in patch f26c92386028 ("btrfs: remove reada infrastructure") so we can remove this definition. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8bd49328efab..7760cc07288e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -87,9 +87,6 @@ struct reloc_control; #define BTRFS_EMPTY_DIR_SIZE 0 -/* ioprio of readahead is set to idle */ -#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) - #define BTRFS_DIRTY_METADATA_THRESH SZ_32M #define BTRFS_MAX_EXTENT_SIZE SZ_128M From 4300c58f809079951c87d84e5f11a2d265e3c9e7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:28 -0400 Subject: [PATCH 010/249] btrfs: move btrfs on-disk definitions out of ctree.h The bulk of our on-disk definitions exist in btrfs_tree.h, which user space can use. 
Keep things consistent and move the rest of the on disk definitions out of ctree.h into btrfs_tree.h. Note I did have to update all u8's to __u8, but otherwise this is a strict copy and paste. Most of the definitions are mainly for internal use and are not guaranteed stable public API and may change as we need. Compilation failures by user applications can happen. Reviewed-by: Qu Wenruo Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ reformat comments, style fixups ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 215 +------------------------------- include/uapi/linux/btrfs_tree.h | 215 ++++++++++++++++++++++++++++++++ 2 files changed, 216 insertions(+), 214 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7760cc07288e..7a80f9380146 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -55,8 +55,6 @@ struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; -#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ - /* * Maximum number of mirrors that can be available for all profiles counting * the target device of dev-replace as one. During an active device replace @@ -68,8 +66,6 @@ struct reloc_control; */ #define BTRFS_MAX_MIRRORS (4 + 1) -#define BTRFS_MAX_LEVEL 8 - #define BTRFS_OLDEST_GENERATION 0ULL /* @@ -138,81 +134,9 @@ enum { BTRFS_FS_STATE_COUNT }; -#define BTRFS_BACKREF_REV_MAX 256 -#define BTRFS_BACKREF_REV_SHIFT 56 -#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ - BTRFS_BACKREF_REV_SHIFT) - -#define BTRFS_OLD_BACKREF_REV 0 -#define BTRFS_MIXED_BACKREF_REV 1 - -/* - * every tree block (leaf or node) starts with this header. - */ -struct btrfs_header { - /* these first four must match the super block */ - u8 csum[BTRFS_CSUM_SIZE]; - u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ - __le64 bytenr; /* which block this node is supposed to live in */ - __le64 flags; - - /* allowed to be different from the super from here on down */ - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; - __le64 generation; - __le64 owner; - __le32 nritems; - u8 level; -} __attribute__ ((__packed__)); - -/* - * this is a very generous portion of the super block, giving us - * room to translate 14 chunks with 3 stripes each. - */ -#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 - -/* - * just in case we somehow lose the roots and are not able to mount, - * we store an array of the roots from previous transactions - * in the super. - */ -#define BTRFS_NUM_BACKUP_ROOTS 4 -struct btrfs_root_backup { - __le64 tree_root; - __le64 tree_root_gen; - - __le64 chunk_root; - __le64 chunk_root_gen; - - __le64 extent_root; - __le64 extent_root_gen; - - __le64 fs_root; - __le64 fs_root_gen; - - __le64 dev_root; - __le64 dev_root_gen; - - __le64 csum_root; - __le64 csum_root_gen; - - __le64 total_bytes; - __le64 bytes_used; - __le64 num_devices; - /* future */ - __le64 unused_64[4]; - - u8 tree_root_level; - u8 chunk_root_level; - u8 extent_root_level; - u8 fs_root_level; - u8 dev_root_level; - u8 csum_root_level; - /* future and to align */ - u8 unused_8[10]; -} __attribute__ ((__packed__)); - #define BTRFS_SUPER_INFO_OFFSET SZ_64K #define BTRFS_SUPER_INFO_SIZE 4096 +static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); /* * The reserved space at the beginning of each device. 
@@ -221,69 +145,6 @@ struct btrfs_root_backup { */ #define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) -/* - * the super block basically lists the main trees of the FS - * it currently lacks any block count etc etc - */ -struct btrfs_super_block { - /* the first 4 fields must match struct btrfs_header */ - u8 csum[BTRFS_CSUM_SIZE]; - /* FS specific UUID, visible to user */ - u8 fsid[BTRFS_FSID_SIZE]; - __le64 bytenr; /* this block number */ - __le64 flags; - - /* allowed to be different from the btrfs_header from here own down */ - __le64 magic; - __le64 generation; - __le64 root; - __le64 chunk_root; - __le64 log_root; - - /* - * This member has never been utilized since the very beginning, thus - * it's always 0 regardless of kernel version. We always use - * generation + 1 to read log tree root. So here we mark it deprecated. - */ - __le64 __unused_log_root_transid; - __le64 total_bytes; - __le64 bytes_used; - __le64 root_dir_objectid; - __le64 num_devices; - __le32 sectorsize; - __le32 nodesize; - __le32 __unused_leafsize; - __le32 stripesize; - __le32 sys_chunk_array_size; - __le64 chunk_root_generation; - __le64 compat_flags; - __le64 compat_ro_flags; - __le64 incompat_flags; - __le16 csum_type; - u8 root_level; - u8 chunk_root_level; - u8 log_root_level; - struct btrfs_dev_item dev_item; - - char label[BTRFS_LABEL_SIZE]; - - __le64 cache_generation; - __le64 uuid_tree_generation; - - /* the UUID written into btree blocks */ - u8 metadata_uuid[BTRFS_FSID_SIZE]; - - /* future expansion */ - u8 reserved8[8]; - __le64 reserved[27]; - u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; - struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; - - /* Padded to 4096 bytes */ - u8 padding[565]; -} __attribute__ ((__packed__)); -static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); - /* * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount @@ -341,43 +202,6 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) #define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL -/* - * A leaf is full of items. offset and size tell us where to find - * the item in the leaf (relative to the start of the data area) - */ -struct btrfs_item { - struct btrfs_disk_key key; - __le32 offset; - __le32 size; -} __attribute__ ((__packed__)); - -/* - * leaves have an item area and a data area: - * [item0, item1....itemN] [free space] [dataN...data1, data0] - * - * The data is separate from the items to get the keys closer together - * during searches. 
- */ -struct btrfs_leaf { - struct btrfs_header header; - struct btrfs_item items[]; -} __attribute__ ((__packed__)); - -/* - * all non-leaf blocks are nodes, they hold only keys and pointers to - * other blocks - */ -struct btrfs_key_ptr { - struct btrfs_disk_key key; - __le64 blockptr; - __le64 generation; -} __attribute__ ((__packed__)); - -struct btrfs_node { - struct btrfs_header header; - struct btrfs_key_ptr ptrs[]; -} __attribute__ ((__packed__)); - /* Read ahead values for struct btrfs_path.reada */ enum { READA_NONE, @@ -1650,43 +1474,6 @@ do { \ #define btrfs_clear_pending(info, opt) \ clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) -/* - * Inode flags - */ -#define BTRFS_INODE_NODATASUM (1U << 0) -#define BTRFS_INODE_NODATACOW (1U << 1) -#define BTRFS_INODE_READONLY (1U << 2) -#define BTRFS_INODE_NOCOMPRESS (1U << 3) -#define BTRFS_INODE_PREALLOC (1U << 4) -#define BTRFS_INODE_SYNC (1U << 5) -#define BTRFS_INODE_IMMUTABLE (1U << 6) -#define BTRFS_INODE_APPEND (1U << 7) -#define BTRFS_INODE_NODUMP (1U << 8) -#define BTRFS_INODE_NOATIME (1U << 9) -#define BTRFS_INODE_DIRSYNC (1U << 10) -#define BTRFS_INODE_COMPRESS (1U << 11) - -#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31) - -#define BTRFS_INODE_FLAG_MASK \ - (BTRFS_INODE_NODATASUM | \ - BTRFS_INODE_NODATACOW | \ - BTRFS_INODE_READONLY | \ - BTRFS_INODE_NOCOMPRESS | \ - BTRFS_INODE_PREALLOC | \ - BTRFS_INODE_SYNC | \ - BTRFS_INODE_IMMUTABLE | \ - BTRFS_INODE_APPEND | \ - BTRFS_INODE_NODUMP | \ - BTRFS_INODE_NOATIME | \ - BTRFS_INODE_DIRSYNC | \ - BTRFS_INODE_COMPRESS | \ - BTRFS_INODE_ROOT_ITEM_INIT) - -#define BTRFS_INODE_RO_VERITY (1U << 0) - -#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY) - struct btrfs_map_token { struct extent_buffer *eb; char *kaddr; diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 1f7a38ec6ac3..d43f67f676c2 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -10,6 +10,11 @@ #include #endif +/* ASCII for _BHRfS_M, no terminating nul */ +#define BTRFS_MAGIC 0x4D5F53665248425FULL + +#define BTRFS_MAX_LEVEL 8 + /* * This header contains the structure definitions and constants used * by file system objects that can be retrieved using @@ -360,6 +365,43 @@ enum btrfs_csum_type { #define BTRFS_FT_XATTR 8 #define BTRFS_FT_MAX 9 +/* + * Inode flags + */ +#define BTRFS_INODE_NODATASUM (1U << 0) +#define BTRFS_INODE_NODATACOW (1U << 1) +#define BTRFS_INODE_READONLY (1U << 2) +#define BTRFS_INODE_NOCOMPRESS (1U << 3) +#define BTRFS_INODE_PREALLOC (1U << 4) +#define BTRFS_INODE_SYNC (1U << 5) +#define BTRFS_INODE_IMMUTABLE (1U << 6) +#define BTRFS_INODE_APPEND (1U << 7) +#define BTRFS_INODE_NODUMP (1U << 8) +#define BTRFS_INODE_NOATIME (1U << 9) +#define BTRFS_INODE_DIRSYNC (1U << 10) +#define BTRFS_INODE_COMPRESS (1U << 11) + +#define BTRFS_INODE_ROOT_ITEM_INIT (1U << 31) + +#define BTRFS_INODE_FLAG_MASK \ + (BTRFS_INODE_NODATASUM | \ + BTRFS_INODE_NODATACOW | \ + BTRFS_INODE_READONLY | \ + BTRFS_INODE_NOCOMPRESS | \ + BTRFS_INODE_PREALLOC | \ + BTRFS_INODE_SYNC | \ + BTRFS_INODE_IMMUTABLE | \ + BTRFS_INODE_APPEND | \ + BTRFS_INODE_NODUMP | \ + BTRFS_INODE_NOATIME | \ + BTRFS_INODE_DIRSYNC | \ + BTRFS_INODE_COMPRESS | \ + BTRFS_INODE_ROOT_ITEM_INIT) + +#define BTRFS_INODE_RO_VERITY (1U << 0) + +#define BTRFS_INODE_RO_FLAG_MASK (BTRFS_INODE_RO_VERITY) + /* * The key defines the order in the tree, and so it also defines (optimal) * block layout. 
@@ -389,6 +431,109 @@ struct btrfs_key { __u64 offset; } __attribute__ ((__packed__)); +/* + * Every tree block (leaf or node) starts with this header. + */ +struct btrfs_header { + /* These first four must match the super block */ + __u8 csum[BTRFS_CSUM_SIZE]; + /* FS specific uuid */ + __u8 fsid[BTRFS_FSID_SIZE]; + /* Which block this node is supposed to live in */ + __le64 bytenr; + __le64 flags; + + /* Allowed to be different from the super from here on down */ + __u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + __le64 generation; + __le64 owner; + __le32 nritems; + __u8 level; +} __attribute__ ((__packed__)); + +/* + * This is a very generous portion of the super block, giving us room to + * translate 14 chunks with 3 stripes each. + */ +#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 + +/* + * Just in case we somehow lose the roots and are not able to mount, we store + * an array of the roots from previous transactions in the super. + */ +#define BTRFS_NUM_BACKUP_ROOTS 4 +struct btrfs_root_backup { + __le64 tree_root; + __le64 tree_root_gen; + + __le64 chunk_root; + __le64 chunk_root_gen; + + __le64 extent_root; + __le64 extent_root_gen; + + __le64 fs_root; + __le64 fs_root_gen; + + __le64 dev_root; + __le64 dev_root_gen; + + __le64 csum_root; + __le64 csum_root_gen; + + __le64 total_bytes; + __le64 bytes_used; + __le64 num_devices; + /* future */ + __le64 unused_64[4]; + + __u8 tree_root_level; + __u8 chunk_root_level; + __u8 extent_root_level; + __u8 fs_root_level; + __u8 dev_root_level; + __u8 csum_root_level; + /* future and to align */ + __u8 unused_8[10]; +} __attribute__ ((__packed__)); + +/* + * A leaf is full of items. offset and size tell us where to find the item in + * the leaf (relative to the start of the data area) + */ +struct btrfs_item { + struct btrfs_disk_key key; + __le32 offset; + __le32 size; +} __attribute__ ((__packed__)); + +/* + * Leaves have an item area and a data area: + * [item0, item1....itemN] [free space] [dataN...data1, data0] + * + * The data is separate from the items to get the keys closer together during + * searches. + */ +struct btrfs_leaf { + struct btrfs_header header; + struct btrfs_item items[]; +} __attribute__ ((__packed__)); + +/* + * All non-leaf blocks are nodes, they hold only keys and pointers to other + * blocks. + */ +struct btrfs_key_ptr { + struct btrfs_disk_key key; + __le64 blockptr; + __le64 generation; +} __attribute__ ((__packed__)); + +struct btrfs_node { + struct btrfs_header header; + struct btrfs_key_ptr ptrs[]; +} __attribute__ ((__packed__)); + struct btrfs_dev_item { /* the internal btrfs device id */ __le64 devid; @@ -472,6 +617,68 @@ struct btrfs_chunk { /* additional stripes go here */ } __attribute__ ((__packed__)); +/* + * The super block basically lists the main trees of the FS. + */ +struct btrfs_super_block { + /* The first 4 fields must match struct btrfs_header */ + __u8 csum[BTRFS_CSUM_SIZE]; + /* FS specific UUID, visible to user */ + __u8 fsid[BTRFS_FSID_SIZE]; + /* This block number */ + __le64 bytenr; + __le64 flags; + + /* Allowed to be different from the btrfs_header from here own down */ + __le64 magic; + __le64 generation; + __le64 root; + __le64 chunk_root; + __le64 log_root; + + /* + * This member has never been utilized since the very beginning, thus + * it's always 0 regardless of kernel version. We always use + * generation + 1 to read log tree root. So here we mark it deprecated. 
+ */ + __le64 __unused_log_root_transid; + __le64 total_bytes; + __le64 bytes_used; + __le64 root_dir_objectid; + __le64 num_devices; + __le32 sectorsize; + __le32 nodesize; + __le32 __unused_leafsize; + __le32 stripesize; + __le32 sys_chunk_array_size; + __le64 chunk_root_generation; + __le64 compat_flags; + __le64 compat_ro_flags; + __le64 incompat_flags; + __le16 csum_type; + __u8 root_level; + __u8 chunk_root_level; + __u8 log_root_level; + struct btrfs_dev_item dev_item; + + char label[BTRFS_LABEL_SIZE]; + + __le64 cache_generation; + __le64 uuid_tree_generation; + + /* The UUID written into btree blocks */ + __u8 metadata_uuid[BTRFS_FSID_SIZE]; + + /* Future expansion */ + __u8 reserved8[8]; + __le64 reserved[27]; + __u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; + struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; + + /* Padded to 4096 bytes */ + __u8 padding[565]; +} __attribute__ ((__packed__)); + #define BTRFS_FREE_SPACE_EXTENT 1 #define BTRFS_FREE_SPACE_BITMAP 2 @@ -526,6 +733,14 @@ struct btrfs_extent_item_v0 { /* use full backrefs for extent pointers in the block */ #define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) +#define BTRFS_BACKREF_REV_MAX 256 +#define BTRFS_BACKREF_REV_SHIFT 56 +#define BTRFS_BACKREF_REV_MASK (((u64)BTRFS_BACKREF_REV_MAX - 1) << \ + BTRFS_BACKREF_REV_SHIFT) + +#define BTRFS_OLD_BACKREF_REV 0 +#define BTRFS_MIXED_BACKREF_REV 1 + /* * this flag is only used internally by scrub and may be changed at any time * it is only declared here to avoid collisions From 51129b33d3911c7a36e643d47cf7c00fba3089fe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:29 -0400 Subject: [PATCH 011/249] btrfs: move btrfs_get_block_group helper out of disk-io.h This inline helper calls btrfs_fs_compat_ro(), which is defined in another header. To avoid weird header dependency problems move this helper into disk-io.c with the rest of the global root helpers. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 7 +++++++ fs/btrfs/disk-io.h | 8 +------- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3460eaa9e4aa..5705335c80c8 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1167,6 +1167,13 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) return btrfs_global_root(fs_info, &key); } +struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) +{ + if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) + return fs_info->block_group_root; + return btrfs_extent_root(fs_info, 0); +} + struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid) { diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 9fa923e005a3..da040ec327ae 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -75,6 +75,7 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, struct btrfs_key *key); struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); +struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info); void btrfs_free_fs_info(struct btrfs_fs_info *fs_info); int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); @@ -106,13 +107,6 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) return NULL; } -static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) -{ - if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) - return fs_info->block_group_root; - return btrfs_extent_root(fs_info, 0); -} - void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, From ad4b63caf56d0dc08e03966172d685ff9ebad996 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:30 -0400 Subject: [PATCH 012/249] btrfs: move maximum limits to btrfs_tree.h We have maximum link and name length limits, move these to btrfs_tree.h as they're on disk limitations. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ reformat comments ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 13 ------------- include/uapi/linux/btrfs_tree.h | 12 ++++++++++++ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7a80f9380146..6a37df0fa0eb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -68,19 +68,6 @@ struct reloc_control; #define BTRFS_OLDEST_GENERATION 0ULL -/* - * we can actually store much bigger names, but lets not confuse the rest - * of linux - */ -#define BTRFS_NAME_LEN 255 - -/* - * Theoretical limit is larger, but we keep this down to a sane - * value. That should limit greatly the possibility of collisions on - * inode ref items. - */ -#define BTRFS_LINK_MAX 65535U - #define BTRFS_EMPTY_DIR_SIZE 0 #define BTRFS_DIRTY_METADATA_THRESH SZ_32M diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index d43f67f676c2..4809272f5063 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -15,6 +15,18 @@ #define BTRFS_MAX_LEVEL 8 +/* + * We can actually store much bigger names, but lets not confuse the rest of + * linux. 
+ */ +#define BTRFS_NAME_LEN 255 + +/* + * Theoretical limit is larger, but we keep this down to a sane value. That + * should limit greatly the possibility of collisions on inode ref items. + */ +#define BTRFS_LINK_MAX 65535U + /* * This header contains the structure definitions and constants used * by file system objects that can be retrieved using From ed4c491a3db2e85b3eb04ac61f0723fc2ec5f50a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:31 -0400 Subject: [PATCH 013/249] btrfs: move BTRFS_MAX_MIRRORS into scrub.c This is only used locally in scrub.c, move it out of ctree.h into scrub.c. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 11 ----------- fs/btrfs/scrub.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6a37df0fa0eb..ea83c178a60c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -55,17 +55,6 @@ struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; -/* - * Maximum number of mirrors that can be available for all profiles counting - * the target device of dev-replace as one. During an active device replace - * procedure, the target device of the copy operation is a mirror for the - * filesystem data as well that can be used to read data in order to repair - * read errors on other disks. - * - * Current value is derived from RAID1C4 with 4 copies. - */ -#define BTRFS_MAX_MIRRORS (4 + 1) - #define BTRFS_OLDEST_GENERATION 0ULL #define BTRFS_EMPTY_DIR_SIZE 0 diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 196c4c6ed1ed..cdda4b2f20f2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -56,6 +56,17 @@ struct scrub_ctx; #define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE)) +/* + * Maximum number of mirrors that can be available for all profiles counting + * the target device of dev-replace as one. During an active device replace + * procedure, the target device of the copy operation is a mirror for the + * filesystem data as well that can be used to read data in order to repair + * read errors on other disks. + * + * Current value is derived from RAID1C4 with 4 copies. + */ +#define BTRFS_MAX_MIRRORS (4 + 1) + struct scrub_recover { refcount_t refs; struct btrfs_io_context *bioc; From 390d89ccf672ec79b84037d4af834b2275ff7cb9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:32 -0400 Subject: [PATCH 014/249] btrfs: move discard stat defs to free-space-cache.h These definitions are used for discard statistics, move them out of ctree.h and put them in free-space-cache.h. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 9 --------- fs/btrfs/free-space-cache.h | 9 +++++++++ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ea83c178a60c..03d665b21244 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -63,15 +63,6 @@ struct reloc_control; #define BTRFS_MAX_EXTENT_SIZE SZ_128M -/* - * Deltas are an effective way to populate global statistics. Give macro names - * to make it clear what we're doing. An example is discard_extents in - * btrfs_free_space_ctl. 
- */ -#define BTRFS_STAT_NR_ENTRIES 2 -#define BTRFS_STAT_CURR 0 -#define BTRFS_STAT_PREV 1 - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 6d419ba53e95..eaf30f6444dd 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -43,6 +43,15 @@ static inline bool btrfs_free_space_trimming_bitmap( return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING); } +/* + * Deltas are an effective way to populate global statistics. Give macro names + * to make it clear what we're doing. An example is discard_extents in + * btrfs_free_space_ctl. + */ +#define BTRFS_STAT_NR_ENTRIES 2 +#define BTRFS_STAT_CURR 0 +#define BTRFS_STAT_PREV 1 + struct btrfs_free_space_ctl { spinlock_t tree_lock; struct rb_root free_space_offset; From 06d61cb101f346338b50be5cd4f053a169b59d5e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:34 -0400 Subject: [PATCH 015/249] btrfs: move btrfs_should_fragment_free_space into block-group.c This function uses functions that are not defined in block-group.h, move it into block-group.c in order to keep the header clean. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 12 ++++++++++++ fs/btrfs/block-group.h | 11 +---------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index deebc8ddbd93..3f8b1cbbbc43 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -18,6 +18,18 @@ #include "raid56.h" #include "zoned.h" +#ifdef CONFIG_BTRFS_DEBUG +int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) +{ + struct btrfs_fs_info *fs_info = block_group->fs_info; + + return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && + block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || + (btrfs_test_opt(fs_info, FRAGMENT_DATA) && + block_group->flags & BTRFS_BLOCK_GROUP_DATA); +} +#endif + /* * Return target flags in extended format or 0 if restripe for this chunk_type * is not in progress diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 8fb14b99a1d1..39e79bed10ae 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -251,16 +251,7 @@ static inline bool btrfs_is_block_group_data_only( } #ifdef CONFIG_BTRFS_DEBUG -static inline int btrfs_should_fragment_free_space( - struct btrfs_block_group *block_group) -{ - struct btrfs_fs_info *fs_info = block_group->fs_info; - - return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) && - block_group->flags & BTRFS_BLOCK_GROUP_METADATA) || - (btrfs_test_opt(fs_info, FRAGMENT_DATA) && - block_group->flags & BTRFS_BLOCK_GROUP_DATA); -} +int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group); #endif struct btrfs_block_group *btrfs_lookup_first_block_group( From f1e5c6185ca166cde0c7c2eeeab5d233ef315140 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:35 -0400 Subject: [PATCH 016/249] btrfs: move flush related definitions to space-info.h This code is used in space-info.c, move the definitions to space-info.h. 
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 59 ---------------------------------------- fs/btrfs/delayed-inode.c | 1 + fs/btrfs/inode-item.c | 1 + fs/btrfs/props.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/space-info.h | 59 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 63 insertions(+), 59 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 03d665b21244..39974f4b441f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2639,65 +2639,6 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, void btrfs_clear_space_info_full(struct btrfs_fs_info *info); -/* - * Different levels for to flush space when doing space reservations. - * - * The higher the level, the more methods we try to reclaim space. - */ -enum btrfs_reserve_flush_enum { - /* If we are in the transaction, we can't flush anything.*/ - BTRFS_RESERVE_NO_FLUSH, - - /* - * Flush space by: - * - Running delayed inode items - * - Allocating a new chunk - */ - BTRFS_RESERVE_FLUSH_LIMIT, - - /* - * Flush space by: - * - Running delayed inode items - * - Running delayed refs - * - Running delalloc and waiting for ordered extents - * - Allocating a new chunk - */ - BTRFS_RESERVE_FLUSH_EVICT, - - /* - * Flush space by above mentioned methods and by: - * - Running delayed iputs - * - Committing transaction - * - * Can be interrupted by a fatal signal. - */ - BTRFS_RESERVE_FLUSH_DATA, - BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, - BTRFS_RESERVE_FLUSH_ALL, - - /* - * Pretty much the same as FLUSH_ALL, but can also steal space from - * global rsv. - * - * Can be interrupted by a fatal signal. - */ - BTRFS_RESERVE_FLUSH_ALL_STEAL, -}; - -enum btrfs_flush_state { - FLUSH_DELAYED_ITEMS_NR = 1, - FLUSH_DELAYED_ITEMS = 2, - FLUSH_DELAYED_REFS_NR = 3, - FLUSH_DELAYED_REFS = 4, - FLUSH_DELALLOC = 5, - FLUSH_DELALLOC_WAIT = 6, - FLUSH_DELALLOC_FULL = 7, - ALLOC_CHUNK = 8, - ALLOC_CHUNK_FORCE = 9, - RUN_DELAYED_IPUTS = 10, - COMMIT_TRANS = 11, -}; - int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index cac5169eaf8d..a411f04a7b97 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -14,6 +14,7 @@ #include "qgroup.h" #include "locking.h" #include "inode-item.h" +#include "space-info.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 0eeb5ea87894..366f3a788c6a 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -8,6 +8,7 @@ #include "disk-io.h" #include "transaction.h" #include "print-tree.h" +#include "space-info.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, const char *name, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 055a631276ce..07f62e3ba6a5 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -10,6 +10,7 @@ #include "ctree.h" #include "xattr.h" #include "compression.h" +#include "space-info.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 9dcf9b39c798..216a4485d914 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -27,6 +27,7 @@ #include "subpage.h" #include "zoned.h" #include "inode-item.h" +#include "space-info.h" /* * Relocation overview 
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index ce66023a9eb8..7e17bb803436 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -5,6 +5,65 @@ #include "volumes.h" +/* + * Different levels for to flush space when doing space reservations. + * + * The higher the level, the more methods we try to reclaim space. + */ +enum btrfs_reserve_flush_enum { + /* If we are in the transaction, we can't flush anything.*/ + BTRFS_RESERVE_NO_FLUSH, + + /* + * Flush space by: + * - Running delayed inode items + * - Allocating a new chunk + */ + BTRFS_RESERVE_FLUSH_LIMIT, + + /* + * Flush space by: + * - Running delayed inode items + * - Running delayed refs + * - Running delalloc and waiting for ordered extents + * - Allocating a new chunk + */ + BTRFS_RESERVE_FLUSH_EVICT, + + /* + * Flush space by above mentioned methods and by: + * - Running delayed iputs + * - Committing transaction + * + * Can be interrupted by a fatal signal. + */ + BTRFS_RESERVE_FLUSH_DATA, + BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, + BTRFS_RESERVE_FLUSH_ALL, + + /* + * Pretty much the same as FLUSH_ALL, but can also steal space from + * global rsv. + * + * Can be interrupted by a fatal signal. + */ + BTRFS_RESERVE_FLUSH_ALL_STEAL, +}; + +enum btrfs_flush_state { + FLUSH_DELAYED_ITEMS_NR = 1, + FLUSH_DELAYED_ITEMS = 2, + FLUSH_DELAYED_REFS_NR = 3, + FLUSH_DELAYED_REFS = 4, + FLUSH_DELALLOC = 5, + FLUSH_DELALLOC_WAIT = 6, + FLUSH_DELALLOC_FULL = 7, + ALLOC_CHUNK = 8, + ALLOC_CHUNK_FORCE = 9, + RUN_DELAYED_IPUTS = 10, + COMMIT_TRANS = 11, +}; + struct btrfs_space_info { spinlock_t lock; From f60acad355cf14ccccf420e6ea0ddd6de87cb210 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:36 -0400 Subject: [PATCH 017/249] btrfs: move btrfs_print_data_csum_error into inode.c This isn't used outside of inode.c, there's no reason to define it in btrfs_inode.h. Drop the inline and add __cold as it's for errors that are not in any hot path. 
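A quick note on the attribute, since it is doing the real work here: __cold is
a generic compiler hint, not btrfs-specific, telling GCC/Clang that the
function is unlikely to be called, so its body can be placed away from the hot
text section and call sites treated as unlikely branches. A minimal sketch,
with an illustrative (hypothetical) function name:

/* Error reporting only runs on corruption, so keep it out of hot text. */
static void __cold report_rare_failure(u64 offset)
{
        pr_warn("unexpected failure at offset %llu\n", offset);
}

Since the function is now out of line in inode.c rather than inlined into
every caller, dropping the inline keyword costs nothing on this error-only
path.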
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 26 -------------------------- fs/btrfs/inode.c | 26 ++++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 54c2ccb36b61..530a0ebfab3f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -410,30 +410,4 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes - -static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, - u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) -{ - struct btrfs_root *root = inode->root; - const u32 csum_size = root->fs_info->csum_size; - - /* Output minus objectid, which is more meaningful */ - if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) - btrfs_warn_rl(root->fs_info, -"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), - logical_start, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), - mirror_num); - else - btrfs_warn_rl(root->fs_info, -"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", - root->root_key.objectid, btrfs_ino(inode), - logical_start, - CSUM_FMT_VALUE(csum_size, csum), - CSUM_FMT_VALUE(csum_size, csum_expected), - mirror_num); -} - #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2ba2d8b9cefc..8cae730bd5ba 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -125,6 +125,32 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 ram_bytes, int compress_type, int type); +static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, + u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) +{ + struct btrfs_root *root = inode->root; + const u32 csum_size = root->fs_info->csum_size; + + /* Output without objectid, which is more meaningful */ + if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) { + btrfs_warn_rl(root->fs_info, +"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + root->root_key.objectid, btrfs_ino(inode), + logical_start, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + } else { + btrfs_warn_rl(root->fs_info, +"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d", + root->root_key.objectid, btrfs_ino(inode), + logical_start, + CSUM_FMT_VALUE(csum_size, csum), + CSUM_FMT_VALUE(csum_size, csum_expected), + mirror_num); + } +} + /* * btrfs_inode_lock - lock inode i_rwsem based on arguments passed * From 956504a331a613814279b04f9b0d663c7c2bb9bc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:37 -0400 Subject: [PATCH 018/249] btrfs: move trans_handle_cachep out of ctree.h This is local to the transaction code, remove it from ctree.h and inode.c, create new helpers in the transaction to handle the init work and move the cachep locally to transaction.c. 
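The end result follows the standard shape for a slab cache that is private to
a single file; a minimal sketch with illustrative names (not the actual btrfs
symbols):

static struct kmem_cache *foo_cachep;

int __init foo_init(void)
{
        foo_cachep = kmem_cache_create("foo", sizeof(struct foo),
                                       0, 0, NULL);
        if (!foo_cachep)
                return -ENOMEM;
        return 0;
}

void __cold foo_exit(void)
{
        kmem_cache_destroy(foo_cachep);
}

The module init path then calls foo_init() in its error ladder and foo_exit()
on the matching unwind label, which is exactly the wiring the super.c hunk
below adds for the transaction cache.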
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/inode.c | 8 -------- fs/btrfs/super.c | 9 ++++++++- fs/btrfs/transaction.c | 17 +++++++++++++++++ fs/btrfs/transaction.h | 3 +++ 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 39974f4b441f..fec3d618bcef 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -41,7 +41,6 @@ struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; struct btrfs_space_info; struct btrfs_block_group; -extern struct kmem_cache *btrfs_trans_handle_cachep; extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; extern struct kmem_cache *btrfs_free_space_bitmap_cachep; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8cae730bd5ba..54f50784aefa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -107,7 +107,6 @@ static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; -struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; struct kmem_cache *btrfs_free_space_bitmap_cachep; @@ -8926,7 +8925,6 @@ void __cold btrfs_destroy_cachep(void) rcu_barrier(); bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); - kmem_cache_destroy(btrfs_trans_handle_cachep); kmem_cache_destroy(btrfs_path_cachep); kmem_cache_destroy(btrfs_free_space_cachep); kmem_cache_destroy(btrfs_free_space_bitmap_cachep); @@ -8941,12 +8939,6 @@ int __init btrfs_init_cachep(void) if (!btrfs_inode_cachep) goto fail; - btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", - sizeof(struct btrfs_trans_handle), 0, - SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); - if (!btrfs_trans_handle_cachep) - goto fail; - btrfs_path_cachep = kmem_cache_create("btrfs_path", sizeof(struct btrfs_path), 0, SLAB_MEM_SPREAD, NULL); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5942b9384088..8fe2fdb167a7 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2739,10 +2739,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_compress; - err = extent_state_init_cachep(); + err = btrfs_transaction_init(); if (err) goto free_cachep; + err = extent_state_init_cachep(); + if (err) + goto free_transaction; + err = extent_buffer_init_cachep(); if (err) goto free_extent_cachep; @@ -2811,6 +2815,8 @@ free_eb_cachep: extent_buffer_free_cachep(); free_extent_cachep: extent_state_free_cachep(); +free_transaction: + btrfs_transaction_exit(); free_cachep: btrfs_destroy_cachep(); free_compress: @@ -2822,6 +2828,7 @@ free_compress: static void __exit exit_btrfs_fs(void) { + btrfs_transaction_exit(); btrfs_destroy_cachep(); btrfs_delayed_ref_exit(); btrfs_auto_defrag_exit(); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d1f1da6820fb..ae7d4aca771d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -24,6 +24,8 @@ #include "space-info.h" #include "zoned.h" +static struct kmem_cache *btrfs_trans_handle_cachep; + #define BTRFS_ROOT_TRANS_TAG 0 /* @@ -2600,3 +2602,18 @@ void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) btrfs_warn(fs_info, "unknown pending changes left 0x%lx, ignoring", prev); } + +int __init btrfs_transaction_init(void) +{ + btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", + sizeof(struct btrfs_trans_handle), 0, 
+ SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL); + if (!btrfs_trans_handle_cachep) + return -ENOMEM; + return 0; +} + +void __cold btrfs_transaction_exit(void) +{ + kmem_cache_destroy(btrfs_trans_handle_cachep); +} diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 970ff316069d..1332f193b800 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -236,4 +236,7 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); +int __init btrfs_transaction_init(void); +void __cold btrfs_transaction_exit(void); + #endif From 226463d7b100d30def24f2be492fef0081003a30 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:38 -0400 Subject: [PATCH 019/249] btrfs: move btrfs_path_cachep out of ctree.h This is local to the ctree code, remove it from ctree.h and inode.c, create new init/exit functions for the cachep, and move it locally to ctree.c. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 17 +++++++++++++++++ fs/btrfs/ctree.h | 3 ++- fs/btrfs/inode.c | 8 -------- fs/btrfs/super.c | 9 ++++++++- 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dcb510f38dda..4f2d367e1207 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -18,6 +18,8 @@ #include "tree-mod-log.h" #include "tree-checker.h" +static struct kmem_cache *btrfs_path_cachep; + static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -4933,3 +4935,18 @@ int btrfs_previous_extent_item(struct btrfs_root *root, } return 1; } + +int __init btrfs_ctree_init(void) +{ + btrfs_path_cachep = kmem_cache_create("btrfs_path", + sizeof(struct btrfs_path), 0, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_path_cachep) + return -ENOMEM; + return 0; +} + +void __cold btrfs_ctree_exit(void) +{ + kmem_cache_destroy(btrfs_path_cachep); +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index fec3d618bcef..282571b54b6d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -41,7 +41,6 @@ struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; struct btrfs_space_info; struct btrfs_block_group; -extern struct kmem_cache *btrfs_path_cachep; extern struct kmem_cache *btrfs_free_space_cachep; extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; @@ -2662,6 +2661,8 @@ void btrfs_end_write_no_snapshotting(struct btrfs_root *root); void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); /* ctree.c */ +int __init btrfs_ctree_init(void); +void __cold btrfs_ctree_exit(void); int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key, int *slot); int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 54f50784aefa..39c474e8a312 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -107,7 +107,6 @@ static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; -struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; struct kmem_cache *btrfs_free_space_bitmap_cachep; @@ -8925,7 +8924,6 @@ void __cold btrfs_destroy_cachep(void) rcu_barrier(); 
bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); - kmem_cache_destroy(btrfs_path_cachep); kmem_cache_destroy(btrfs_free_space_cachep); kmem_cache_destroy(btrfs_free_space_bitmap_cachep); } @@ -8939,12 +8937,6 @@ int __init btrfs_init_cachep(void) if (!btrfs_inode_cachep) goto fail; - btrfs_path_cachep = kmem_cache_create("btrfs_path", - sizeof(struct btrfs_path), 0, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_path_cachep) - goto fail; - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", sizeof(struct btrfs_free_space), 0, SLAB_MEM_SPREAD, NULL); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 8fe2fdb167a7..9f8b77553de0 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2743,10 +2743,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_cachep; - err = extent_state_init_cachep(); + err = btrfs_ctree_init(); if (err) goto free_transaction; + err = extent_state_init_cachep(); + if (err) + goto free_ctree; + err = extent_buffer_init_cachep(); if (err) goto free_extent_cachep; @@ -2815,6 +2819,8 @@ free_eb_cachep: extent_buffer_free_cachep(); free_extent_cachep: extent_state_free_cachep(); +free_ctree: + btrfs_ctree_exit(); free_transaction: btrfs_transaction_exit(); free_cachep: @@ -2828,6 +2834,7 @@ free_compress: static void __exit exit_btrfs_fs(void) { + btrfs_ctree_exit(); btrfs_transaction_exit(); btrfs_destroy_cachep(); btrfs_delayed_ref_exit(); From eda517fd0ceee0329bb956f701c0e124fc1fe269 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:39 -0400 Subject: [PATCH 020/249] btrfs: move free space cachep's out of ctree.h This is local to the free-space-cache.c code, remove it from ctree.h and inode.c, create new init/exit functions for the cachep, and move it locally to free-space-cache.c. 
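The one subtlety in the new helpers is error unwinding when the second cache
fails to allocate; the shape is (generic sketch, not a quote of the patch):

a_cachep = kmem_cache_create("a", sizeof(struct a), 0, 0, NULL);
if (!a_cachep)
        return -ENOMEM;
b_cachep = kmem_cache_create("b", sizeof(struct b), 0, 0, NULL);
if (!b_cachep) {
        /* Undo the first create before reporting failure. */
        kmem_cache_destroy(a_cachep);
        return -ENOMEM;
}
return 0;

Note that kmem_cache_destroy() accepts a NULL pointer, so the exit helper can
destroy both caches unconditionally.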
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 -- fs/btrfs/free-space-cache.c | 28 ++++++++++++++++++++++++++++ fs/btrfs/free-space-cache.h | 2 ++ fs/btrfs/inode.c | 16 ---------------- fs/btrfs/super.c | 9 ++++++++- 5 files changed, 38 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 282571b54b6d..ca5c67b2c025 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -41,8 +41,6 @@ struct btrfs_pending_snapshot; struct btrfs_delayed_ref_root; struct btrfs_space_info; struct btrfs_block_group; -extern struct kmem_cache *btrfs_free_space_cachep; -extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; struct btrfs_ref; struct btrfs_bio; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index f4023651dd68..bd68fafcccc7 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -29,6 +29,9 @@ #define MAX_CACHE_BYTES_PER_GIG SZ_64K #define FORCE_EXTENT_THRESHOLD SZ_1M +static struct kmem_cache *btrfs_free_space_cachep; +static struct kmem_cache *btrfs_free_space_bitmap_cachep; + struct btrfs_trim_range { u64 start; u64 bytes; @@ -4132,6 +4135,31 @@ out: return ret; } +int __init btrfs_free_space_init(void) +{ + btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", + sizeof(struct btrfs_free_space), 0, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_cachep) + return -ENOMEM; + + btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", + PAGE_SIZE, PAGE_SIZE, + SLAB_MEM_SPREAD, NULL); + if (!btrfs_free_space_bitmap_cachep) { + kmem_cache_destroy(btrfs_free_space_cachep); + return -ENOMEM; + } + + return 0; +} + +void __cold btrfs_free_space_exit(void) +{ + kmem_cache_destroy(btrfs_free_space_cachep); + kmem_cache_destroy(btrfs_free_space_bitmap_cachep); +} + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS /* * Use this if you need to make a bitmap or extent entry specifically, it diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index eaf30f6444dd..cab954a9d97b 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -88,6 +88,8 @@ struct btrfs_io_ctl { int bitmaps; }; +int __init btrfs_free_space_init(void); +void __cold btrfs_free_space_exit(void); struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, struct btrfs_path *path); int create_free_space_inode(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 39c474e8a312..50584b93a66f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -107,8 +107,6 @@ static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; -struct kmem_cache *btrfs_free_space_cachep; -struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); @@ -8924,8 +8922,6 @@ void __cold btrfs_destroy_cachep(void) rcu_barrier(); bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); - kmem_cache_destroy(btrfs_free_space_cachep); - kmem_cache_destroy(btrfs_free_space_bitmap_cachep); } int __init btrfs_init_cachep(void) @@ -8937,18 +8933,6 @@ int __init btrfs_init_cachep(void) if (!btrfs_inode_cachep) goto fail; - btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space", - sizeof(struct 
btrfs_free_space), 0, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_free_space_cachep) - goto fail; - - btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", - PAGE_SIZE, PAGE_SIZE, - SLAB_MEM_SPREAD, NULL); - if (!btrfs_free_space_bitmap_cachep) - goto fail; - if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_dio_private, bio), BIOSET_NEED_BVECS)) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9f8b77553de0..0a93fbd29494 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2747,10 +2747,14 @@ static int __init init_btrfs_fs(void) if (err) goto free_transaction; - err = extent_state_init_cachep(); + err = btrfs_free_space_init(); if (err) goto free_ctree; + err = extent_state_init_cachep(); + if (err) + goto free_free_space; + err = extent_buffer_init_cachep(); if (err) goto free_extent_cachep; @@ -2819,6 +2823,8 @@ free_eb_cachep: extent_buffer_free_cachep(); free_extent_cachep: extent_state_free_cachep(); +free_free_space: + btrfs_free_space_exit(); free_ctree: btrfs_ctree_exit(); free_transaction: @@ -2834,6 +2840,7 @@ free_compress: static void __exit exit_btrfs_fs(void) { + btrfs_free_space_exit(); btrfs_ctree_exit(); btrfs_transaction_exit(); btrfs_destroy_cachep(); From 890d2b1aa38b9101503194855aeee897ba6cdbc2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:40 -0400 Subject: [PATCH 021/249] btrfs: move btrfs_next_old_item into ctree.c This uses btrfs_header_nritems, which I will be moving out of ctree.h. In order to avoid needing to include the relevant header in ctree.h, simply move this helper function into ctree.c. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ rename parameters ] Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 8 ++++++++ fs/btrfs/ctree.h | 9 +-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4f2d367e1207..5f461e05cc9c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4852,6 +4852,14 @@ done: return ret; } +int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq) +{ + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) + return btrfs_next_old_leaf(root, path, time_seq); + return 0; +} + /* * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps * searching until it gets past min_objectid or finds an item of 'type' diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ca5c67b2c025..c8ced88f2143 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2821,14 +2821,7 @@ int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, (path)->slots[0]++ \ ) -static inline int btrfs_next_old_item(struct btrfs_root *root, - struct btrfs_path *p, u64 time_seq) -{ - ++p->slots[0]; - if (p->slots[0] >= btrfs_header_nritems(p->nodes[0])) - return btrfs_next_old_leaf(root, p, time_seq); - return 0; -} +int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq); /* * Search the tree again to find a leaf with greater keys. From 7a66eda351bacf2f88801204cfa5319dda3ec75f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 14 Sep 2022 11:06:41 -0400 Subject: [PATCH 022/249] btrfs: move the btrfs_verity_descriptor_item defs up in ctree.h These are wrapped in CONFIG_FS_VERITY, but we can have the definitions without verity enabled. Move these definitions up with the other accessor helpers. 
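The rule being applied, sketched generically: the BTRFS_SETGET_*() helpers
only read and write on-disk item fields, so they are safe to define whether or
not the feature is compiled in; only the feature's operational API needs to
stay behind the config guard, typically with an inline no-op fallback (the
return value of the stub below is an assumption for illustration):

/* Pure on-disk accessor: always defined. */
BTRFS_SETGET_FUNCS(verity_descriptor_size,
                   struct btrfs_verity_descriptor_item, size, 64);

#ifdef CONFIG_FS_VERITY
int btrfs_drop_verity_items(struct btrfs_inode *inode);
#else
static inline int btrfs_drop_verity_items(struct btrfs_inode *inode)
{
        return 0;
}
#endif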
Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c8ced88f2143..c0c0dc8fa99f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2523,6 +2523,16 @@ BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, struct btrfs_dev_replace_item, cursor_right, 64); +/* btrfs_verity_descriptor_item */ +BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, + encryption, 8); +BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, + size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, + struct btrfs_verity_descriptor_item, encryption, 8); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, + struct btrfs_verity_descriptor_item, size, 64); + /* helper function to cast into the data area of the leaf. */ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(BTRFS_LEAF_DATA_OFFSET + \ @@ -3720,15 +3730,6 @@ extern const struct fsverity_operations btrfs_verityops; int btrfs_drop_verity_items(struct btrfs_inode *inode); int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size); -BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, - encryption, 8); -BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, - size, 64); -BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, - struct btrfs_verity_descriptor_item, encryption, 8); -BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, - struct btrfs_verity_descriptor_item, size, 64); - #else static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) From 765c3fe99bcda005d66c159f9a46fc2e0c77c8ce Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 9 Sep 2022 09:35:01 -0400 Subject: [PATCH 023/249] btrfs: introduce BTRFS_RESERVE_FLUSH_EMERGENCY Inside of FB, as well as some user reports, we've had a consistent problem of occasional ENOSPC transaction aborts. Inside FB we were seeing ~100-200 ENOSPC aborts per day in the fleet, which is a really low occurrence rate given the size of our fleet, but it's not nothing. There are two causes of this particular problem. First is delayed allocation. The reservation system for delalloc assumes that contiguous dirty ranges will result in 1 file extent item. However if there is memory pressure that results in fragmented writeout, or there is fragmentation in the block groups, this won't necessarily be true. Consider the case where we do a single 256MiB write to a file and then close it. We will have 1 reservation for the inode update, the reservations for the checksum updates, and 1 reservation for the file extent item. At some point later we decide to write this entire range out, but we're so fragmented that we break this into 100 different file extents. Since we've already closed the file and are no longer writing to it there's nothing to trigger a refill of the delalloc block rsv to satisfy the 99 new file extent reservations we need. At this point we exhaust our delalloc reservation, and we begin to steal from the global reserve. If you have enough of these cases going in parallel you can easily exhaust the global reserve, get an ENOSPC at btrfs_alloc_tree_block() time, and then abort the transaction. The other case is the delayed refs reserve. 
The delayed refs reserve updates its size based on outstanding delayed refs and dirty block groups. However we only refill this block reserve when returning excess reservations and when we call btrfs_start_transaction(root, X). We will reserve 2*X credits at transaction start time, and fill in X into the delayed refs reserve to make sure it stays topped off. Generally this works well, but clearly has downsides. If we do a particularly delayed ref heavy operation we may never catch up in our reservations. Additionally running delayed refs generates more delayed refs, and at that point we may be committing the transaction and have no way to trigger a refill of our delayed refs rsv. Then a similar thing occurs with the delalloc reserve. Generally speaking we well over-reserve in all of our block rsvs. If we reserve 1 credit we're usually reserving around 264k of space, but we'll often not use any of that reservation, or use a few blocks of that reservation. We can be reasonably sure that as long as you were able to reserve space up front for your operation you'll be able to find space on disk for that reservation. So introduce a new flushing state, BTRFS_RESERVE_FLUSH_EMERGENCY. This gets used in the case that we've exhausted our reserve and the global reserve. It simply forces a reservation if we have enough actual space on disk to make the reservation, which is almost always the case. This keeps us from hitting ENOSPC aborts in these odd occurrences where we've not kept up with the delayed work. Fixing this in a complete way is going to be relatively complicated and time consuming. This patch is what I discussed with Filipe earlier this year, and what I put into our kernels inside FB. With this patch we're down to 1-2 ENOSPC aborts per week, which is a significant reduction. This is a decent stop gap until we can work out a more wholistic solution to these two corner cases. Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 12 ++++++++++++ fs/btrfs/space-info.c | 29 +++++++++++++++++++++++++++-- fs/btrfs/space-info.h | 18 ++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index ec96285357e0..89e3e7d1bff6 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -552,5 +552,17 @@ try_reserve: if (!ret) return global_rsv; } + + /* + * All hope is lost, but of course our reservations are overly + * pessimistic, so instead of possibly having an ENOSPC abort here, try + * one last time to force a reservation if there's enough actual space + * on disk to make the reservation. + */ + ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize, + BTRFS_RESERVE_FLUSH_EMERGENCY); + if (!ret) + return block_rsv; + return ERR_PTR(ret); } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index f171bf875633..af2e133aaa5c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1583,6 +1583,16 @@ static inline bool can_steal(enum btrfs_reserve_flush_enum flush) flush == BTRFS_RESERVE_FLUSH_EVICT); } +/* + * NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to + * fail as quickly as possible. 
+ */ +static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) +{ + return (flush != BTRFS_RESERVE_NO_FLUSH && + flush != BTRFS_RESERVE_FLUSH_EMERGENCY); +} + /** * Try to reserve bytes from the block_rsv's space * @@ -1644,6 +1654,21 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, ret = 0; } + /* + * Things are dire, we need to make a reservation so we don't abort. We + * will let this reservation go through as long as we have actual space + * left to allocate for the block. + */ + if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) { + used = btrfs_space_info_used(space_info, false); + if (used + orig_bytes <= + writable_total_bytes(fs_info, space_info)) { + btrfs_space_info_update_bytes_may_use(fs_info, space_info, + orig_bytes); + ret = 0; + } + } + /* * If we couldn't make a reservation then setup our reservation ticket * and kick the async worker if it's not already running. @@ -1651,7 +1676,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * If we are a priority flusher then we just need to add our ticket to * the list and we will do our own flushing further down. */ - if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { + if (ret && can_ticket(flush)) { ticket.bytes = orig_bytes; ticket.error = 0; space_info->reclaim_size += ticket.bytes; @@ -1701,7 +1726,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, } } spin_unlock(&space_info->lock); - if (!ret || flush == BTRFS_RESERVE_NO_FLUSH) + if (!ret || !can_ticket(flush)) return ret; return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns, diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 7e17bb803436..f28bd2c051d4 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -48,6 +48,24 @@ enum btrfs_reserve_flush_enum { * Can be interrupted by a fatal signal. */ BTRFS_RESERVE_FLUSH_ALL_STEAL, + + /* + * This is for btrfs_use_block_rsv only. We have exhausted our block + * rsv and our global block rsv. This can happen for things like + * delalloc where we are overwriting a lot of extents with a single + * extent and didn't reserve enough space. Alternatively it can happen + * with delalloc where we reserve 1 extents worth for a large extent but + * fragmentation leads to multiple extents being created. This will + * give us the reservation in the case of + * + * if (num_bytes < (space_info->total_bytes - + * btrfs_space_info_used(space_info, false)) + * + * Which ignores bytes_may_use. This is potentially dangerous, but our + * reservation system is generally pessimistic so is able to absorb this + * style of mistake. + */ + BTRFS_RESERVE_FLUSH_EMERGENCY, }; enum btrfs_flush_state { From ff2b64a22a2efcc087520e94ad06b005268a5f9d Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 10 Oct 2022 18:36:08 +0800 Subject: [PATCH 024/249] btrfs: raid56: cleanup for function __free_raid_bio() The cleanup involves two things: - Remove the "__" prefix There is no naming confliction. - Remove the forward declaration There is no special function call involved. 
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 61 +++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 82c8e991300e..371b2a182544 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -69,7 +69,6 @@ static void rmw_work(struct work_struct *work); static void read_rebuild_work(struct work_struct *work); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); -static void __free_raid_bio(struct btrfs_raid_bio *rbio); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); @@ -77,6 +76,28 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); static void scrub_parity_work(struct work_struct *work); +static void free_raid_bio(struct btrfs_raid_bio *rbio) +{ + int i; + + if (!refcount_dec_and_test(&rbio->refs)) + return; + + WARN_ON(!list_empty(&rbio->stripe_cache)); + WARN_ON(!list_empty(&rbio->hash_list)); + WARN_ON(!bio_list_empty(&rbio->bio_list)); + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) { + __free_page(rbio->stripe_pages[i]); + rbio->stripe_pages[i] = NULL; + } + } + + btrfs_put_bioc(rbio->bioc); + kfree(rbio); +} + static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) { INIT_WORK(&rbio->work, work_func); @@ -336,7 +357,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) spin_unlock(&h->lock); if (freeit) - __free_raid_bio(rbio); + free_raid_bio(rbio); } /* @@ -684,7 +705,7 @@ out: if (cache_drop) remove_rbio_from_cache(cache_drop); if (freeit) - __free_raid_bio(freeit); + free_raid_bio(freeit); return ret; } @@ -769,28 +790,6 @@ done_nolock: remove_rbio_from_cache(rbio); } -static void __free_raid_bio(struct btrfs_raid_bio *rbio) -{ - int i; - - if (!refcount_dec_and_test(&rbio->refs)) - return; - - WARN_ON(!list_empty(&rbio->stripe_cache)); - WARN_ON(!list_empty(&rbio->hash_list)); - WARN_ON(!bio_list_empty(&rbio->bio_list)); - - for (i = 0; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) { - __free_page(rbio->stripe_pages[i]); - rbio->stripe_pages[i] = NULL; - } - } - - btrfs_put_bioc(rbio->bioc); - kfree(rbio); -} - static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) { struct bio *next; @@ -830,7 +829,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) */ unlock_stripe(rbio); extra = bio_list_get(&rbio->bio_list); - __free_raid_bio(rbio); + free_raid_bio(rbio); rbio_endio_bio_list(cur, err); if (extra) @@ -1731,7 +1730,7 @@ static void run_plug(struct btrfs_plug_cb *plug) if (last) { if (rbio_can_merge(last, cur)) { merge_rbio(last, cur); - __free_raid_bio(cur); + free_raid_bio(cur); continue; } @@ -1822,7 +1821,7 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) if (rbio_is_full(rbio)) { ret = full_stripe_write(rbio); if (ret) { - __free_raid_bio(rbio); + free_raid_bio(rbio); goto fail; } return; @@ -1839,7 +1838,7 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) } else { ret = __raid56_parity_write(rbio); if (ret) { - __free_raid_bio(rbio); + free_raid_bio(rbio); goto fail; } } @@ -2214,7 +2213,7 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has 
map_type %llu)", __func__, bio->bi_iter.bi_sector << 9, (u64)bio->bi_iter.bi_size, bioc->map_type); - __free_raid_bio(rbio); + free_raid_bio(rbio); bio->bi_status = BLK_STS_IOERR; goto out_end_bio; } @@ -2747,7 +2746,7 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) btrfs_warn_rl(fs_info, "can not determine the failed stripe number for full stripe %llu", bioc->raid_map[0]); - __free_raid_bio(rbio); + free_raid_bio(rbio); return NULL; } From 797d74b749850a5b81c47560caec26e00e7e3768 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 10 Oct 2022 18:36:09 +0800 Subject: [PATCH 025/249] btrfs: raid56: allocate memory separately for rbio pointers Currently inside alloc_rbio(), we allocate a larger memory to contain the following members: - struct btrfs_raid_rbio itself - stripe_pages array - bio_sectors array - stripe_sectors array - finish_pointers array Then update rbio pointers to point the extra space after the rbio structure itself. Thus it introduced a complex CONSUME_ALLOC() macro to help the thing. This is too hacky, and is going to make later pointers expansion harder. This patch will change it to use regular kcalloc() for each pointer inside btrfs_raid_bio, making the later expansion much easier. And introduce a helper free_raid_bio_pointers() to free up all the pointer members in btrfs_raid_bio, which will be used in both free_raid_bio() and error path of alloc_rbio(). Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 371b2a182544..4ec211a58f15 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -76,6 +76,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); static void scrub_parity_work(struct work_struct *work); +static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) +{ + kfree(rbio->stripe_pages); + kfree(rbio->bio_sectors); + kfree(rbio->stripe_sectors); + kfree(rbio->finish_pointers); +} + static void free_raid_bio(struct btrfs_raid_bio *rbio) { int i; @@ -95,6 +103,7 @@ static void free_raid_bio(struct btrfs_raid_bio *rbio) } btrfs_put_bioc(rbio->bioc); + free_raid_bio_pointers(rbio); kfree(rbio); } @@ -918,7 +927,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; - void *p; /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); @@ -928,14 +936,23 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, */ ASSERT(stripe_nsectors <= BITS_PER_LONG); - rbio = kzalloc(sizeof(*rbio) + - sizeof(*rbio->stripe_pages) * num_pages + - sizeof(*rbio->bio_sectors) * num_sectors + - sizeof(*rbio->stripe_sectors) * num_sectors + - sizeof(*rbio->finish_pointers) * real_stripes, - GFP_NOFS); + rbio = kzalloc(sizeof(*rbio), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); + rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), + GFP_NOFS); + rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), + GFP_NOFS); + rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), + GFP_NOFS); + rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); + + if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || + 
!rbio->finish_pointers) { + free_raid_bio_pointers(rbio); + kfree(rbio); + return ERR_PTR(-ENOMEM); + } bio_list_init(&rbio->bio_list); INIT_LIST_HEAD(&rbio->plug_list); @@ -955,21 +972,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, atomic_set(&rbio->error, 0); atomic_set(&rbio->stripes_pending, 0); - /* - * The stripe_pages, bio_sectors, etc arrays point to the extra memory - * we allocated past the end of the rbio. - */ - p = rbio + 1; -#define CONSUME_ALLOC(ptr, count) do { \ - ptr = p; \ - p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ - } while (0) - CONSUME_ALLOC(rbio->stripe_pages, num_pages); - CONSUME_ALLOC(rbio->bio_sectors, num_sectors); - CONSUME_ALLOC(rbio->stripe_sectors, num_sectors); - CONSUME_ALLOC(rbio->finish_pointers, real_stripes); -#undef CONSUME_ALLOC - ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); From 88074c8b1376ac315ef4a294db82d861df074ef2 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 10 Oct 2022 18:36:10 +0800 Subject: [PATCH 026/249] btrfs: raid56: make it more explicit that cache rbio should have all its data sectors uptodate For Btrfs RAID56, we have a caching system for btrfs raid bios (rbio). We call cache_rbio_pages() to mark a qualified rbio ready for cache. The timing happens at: - finish_rmw() At this timing, we have already read all necessary sectors, along with the rbio sectors, we have covered all data stripes. - __raid_recover_end_io() At this timing, we have rebuild the rbio, thus all data sectors involved (either from stripe or bio list) are uptodate now. Thus at the timing of cache_rbio_pages(), we should have all data sectors uptodate. This patch will make it explicit that all data sectors are uptodate at cache_rbio_pages() timing, mostly to prepare for the incoming verification at RMW time. This patch will add: - Extra ASSERT()s in cache_rbio_pages() This is to make sure all data sectors, which are not covered by bio, are already uptodate. - Extra ASSERT()s in steal_rbio() Since only cached rbio can be stolen, thus every data sector should already be uptodate in the source rbio. - Update __raid_recover_end_io() to update recovered sector->uptodate Previously __raid_recover_end_io() will only mark failed sectors uptodate if it's doing an RMW. But this can trigger new ASSERT()s, as for recovery case, a recovered failed sector will not be marked uptodate, and trigger ASSERT() in later cache_rbio_pages() call. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 70 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 51 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 4ec211a58f15..c009c0a2081e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -176,8 +176,16 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ - if (!rbio->bio_sectors[i].page) + if (!rbio->bio_sectors[i].page) { + /* + * Even if the sector is not covered by bio, if it is + * a data sector it should still be uptodate as it is + * read from disk. 
+ */ + if (i < rbio->nr_data * rbio->stripe_nsectors) + ASSERT(rbio->stripe_sectors[i].uptodate); continue; + } ASSERT(rbio->stripe_sectors[i].page); memcpy_page(rbio->stripe_sectors[i].page, @@ -264,6 +272,21 @@ static void steal_rbio_page(struct btrfs_raid_bio *src, dest->stripe_sectors[i].uptodate = true; } +static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) +{ + const int sector_nr = (page_nr << PAGE_SHIFT) >> + rbio->bioc->fs_info->sectorsize_bits; + + /* + * We have ensured PAGE_SIZE is aligned with sectorsize, thus + * we won't have a page which is half data half parity. + * + * Thus if the first sector of the page belongs to data stripes, then + * the full page belongs to data stripes. + */ + return (sector_nr < rbio->nr_data * rbio->stripe_nsectors); +} + /* * Stealing an rbio means taking all the uptodate pages from the stripe array * in the source rbio and putting them into the destination rbio. @@ -274,16 +297,26 @@ static void steal_rbio_page(struct btrfs_raid_bio *src, static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { int i; - struct page *s; if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) return; for (i = 0; i < dest->nr_pages; i++) { - s = src->stripe_pages[i]; - if (!s || !full_page_sectors_uptodate(src, i)) + struct page *p = src->stripe_pages[i]; + + /* + * We don't need to steal P/Q pages as they will always be + * regenerated for RMW or full write anyway. + */ + if (!is_data_stripe_page(src, i)) continue; + /* + * If @src already has RBIO_CACHE_READY_BIT, it should have + * all data stripe pages present and uptodate. + */ + ASSERT(p); + ASSERT(full_page_sectors_uptodate(src, i)); steal_rbio_page(src, dest, i); } index_stripe_sectors(dest); @@ -2003,22 +2036,21 @@ pstripe: /* xor in the rest */ run_xor(pointers, rbio->nr_data - 1, sectorsize); } - /* if we're doing this rebuild as part of an rmw, go through - * and set all of our private rbio pages in the - * failed stripes as uptodate. This way finish_rmw will - * know they can be trusted. If this was a read reconstruction, - * other endio functions will fiddle the uptodate bits + + /* + * No matter if this is a RMW or recovery, we should have all + * failed sectors repaired, thus they are now uptodate. + * Especially if we determine to cache the rbio, we need to + * have at least all data sectors uptodate. */ - if (rbio->operation == BTRFS_RBIO_WRITE) { - for (i = 0; i < rbio->stripe_nsectors; i++) { - if (faila != -1) { - sector = rbio_stripe_sector(rbio, faila, i); - sector->uptodate = 1; - } - if (failb != -1) { - sector = rbio_stripe_sector(rbio, failb, i); - sector->uptodate = 1; - } + for (i = 0; i < rbio->stripe_nsectors; i++) { + if (faila != -1) { + sector = rbio_stripe_sector(rbio, faila, i); + sector->uptodate = 1; + } + if (failb != -1) { + sector = rbio_stripe_sector(rbio, failb, i); + sector->uptodate = 1; } } for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--) From d47704bd1c78c85831561bcf701b90dd66f811b2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:54 +0100 Subject: [PATCH 027/249] btrfs: get the next extent map during fiemap/lseek more efficiently At find_delalloc_subrange(), when we need to get the next extent map, we do a full search on the extent map tree (a red black tree). This is fine but it's a lot more efficient to simply use rb_next(), which typically requires iterating over less nodes of the tree and never needs to compare the ranges of nodes with the one we are looking for. 
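The difference, in a minimal sketch (illustrative, not the final helper): a
fresh lookup walks the tree top-down and compares the target range at every
level, while rb_next() just steps from the current node to its in-order
successor with no comparisons at all:

/* Full search: O(log n) range comparisons starting from the root. */
next_em = lookup_extent_mapping(em_tree, em_end, len - em_end);

/* Successor step: follow node links from where we already are. */
struct rb_node *next = rb_next(&em->rb_node);
if (next)
        next_em = rb_entry(next, struct extent_map, rb_node);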
So add a public helper to extent_map.{h,c} to get the extent map that immediately follows another extent map, using rb_next(), and use that helper at find_delalloc_subrange(). Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 31 +++++++++++++++++++++++++++++- fs/btrfs/extent_map.h | 2 ++ fs/btrfs/file.c | 44 ++++++++++++++++++++++++++----------------- 3 files changed, 59 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6092a4eedc92..715979807ae1 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -523,7 +523,7 @@ void replace_extent_mapping(struct extent_map_tree *tree, setup_extent_mapping(tree, new, modified); } -static struct extent_map *next_extent_map(struct extent_map *em) +static struct extent_map *next_extent_map(const struct extent_map *em) { struct rb_node *next; @@ -533,6 +533,35 @@ static struct extent_map *next_extent_map(struct extent_map *em) return container_of(next, struct extent_map, rb_node); } +/* + * Get the extent map that immediately follows another one. + * + * @tree: The extent map tree that the extent map belong to. + * Holding read or write access on the tree's lock is required. + * @em: An extent map from the given tree. The caller must ensure that + * between getting @em and between calling this function, the + * extent map @em is not removed from the tree - for example, by + * holding the tree's lock for the duration of those 2 operations. + * + * Returns the extent map that immediately follows @em, or NULL if @em is the + * last extent map in the tree. + */ +struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree, + const struct extent_map *em) +{ + struct extent_map *next; + + /* The lock must be acquired either in read mode or write mode. */ + lockdep_assert_held(&tree->lock); + ASSERT(extent_map_in_tree(em)); + + next = next_extent_map(em); + if (next) + refcount_inc(&next->refs); + + return next; +} + static struct extent_map *prev_extent_map(struct extent_map *em) { struct rb_node *prev; diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index ad311864272a..68d3f2c9ea1d 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -87,6 +87,8 @@ static inline u64 extent_map_block_end(struct extent_map *em) void extent_map_tree_init(struct extent_map_tree *tree); struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); +struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree, + const struct extent_map *em); int add_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified); void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 493cae66e5e6..6685cb0e928c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3569,40 +3569,50 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end */ read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); - read_unlock(&em_tree->lock); + if (!em) { + read_unlock(&em_tree->lock); + return (delalloc_len > 0); + } /* extent_map_end() returns a non-inclusive end offset. */ - em_end = em ? extent_map_end(em) : 0; + em_end = extent_map_end(em); /* * If we have a hole/prealloc extent map, check the next one if this one * ends before our range's end. 
*/ - if (em && (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) { + if ((em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) { struct extent_map *next_em; - read_lock(&em_tree->lock); - next_em = lookup_extent_mapping(em_tree, em_end, len - em_end); - read_unlock(&em_tree->lock); - + next_em = btrfs_next_extent_map(em_tree, em); free_extent_map(em); - em_end = next_em ? extent_map_end(next_em) : 0; + + /* + * There's no next extent map or the next one starts beyond our + * range, return the range found in the io tree (if any). + */ + if (!next_em || next_em->start > end) { + read_unlock(&em_tree->lock); + free_extent_map(next_em); + return (delalloc_len > 0); + } + + em_end = extent_map_end(next_em); em = next_em; } - if (em && (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - free_extent_map(em); - em = NULL; - } + read_unlock(&em_tree->lock); /* - * No extent map or one for a hole or prealloc extent. Use the delalloc - * range we found in the io tree if we have one. + * We have a hole or prealloc extent that ends at or beyond our range's + * end, return the range found in the io tree (if any). */ - if (!em) + if (em->block_start == EXTENT_MAP_HOLE || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + free_extent_map(em); return (delalloc_len > 0); + } /* * We don't have any range as EXTENT_DELALLOC in the io tree, so the From 013f9c70d293d7a47aaaeaee248ca60f745fa450 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:55 +0100 Subject: [PATCH 028/249] btrfs: skip unnecessary extent map searches during fiemap and lseek If we have no outstanding extents it means we don't have any extent maps corresponding to delalloc that is flushing, as when an ordered extent is created we increment the number of outstanding extents to 1 and when we remove the ordered extent we decrement them by 1. So skip extent map tree searches if the number of outstanding ordered extents is 0, saving time as the tree is not empty if we have previously made some reads or flushed delalloc, as in those cases it can have a very large number of extent maps for files with many extents. This helps save time when processing a file range corresponding to a hole or prealloc (unwritten) extent. The next patch in the series has a performance test in its changelog and its subject is: "btrfs: skip unnecessary delalloc search during fiemap and lseek" Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6685cb0e928c..1cf97280760e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3553,6 +3553,18 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end if (delalloc_len > 0) *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; + spin_lock(&inode->lock); + if (inode->outstanding_extents == 0) { + /* + * No outstanding extents means we don't have any delalloc that + * is flushing, so return the unflushed range found in the io + * tree (if any). + */ + spin_unlock(&inode->lock); + return (delalloc_len > 0); + } + spin_unlock(&inode->lock); + /* * Now also check if there's any extent map in the range that does not * map to a hole or prealloc extent. 
We do this because: From a2853ffc2eb9bfb6e6d48486df7f3969a58ae3b4 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:56 +0100 Subject: [PATCH 029/249] btrfs: skip unnecessary delalloc search during fiemap and lseek During fiemap and lseek (hole and data seeking), there's no point in iterating the inode's io tree to count delalloc bits if the inode's delalloc bytes counter has a value of zero, as that counter is updated whenever we set a range for delalloc or clear a range from delalloc. So skip the counting and io tree iteration if the inode's delalloc bytes counter has a value of zero. This helps save time when processing a file range corresponding to a hole or prealloc (unwritten) extent. This patch is part of a series comprised of the following patches: btrfs: get the next extent map during fiemap/lseek more efficiently btrfs: skip unnecessary extent map searches during fiemap and lseek btrfs: skip unnecessary delalloc search during fiemap and lseek The following test was performed on a release kernel (Debian's default kernel config) before and after applying those 3 patches. # Wrapper to call fiemap in extent count only mode. # (struct fiemap::fm_extent_count set to 0) $ cat fiemap.c #include #include #include #include #include #include #include #include #include int main(int argc, char **argv) { struct fiemap fiemap = { 0 }; int fd; if (argc != 2) { printf("usage: %s \n", argv[0]); return 1; } fd = open(argv[1], O_RDONLY); if (fd < 0) { fprintf(stderr, "error opening file: %s\n", strerror(errno)); return 1; } /* fiemap.fm_extent_count set to 0, to count extents only. */ fiemap.fm_length = FIEMAP_MAX_OFFSET; if (ioctl(fd, FS_IOC_FIEMAP, &fiemap) < 0) { fprintf(stderr, "fiemap error: %s\n", strerror(errno)); return 1; } close(fd); printf("fm_mapped_extents = %d\n", fiemap.fm_mapped_extents); return 0; } $ gcc -o fiemap fiemap.c And the wrapper shell script that creates a file with many holes and runs fiemap against it: $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi mkfs.btrfs -f $DEV mount $DEV $MNT FILE_SIZE=$((1 * 1024 * 1024 * 1024)) echo -n > $MNT/foobar for ((off = 0; off < $FILE_SIZE; off += 8192)); do xfs_io -c "pwrite -S 0xab $off 4K" $MNT/foobar > /dev/null done # flush all delalloc sync start=$(date +%s%N) ./fiemap $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds" umount $MNT Result before applying patchset: fm_mapped_extents = 131072 fiemap took 63 milliseconds Result after applying patchset: fm_mapped_extents = 131072 fiemap took 39 milliseconds (-38.1%) Running the same test for a 512M file instead of a 1G file, gave the following results. Result before applying patchset: fm_mapped_extents = 65536 fiemap took 29 milliseconds Result after applying patchset: fm_mapped_extents = 65536 fiemap took 20 milliseconds (-31.0%) Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 1cf97280760e..19b41b5fe6c0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3537,15 +3537,27 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end struct extent_map *em; u64 em_end; u64 delalloc_len; + unsigned int outstanding_extents; /* * Search the io tree first for EXTENT_DELALLOC. If we find any, it * means we have delalloc (dirty pages) for which writeback has not * started yet. 
*/ - *delalloc_start_ret = start; - delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end, - len, EXTENT_DELALLOC, 1); + spin_lock(&inode->lock); + outstanding_extents = inode->outstanding_extents; + + if (inode->delalloc_bytes > 0) { + spin_unlock(&inode->lock); + *delalloc_start_ret = start; + delalloc_len = count_range_bits(&inode->io_tree, + delalloc_start_ret, end, + len, EXTENT_DELALLOC, 1); + } else { + spin_unlock(&inode->lock); + delalloc_len = 0; + } + /* * If delalloc was found then *delalloc_start_ret has a sector size * aligned value (rounded down). @@ -3553,17 +3565,12 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end if (delalloc_len > 0) *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1; - spin_lock(&inode->lock); - if (inode->outstanding_extents == 0) { - /* - * No outstanding extents means we don't have any delalloc that - * is flushing, so return the unflushed range found in the io - * tree (if any). - */ - spin_unlock(&inode->lock); + /* + * No outstanding extents means we don't have any delalloc that is + * flushing, so return the unflushed range found in the io tree (if any). + */ + if (outstanding_extents == 0) return (delalloc_len > 0); - } - spin_unlock(&inode->lock); /* * Now also check if there's any extent map in the range that does not From b98c6cd59e90fe659cf966859bc4e1c03aea347b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:57 +0100 Subject: [PATCH 030/249] btrfs: drop pointless memset when cloning extent buffer At btrfs_clone_extent_buffer(), before allocating the pages array for the new extent buffer we are calling memset() to zero out the pages array of the extent buffer. This is pointless however, because the extent buffer already has every element in its pages array pointing to NULL, as it was allocated with kmem_cache_zalloc(). The memset() was introduced with commit dd137dd1f2d719 ("btrfs: factor out allocating an array of pages"), but even before that commit we already depended on the pages array being initialized to NULL for the error paths that need to call btrfs_release_extent_buffer(). So remove the memset(), it's useless and slightly increases the object text size. Before this change: $ size fs/btrfs/extent_io.o text data bss dec hex filename 70580 5469 40 76089 12939 fs/btrfs/extent_io.o After this change: $ size fs/btrfs/extent_io.o text data bss dec hex filename 70564 5469 40 76073 12929 fs/btrfs/extent_io.o Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7891d375eb43..56a9f591f479 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4299,7 +4299,6 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); - memset(new->pages, 0, sizeof(*new->pages) * num_pages); ret = btrfs_alloc_page_array(num_pages, new->pages); if (ret) { btrfs_release_extent_buffer(new); From 206c1d32f381ee91ba849a7dcb28728e8c3721b6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:16:58 +0100 Subject: [PATCH 031/249] btrfs: drop redundant bflags initialization when allocating extent buffer When allocating an extent buffer, at __alloc_extent_buffer(), there's no point in explicitly assigning zero to the bflags field of the new extent buffer because we allocated it with kmem_cache_zalloc(). 
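For reference, this is generic slab behaviour rather than anything
btrfs-specific: kmem_cache_zalloc() is kmem_cache_alloc() plus __GFP_ZERO, so
every field of the returned object already reads as zero. A one-line sketch
(the cachep name here is illustrative):

eb = kmem_cache_zalloc(cachep, GFP_NOFS);
/* eb->bflags is already 0 at this point, no explicit assignment needed. */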
So just remove the redundant initialization; it saves one mov instruction
in the generated assembly code for x86_64 ("movq $0x0,0x10(%rax)").

Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/extent_io.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 56a9f591f479..3192f094fe6c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4266,7 +4266,6 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
 	eb->start = start;
 	eb->len = len;
 	eb->fs_info = fs_info;
-	eb->bflags = 0;
 	init_rwsem(&eb->lock);
 
 	btrfs_leak_debug_add_eb(eb);

From c902421927ff602954d2ecc7bd6e71b255d29387 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 11 Oct 2022 13:16:59 +0100
Subject: [PATCH 032/249] btrfs: remove checks for a root with id 0 during backref walking

When doing backref walking to determine if an extent is shared, we are
testing if the root_objectid of the given share_check struct is 0, but
that is an impossible case, since btrfs_is_data_extent_shared() always
initializes the root_objectid field with the id of the given root, and
no root can have an objectid of 0. So remove those checks.

Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/backref.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 18374a6d05bd..abac2e942e8b 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -719,8 +719,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 			continue;
 		}
 
-		if (sc && sc->root_objectid &&
-		    ref->root_id != sc->root_objectid) {
+		if (sc && ref->root_id != sc->root_objectid) {
 			free_pref(ref);
 			ret = BACKREF_FOUND_SHARED;
 			goto out;
@@ -1348,8 +1347,7 @@ again:
 	 * and would retain their original ref->count < 0.
 	 */
 	if (roots && ref->count && ref->root_id && ref->parent == 0) {
-		if (sc && sc->root_objectid &&
-		    ref->root_id != sc->root_objectid) {
+		if (sc && ref->root_id != sc->root_objectid) {
 			ret = BACKREF_FOUND_SHARED;
 			goto out;
 		}

From a0a5472ad802d99d3fb4b361cc3fb5ea24914ee0 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 11 Oct 2022 13:17:00 +0100
Subject: [PATCH 033/249] btrfs: remove checks for a 0 inode number during backref walking

When doing backref walking to determine if an extent is shared, we are
testing if the inode number, stored in the 'inum' field of struct
share_check, is 0. However that can never be the case, since all
instances of the structure are created at btrfs_is_data_extent_shared(),
which always initializes it with the inode number from a fs tree (and
the number for any inode from any tree can never be 0). So remove the
checks.
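To make the invariant both of these patches rely on concrete, here is a
small, self-contained userspace sketch. The struct layout and helper
names below are made up for the illustration; only share_check and
btrfs_is_data_extent_shared() correspond to real kernel symbols:

  #include <assert.h>
  #include <stdbool.h>
  #include <stdint.h>

  /* Simplified stand-in for the kernel's struct share_check. */
  struct share_check {
          uint64_t root_objectid;
          uint64_t inum;
  };

  /*
   * Models how btrfs_is_data_extent_shared() sets up the check: both
   * fields always come from a live root and inode, so neither is 0.
   */
  static struct share_check make_share_check(uint64_t root_id, uint64_t ino)
  {
          struct share_check sc = { .root_objectid = root_id, .inum = ino };

          assert(sc.root_objectid != 0 && sc.inum != 0);
          return sc;
  }

  /*
   * With the invariant above, testing sc->inum (or sc->root_objectid)
   * for non-zero before the comparison is dead code, hence the removals.
   */
  static bool ref_is_foreign(const struct share_check *sc, uint64_t key_objectid)
  {
          return sc && key_objectid != sc->inum;
  }

  int main(void)
  {
          struct share_check sc = make_share_check(5, 257);

          return ref_is_foreign(&sc, 258) ? 0 : 1;
  }

Because both fields are always taken from a live root and inode, the
extra zero tests could never change the outcome of the comparison.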
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index abac2e942e8b..9c7b3e1e4762 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1051,7 +1051,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum && + if (sc && key.objectid != sc->inum && !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; @@ -1152,7 +1152,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); - if (sc && sc->inum && key.objectid != sc->inum && + if (sc && key.objectid != sc->inum && !sc->have_delayed_delete_refs) { ret = BACKREF_FOUND_SHARED; break; From ceb707da9ad92ad3a5251dc13844034ded06cb3d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:01 +0100 Subject: [PATCH 034/249] btrfs: directly pass the inode to btrfs_is_data_extent_shared() Currently we pass a root and an inode number as arguments for btrfs_is_data_extent_shared() and the inode number is always from an inode that belongs to that root (it wouldn't make sense otherwise). In every context that we call btrfs_is_data_extent_shared() (fiemap only), we have an inode available, so directly pass the inode to the function instead of a root and inode number. This reduces the number of parameters and it makes the function's signature conform to most other functions we have. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 8 ++++---- fs/btrfs/backref.h | 2 +- fs/btrfs/extent_io.c | 16 +++++++--------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9c7b3e1e4762..efdeb69e8ed6 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1658,8 +1658,7 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, /* * Check if a data extent is shared or not. * - * @root: The root the inode belongs to. - * @inum: Number of the inode whose extent we are checking. + * @inode: The inode whose extent we are checking. * @bytenr: Logical bytenr of the extent we are checking. * @extent_gen: Generation of the extent (file extent item) or 0 if it is * not known. @@ -1678,11 +1677,12 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, * * Return: 0 if extent is not shared, 1 if it is shared, < 0 on error. 
*/ -int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, +int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct ulist *roots, struct ulist *tmp, struct btrfs_backref_shared_cache *cache) { + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; struct ulist_iterator uiter; @@ -1691,7 +1691,7 @@ int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, int ret = 0; struct share_check shared = { .root_objectid = root->root_key.objectid, - .inum = inum, + .inum = btrfs_ino(inode), .share_count = 0, .have_delayed_delete_refs = false, }; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 8e69584d538d..c846fa2bdf93 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -77,7 +77,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, u64 *found_off); -int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr, +int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct ulist *roots, struct ulist *tmp, struct btrfs_backref_shared_cache *cache); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3192f094fe6c..83be7e85ba91 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3712,7 +3712,6 @@ static int fiemap_process_hole(struct btrfs_inode *inode, u64 start, u64 end) { const u64 i_size = i_size_read(&inode->vfs_inode); - const u64 ino = btrfs_ino(inode); u64 cur_offset = start; u64 last_delalloc_end = 0; u32 prealloc_flags = FIEMAP_EXTENT_UNWRITTEN; @@ -3752,8 +3751,8 @@ static int fiemap_process_hole(struct btrfs_inode *inode, if (prealloc_len > 0) { if (!checked_extent_shared && fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(inode->root, - ino, disk_bytenr, + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, extent_gen, roots, tmp_ulist, backref_cache); @@ -3802,8 +3801,8 @@ static int fiemap_process_hole(struct btrfs_inode *inode, } if (!checked_extent_shared && fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(inode->root, - ino, disk_bytenr, + ret = btrfs_is_data_extent_shared(inode, + disk_bytenr, extent_gen, roots, tmp_ulist, backref_cache); @@ -3904,7 +3903,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, const u64 ino = btrfs_ino(inode); struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_root *root = inode->root; struct fiemap_cache cache = { 0 }; struct btrfs_backref_shared_cache *backref_cache; struct ulist *roots; @@ -3925,8 +3923,8 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, goto out; } - lockstart = round_down(start, root->fs_info->sectorsize); - lockend = round_up(start + len, root->fs_info->sectorsize); + lockstart = round_down(start, inode->root->fs_info->sectorsize); + lockend = round_up(start + len, inode->root->fs_info->sectorsize); prev_extent_end = lockstart; lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); @@ -4034,7 +4032,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, } else { /* We have a regular extent. 
*/ if (fieinfo->fi_extents_max) { - ret = btrfs_is_data_extent_shared(root, ino, + ret = btrfs_is_data_extent_shared(inode, disk_bytenr, extent_gen, roots, From 61dbb952f0a5f587c983d88853212e969d2d4ede Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:02 +0100 Subject: [PATCH 035/249] btrfs: turn the backref sharedness check cache into a context object Right now we are using a struct btrfs_backref_shared_cache to pass state across multiple btrfs_is_data_extent_shared() calls. The structure's name closely follows its current purpose, which is to cache previous checks for the sharedness of metadata extents. However we will start using the structure for more things other than caching sharedness checks, so rename it to struct btrfs_backref_share_check_ctx. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 32 ++++++++++++++++---------------- fs/btrfs/backref.h | 8 ++++---- fs/btrfs/extent_io.c | 24 ++++++++++++------------ 3 files changed, 32 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index efdeb69e8ed6..693c99ad4afb 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1542,13 +1542,13 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, * fs_info->commit_root_sem semaphore, so no need to worry about the root's last * snapshot field changing while updating or checking the cache. */ -static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache, +static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, struct btrfs_root *root, u64 bytenr, int level, bool *is_shared) { struct btrfs_backref_shared_cache_entry *entry; - if (!cache->use_cache) + if (!ctx->use_path_cache) return false; if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) @@ -1562,7 +1562,7 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache */ ASSERT(level >= 0); - entry = &cache->entries[level]; + entry = &ctx->path_cache_entries[level]; /* Unused cache entry or being used for some other extent buffer. */ if (entry->bytenr != bytenr) @@ -1595,8 +1595,8 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache */ if (*is_shared) { for (int i = 0; i < level; i++) { - cache->entries[i].is_shared = true; - cache->entries[i].gen = entry->gen; + ctx->path_cache_entries[i].is_shared = true; + ctx->path_cache_entries[i].gen = entry->gen; } } @@ -1608,14 +1608,14 @@ static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache * fs_info->commit_root_sem semaphore, so no need to worry about the root's last * snapshot field changing while updating or checking the cache. 
*/ -static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, +static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, struct btrfs_root *root, u64 bytenr, int level, bool is_shared) { struct btrfs_backref_shared_cache_entry *entry; u64 gen; - if (!cache->use_cache) + if (!ctx->use_path_cache) return; if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) @@ -1634,7 +1634,7 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, else gen = btrfs_root_last_snapshot(&root->root_item); - entry = &cache->entries[level]; + entry = &ctx->path_cache_entries[level]; entry->bytenr = bytenr; entry->is_shared = is_shared; entry->gen = gen; @@ -1648,7 +1648,7 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, */ if (is_shared) { for (int i = 0; i < level; i++) { - entry = &cache->entries[i]; + entry = &ctx->path_cache_entries[i]; entry->is_shared = is_shared; entry->gen = gen; } @@ -1664,7 +1664,7 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, * not known. * @roots: List of roots this extent is shared among. * @tmp: Temporary list used for iteration. - * @cache: A backref lookup result cache. + * @ctx: A backref sharedness check context. * * btrfs_is_data_extent_shared uses the backref walking code but will short * circuit as soon as it finds a root or inode that doesn't match the @@ -1680,7 +1680,7 @@ static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache, int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct ulist *roots, struct ulist *tmp, - struct btrfs_backref_shared_cache *cache) + struct btrfs_backref_share_check_ctx *ctx) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -1715,7 +1715,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, /* -1 means we are in the bytenr of the data extent. */ level = -1; ULIST_ITER_INIT(&uiter); - cache->use_cache = true; + ctx->use_path_cache = true; while (1) { bool is_shared; bool cached; @@ -1726,7 +1726,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, /* this is the only condition under which we return 1 */ ret = 1; if (level >= 0) - store_backref_shared_cache(cache, root, bytenr, + store_backref_shared_cache(ctx, root, bytenr, level, true); break; } @@ -1761,17 +1761,17 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, * (which implies multiple paths). */ if (level == -1 && tmp->nnodes > 1) - cache->use_cache = false; + ctx->use_path_cache = false; if (level >= 0) - store_backref_shared_cache(cache, root, bytenr, + store_backref_shared_cache(ctx, root, bytenr, level, false); node = ulist_next(tmp, &uiter); if (!node) break; bytenr = node->val; level++; - cached = lookup_backref_shared_cache(cache, root, bytenr, level, + cached = lookup_backref_shared_cache(ctx, root, bytenr, level, &is_shared); if (cached) { ret = (is_shared ? 1 : 0); diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index c846fa2bdf93..e3a2b45a76e3 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -23,13 +23,13 @@ struct btrfs_backref_shared_cache_entry { bool is_shared; }; -struct btrfs_backref_shared_cache { +struct btrfs_backref_share_check_ctx { /* * A path from a root to a leaf that has a file extent item pointing to * a given data extent should never exceed the maximum b+tree height. 
*/ - struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL]; - bool use_cache; + struct btrfs_backref_shared_cache_entry path_cache_entries[BTRFS_MAX_LEVEL]; + bool use_path_cache; }; typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, @@ -80,7 +80,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct ulist *roots, struct ulist *tmp, - struct btrfs_backref_shared_cache *cache); + struct btrfs_backref_share_check_ctx *ctx); int __init btrfs_prelim_ref_init(void); void __cold btrfs_prelim_ref_exit(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 83be7e85ba91..365ad00a5942 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3705,7 +3705,7 @@ static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path static int fiemap_process_hole(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache, - struct btrfs_backref_shared_cache *backref_cache, + struct btrfs_backref_share_check_ctx *backref_ctx, u64 disk_bytenr, u64 extent_offset, u64 extent_gen, struct ulist *roots, struct ulist *tmp_ulist, @@ -3755,7 +3755,7 @@ static int fiemap_process_hole(struct btrfs_inode *inode, disk_bytenr, extent_gen, roots, tmp_ulist, - backref_cache); + backref_ctx); if (ret < 0) return ret; else if (ret > 0) @@ -3805,7 +3805,7 @@ static int fiemap_process_hole(struct btrfs_inode *inode, disk_bytenr, extent_gen, roots, tmp_ulist, - backref_cache); + backref_ctx); if (ret < 0) return ret; else if (ret > 0) @@ -3904,7 +3904,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, struct extent_state *cached_state = NULL; struct btrfs_path *path; struct fiemap_cache cache = { 0 }; - struct btrfs_backref_shared_cache *backref_cache; + struct btrfs_backref_share_check_ctx *backref_ctx; struct ulist *roots; struct ulist *tmp_ulist; u64 last_extent_end; @@ -3914,11 +3914,11 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, bool stopped = false; int ret; - backref_cache = kzalloc(sizeof(*backref_cache), GFP_KERNEL); + backref_ctx = kzalloc(sizeof(*backref_ctx), GFP_KERNEL); path = btrfs_alloc_path(); roots = ulist_alloc(GFP_KERNEL); tmp_ulist = ulist_alloc(GFP_KERNEL); - if (!backref_cache || !path || !roots || !tmp_ulist) { + if (!backref_ctx || !path || !roots || !tmp_ulist) { ret = -ENOMEM; goto out; } @@ -3978,7 +3978,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, const u64 range_end = min(key.offset, lockend) - 1; ret = fiemap_process_hole(inode, fieinfo, &cache, - backref_cache, 0, 0, 0, + backref_ctx, 0, 0, 0, roots, tmp_ulist, prev_extent_end, range_end); if (ret < 0) { @@ -4019,14 +4019,14 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, extent_len, flags); } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { ret = fiemap_process_hole(inode, fieinfo, &cache, - backref_cache, + backref_ctx, disk_bytenr, extent_offset, extent_gen, roots, tmp_ulist, key.offset, extent_end - 1); } else if (disk_bytenr == 0) { /* We have an explicit hole. 
*/ ret = fiemap_process_hole(inode, fieinfo, &cache, - backref_cache, 0, 0, 0, + backref_ctx, 0, 0, 0, roots, tmp_ulist, key.offset, extent_end - 1); } else { @@ -4037,7 +4037,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, extent_gen, roots, tmp_ulist, - backref_cache); + backref_ctx); if (ret < 0) goto out_unlock; else if (ret > 0) @@ -4086,7 +4086,7 @@ check_eof_delalloc: path = NULL; if (!stopped && prev_extent_end < lockend) { - ret = fiemap_process_hole(inode, fieinfo, &cache, backref_cache, + ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx, 0, 0, 0, roots, tmp_ulist, prev_extent_end, lockend - 1); if (ret < 0) @@ -4119,7 +4119,7 @@ check_eof_delalloc: out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); out: - kfree(backref_cache); + kfree(backref_ctx); btrfs_free_path(path); ulist_free(roots); ulist_free(tmp_ulist); From 84a7949d409753c90dc3477b8cfc18e983b09078 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:03 +0100 Subject: [PATCH 036/249] btrfs: move ulists to data extent sharedness check context When calling btrfs_is_data_extent_shared() we pass two ulists that were allocated by the caller. This is because the single caller, fiemap, calls btrfs_is_data_extent_shared() multiple times and the ulists can be reused, instead of allocating new ones before each call and freeing them after each call. Now that we have a context structure/object that we pass to btrfs_is_data_extent_shared(), we can move those ulists to it, and hide their allocation and the context's allocation in a helper function, as well as the freeing of the ulists and the context object. This allows to reduce the number of parameters passed to btrfs_is_data_extent_shared(), the need to pass the ulists from extent_fiemap() to fiemap_process_hole() and having the caller deal with allocating and releasing the ulists. Also rename one of the ulists from 'tmp' / 'tmp_ulist' to 'refs', since that's a much better name as it reflects what the list is used for (and matching the argument name for find_parent_nodes()). Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 43 ++++++++++++++++++++++++++++++++----------- fs/btrfs/backref.h | 7 ++++++- fs/btrfs/extent_io.c | 34 ++++++++++------------------------ 3 files changed, 48 insertions(+), 36 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 693c99ad4afb..4caff3052da7 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1655,6 +1655,30 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx } } +struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void) +{ + struct btrfs_backref_share_check_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + ulist_init(&ctx->refs); + ulist_init(&ctx->roots); + + return ctx; +} + +void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx) +{ + if (!ctx) + return; + + ulist_release(&ctx->refs); + ulist_release(&ctx->roots); + kfree(ctx); +} + /* * Check if a data extent is shared or not. * @@ -1662,8 +1686,6 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx * @bytenr: Logical bytenr of the extent we are checking. * @extent_gen: Generation of the extent (file extent item) or 0 if it is * not known. - * @roots: List of roots this extent is shared among. - * @tmp: Temporary list used for iteration. * @ctx: A backref sharedness check context. 
* * btrfs_is_data_extent_shared uses the backref walking code but will short @@ -1679,7 +1701,6 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx */ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, - struct ulist *roots, struct ulist *tmp, struct btrfs_backref_share_check_ctx *ctx) { struct btrfs_root *root = inode->root; @@ -1697,8 +1718,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, }; int level; - ulist_init(roots); - ulist_init(tmp); + ulist_init(&ctx->roots); + ulist_init(&ctx->refs); trans = btrfs_join_transaction_nostart(root); if (IS_ERR(trans)) { @@ -1720,8 +1741,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, bool is_shared; bool cached; - ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, - roots, NULL, &shared, false); + ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, &ctx->refs, + &ctx->roots, NULL, &shared, false); if (ret == BACKREF_FOUND_SHARED) { /* this is the only condition under which we return 1 */ ret = 1; @@ -1760,13 +1781,13 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, * deal with), we can not use it if we have multiple leaves * (which implies multiple paths). */ - if (level == -1 && tmp->nnodes > 1) + if (level == -1 && ctx->refs.nnodes > 1) ctx->use_path_cache = false; if (level >= 0) store_backref_shared_cache(ctx, root, bytenr, level, false); - node = ulist_next(tmp, &uiter); + node = ulist_next(&ctx->refs, &uiter); if (!node) break; bytenr = node->val; @@ -1789,8 +1810,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, up_read(&fs_info->commit_root_sem); } out: - ulist_release(roots); - ulist_release(tmp); + ulist_release(&ctx->roots); + ulist_release(&ctx->refs); return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index e3a2b45a76e3..8da0ba6b94a4 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -24,6 +24,9 @@ struct btrfs_backref_shared_cache_entry { }; struct btrfs_backref_share_check_ctx { + /* Ulists used during backref walking. */ + struct ulist refs; + struct ulist roots; /* * A path from a root to a leaf that has a file extent item pointing to * a given data extent should never exceed the maximum b+tree height. 
@@ -35,6 +38,9 @@ struct btrfs_backref_share_check_ctx { typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, void *ctx); +struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void); +void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx); + int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_path *path, struct btrfs_key *found_key, u64 *flags); @@ -79,7 +85,6 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 *found_off); int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, - struct ulist *roots, struct ulist *tmp, struct btrfs_backref_share_check_ctx *ctx); int __init btrfs_prelim_ref_init(void); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 365ad00a5942..e25e54d2216f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3708,7 +3708,6 @@ static int fiemap_process_hole(struct btrfs_inode *inode, struct btrfs_backref_share_check_ctx *backref_ctx, u64 disk_bytenr, u64 extent_offset, u64 extent_gen, - struct ulist *roots, struct ulist *tmp_ulist, u64 start, u64 end) { const u64 i_size = i_size_read(&inode->vfs_inode); @@ -3752,10 +3751,9 @@ static int fiemap_process_hole(struct btrfs_inode *inode, if (prealloc_len > 0) { if (!checked_extent_shared && fieinfo->fi_extents_max) { ret = btrfs_is_data_extent_shared(inode, - disk_bytenr, - extent_gen, roots, - tmp_ulist, - backref_ctx); + disk_bytenr, + extent_gen, + backref_ctx); if (ret < 0) return ret; else if (ret > 0) @@ -3803,8 +3801,7 @@ static int fiemap_process_hole(struct btrfs_inode *inode, if (!checked_extent_shared && fieinfo->fi_extents_max) { ret = btrfs_is_data_extent_shared(inode, disk_bytenr, - extent_gen, roots, - tmp_ulist, + extent_gen, backref_ctx); if (ret < 0) return ret; @@ -3905,8 +3902,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, struct btrfs_path *path; struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; - struct ulist *roots; - struct ulist *tmp_ulist; u64 last_extent_end; u64 prev_extent_end; u64 lockstart; @@ -3914,11 +3909,9 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, bool stopped = false; int ret; - backref_ctx = kzalloc(sizeof(*backref_ctx), GFP_KERNEL); + backref_ctx = btrfs_alloc_backref_share_check_ctx(); path = btrfs_alloc_path(); - roots = ulist_alloc(GFP_KERNEL); - tmp_ulist = ulist_alloc(GFP_KERNEL); - if (!backref_ctx || !path || !roots || !tmp_ulist) { + if (!backref_ctx || !path) { ret = -ENOMEM; goto out; } @@ -3979,7 +3972,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx, 0, 0, 0, - roots, tmp_ulist, prev_extent_end, range_end); if (ret < 0) { goto out_unlock; @@ -4021,13 +4013,12 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx, disk_bytenr, extent_offset, - extent_gen, roots, tmp_ulist, - key.offset, extent_end - 1); + extent_gen, key.offset, + extent_end - 1); } else if (disk_bytenr == 0) { /* We have an explicit hole. */ ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx, 0, 0, 0, - roots, tmp_ulist, key.offset, extent_end - 1); } else { /* We have a regular extent. 
 */
@@ -4035,8 +4026,6 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
 				ret = btrfs_is_data_extent_shared(inode,
 								  disk_bytenr,
 								  extent_gen,
-								  roots,
-								  tmp_ulist,
 								  backref_ctx);
 				if (ret < 0)
 					goto out_unlock;
 				else if (ret > 0)
@@ -4087,8 +4076,7 @@ check_eof_delalloc:
 
 	if (!stopped && prev_extent_end < lockend) {
 		ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx,
-					  0, 0, 0, roots, tmp_ulist,
-					  prev_extent_end, lockend - 1);
+					  0, 0, 0, prev_extent_end, lockend - 1);
 		if (ret < 0)
 			goto out_unlock;
 		prev_extent_end = lockend;
@@ -4119,10 +4107,8 @@ check_eof_delalloc:
 out_unlock:
 	unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
 out:
-	kfree(backref_ctx);
+	btrfs_free_backref_share_ctx(backref_ctx);
 	btrfs_free_path(path);
-	ulist_free(roots);
-	ulist_free(tmp_ulist);
 	return ret;
 }

From b629685803bc0cefbbd29240ea28d57d8a17bcbc Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 11 Oct 2022 13:17:04 +0100
Subject: [PATCH 037/249] btrfs: remove roots ulist when checking data extent sharedness

Currently btrfs_is_data_extent_shared() is passing a ulist for the roots
argument of find_parent_nodes(), however it does not use that ulist for
anything and, for this context, that list always ends up with at most one
element.

Since find_parent_nodes() is able to deal with a NULL ulist for its roots
argument, make btrfs_is_data_extent_shared() pass it NULL and avoid the
burden of allocating memory for the unused roots ulist, initializing it,
releasing it and allocating one struct ulist_node for it during the call
to find_parent_nodes().

Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/backref.c | 6 +-----
 fs/btrfs/backref.h | 1 -
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 4caff3052da7..4d8573fa75b4 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1664,7 +1664,6 @@ struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void)
 		return NULL;
 
 	ulist_init(&ctx->refs);
-	ulist_init(&ctx->roots);
 
 	return ctx;
 }
@@ -1675,7 +1674,6 @@ void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx)
 		return;
 
 	ulist_release(&ctx->refs);
-	ulist_release(&ctx->roots);
 	kfree(ctx);
 }
 
@@ -1718,7 +1716,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
 	};
 	int level;
 
-	ulist_init(&ctx->roots);
 	ulist_init(&ctx->refs);
 
 	trans = btrfs_join_transaction_nostart(root);
@@ -1742,7 +1739,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
 		bool cached;
 
 		ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, &ctx->refs,
-					&ctx->roots, NULL, &shared, false);
+					NULL, NULL, &shared, false);
 		if (ret == BACKREF_FOUND_SHARED) {
 			/* this is the only condition under which we return 1 */
 			ret = 1;
@@ -1810,7 +1807,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
 		up_read(&fs_info->commit_root_sem);
 	}
 out:
-	ulist_release(&ctx->roots);
 	ulist_release(&ctx->refs);
 	return ret;
 }
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 8da0ba6b94a4..5f468f0defda 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -26,7 +26,6 @@ struct btrfs_backref_shared_cache_entry {
 struct btrfs_backref_share_check_ctx {
 	/* Ulists used during backref walking. */
 	struct ulist refs;
-	struct ulist roots;
 	/*
 	 * A path from a root to a leaf that has a file extent item pointing to
 	 * a given data extent should never exceed the maximum b+tree height.
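The simplification above relies on find_parent_nodes() treating its roots
list as an optional output. A minimal userspace sketch of that pattern,
with toy types that only stand in for the kernel's ulist, could look like
this:

  #include <stdio.h>

  /* Toy stand-in for struct ulist (illustrative only). */
  struct toy_ulist {
          unsigned long nnodes;
          unsigned long vals[64];
  };

  static int toy_ulist_add(struct toy_ulist *ul, unsigned long val)
  {
          if (ul->nnodes >= 64)
                  return -1;
          ul->vals[ul->nnodes++] = val;
          return 0;
  }

  /*
   * Models the pattern the patch depends on: the walker treats its
   * 'roots' output list as optional and skips it when NULL, so a caller
   * that never reads the list avoids building it altogether.
   */
  static int walk_refs(struct toy_ulist *refs, struct toy_ulist *roots)
  {
          unsigned long root_id = 5; /* pretend we resolved one root */

          if (toy_ulist_add(refs, 123456) < 0)
                  return -1;
          if (roots && toy_ulist_add(roots, root_id) < 0)
                  return -1;
          return 0;
  }

  int main(void)
  {
          struct toy_ulist refs = { 0 };

          /* Sharedness checks only need 'refs', so pass NULL for 'roots'. */
          if (walk_refs(&refs, NULL) == 0)
                  printf("collected %lu refs, no roots list built\n",
                         refs.nnodes);
          return 0;
  }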
From 56f5c19920d09bbf91efcf80e6ba301923400f4c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:05 +0100 Subject: [PATCH 038/249] btrfs: remove useless logic when finding parent nodes At find_parent_nodes(), at its last step, when iterating over all direct references, we are checking if we have a share context and if we have a reference with a different root from the one in the share context. However that logic is pointless because of two reasons: 1) After the previous patch in the series (subject "btrfs: remove roots ulist when checking data extent sharedness"), the roots argument is always NULL when using a share check context (struct share_check), so this code is never triggered; 2) Even before that previous patch, we could not hit this code because if we had a reference with a root different from the one in our share context, then we would have exited earlier when doing either of the following: - Adding a second direct ref to the direct refs red black tree resulted in extent_is_shared() returning true when called from add_direct_ref() -> add_prelim_ref(), after processing delayed references or while processing references in the extent tree; - When adding a second reference to the indirect refs red black tree (same as above, extent_is_shared() returns true); - If we only have one indirect reference and no direct references, then when resolving it at resolve_indirect_refs() we immediately return that the target extent is shared, therefore never reaching that loop that iterates over all direct references at find_parent_nodes(); - If we have 1 indirect reference and 1 direct reference, then we also exit early because extent_is_shared() ends up returning true when called through add_prelim_ref() (by add_direct_ref() or add_indirect_ref()) or add_delayed_refs(). Same applies as when having a combination of direct, indirect and indirect with missing key references. This logic had been obsoleted since commit 3ec4d3238ab165 ("btrfs: allow backref search checks for shared extents"), which introduced the early exits in case an extent is shared. So just remove that logic, and assert at find_parent_nodes() that when we have a share context we don't have a roots ulist and that we haven't found the extent to be directly shared after processing delayed references and all references from the extent tree. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 4d8573fa75b4..22ab13821ada 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1219,6 +1219,10 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, .indirect_missing_keys = PREFTREE_INIT }; + /* Roots ulist is not needed when using a sharedness check context. */ + if (sc) + ASSERT(roots == NULL); + key.objectid = bytenr; key.offset = (u64)-1; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) @@ -1310,6 +1314,20 @@ again: } } + /* + * If we have a share context and we reached here, it means the extent + * is not directly shared (no multiple reference items for it), + * otherwise we would have exited earlier with a return value of + * BACKREF_FOUND_SHARED after processing delayed references or while + * processing inline or keyed references from the extent tree. 
+	 * The extent may however be indirectly shared through shared subtrees
+	 * as a result from creating snapshots, so we determine below what is
+	 * its parent node, in case we are dealing with a metadata extent, or
+	 * what's the leaf (or leaves), from a fs tree, that has a file extent
+	 * item pointing to it in case we are dealing with a data extent.
+	 */
+	ASSERT(extent_is_shared(sc) == 0);
+
 	btrfs_release_path(path);
 
 	ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0);
@@ -1347,11 +1365,6 @@ again:
 	 * and would retain their original ref->count < 0.
 	 */
 	if (roots && ref->count && ref->root_id && ref->parent == 0) {
-		if (sc && ref->root_id != sc->root_objectid) {
-			ret = BACKREF_FOUND_SHARED;
-			goto out;
-		}
-
 		/* no parent == root of tree */
 		ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
 		if (ret < 0)

From 73e339e6ab74cc3edcd1f1ed3d9b822baf8534e1 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 11 Oct 2022 13:17:06 +0100
Subject: [PATCH 039/249] btrfs: cache sharedness of the last few data extents during fiemap

During fiemap we process all the file extent items of an inode, by their
file offset order (left to right b+tree order), and then check if the data
extent they point at is shared or not. Until now we didn't cache those
results, we only did it for b+tree nodes/leaves, since for each unique
b+tree path we have access to hundreds of file extent items.

However, it is also common to repeat checking the sharedness of a
particular data extent in a very short time window, and the cases that
lead to that are the following:

1) COW writes.

   If we have a file extent item like this:

   [ bytenr X, offset = 0, num_bytes = 512K ]
   file offset 0                          512K

   Then, after a 4K write into file offset 64K, we end up with the
   following file extent item layout:

   [ bytenr X, offset = 0, num_bytes = 64K ]
   file offset 0                          64K

   [ bytenr Y, offset = 0, num_bytes = 4K ]
   file offset 64K                        68K

   [ bytenr X, offset = 68K, num_bytes = 444K ]
   file offset 68K                        512K

   So during fiemap we will check for the sharedness of the data extent
   with bytenr X twice. Typically for COW writes and for at least
   moderately updated files, we end up with many file extent items that
   point to different sections of the same data extent.

2) Writing into a NOCOW file after a snapshot is taken.

   This happens if the target extent was created in a generation older
   than the generation where the last snapshot for the root (the tree
   the inode belongs to) was made.

   This leads to a scenario like the previous one.

3) Writing into sections of a preallocated extent.

   For example if a file has the following layout:

   [ bytenr X, offset = 0, num_bytes = 1M, type = prealloc ]
   0                                      1M

   After doing a 4K write into file offset 0 and another 4K write into
   offset 512K, we get the following layout:

   [ bytenr X, offset = 0, num_bytes = 4K, type = regular ]
   0                                      4K

   [ bytenr X, offset = 4K, num_bytes = 508K, type = prealloc ]
   4K                                     512K

   [ bytenr X, offset = 512K, num_bytes = 4K, type = regular ]
   512K                                   516K

   [ bytenr X, offset = 516K, num_bytes = 508K, type = prealloc ]
   516K                                   1M

   So we end up with 4 consecutive file extent items pointing to the data
   extent at bytenr X.

4) Hole punching in the middle of an extent.
   For example if a file has the following file extent item:

   [ bytenr X, offset = 0, num_bytes = 8M ]
   0                                      8M

   And then a hole is punched for the file range [4M, 6M[, our file
   extent item is split into two:

   [ bytenr X, offset = 0, num_bytes = 4M ]
   0                                      4M

   [ 2M hole, implicit or explicit depending on NO_HOLES feature ]
   4M                                     6M

   [ bytenr X, offset = 6M, num_bytes = 2M ]
   6M                                     8M

   Again, we end up with two file extent items pointing to the same data
   extent.

5) When reflinking (clone and deduplication) within the same file.

   This is probably the least common case of all.

In cases 1, 2, 3 and 4, when we have multiple file extent items that
point to the same data extent, their distance is usually short, typically
separated by a few slots in a b+tree leaf (or across sibling leaves). For
case 5, the distance can vary a lot, but it's typically the least common
case.

This change caches the result of the sharedness checks for data extents,
but only for the last 8 extents that we notice our inode refers to with
multiple file extent items. Whenever we want to check if a data extent is
shared, we look up the cache, which consists of doing a linear scan of an
8 element array, and if we find the data extent there, we return the
result and don't check the extent tree and delayed refs. The array/cache
is small so that doing the search has no noticeable negative impact on
the performance in case we don't have file extent items within a distance
of 8 slots that point to the same data extent.

Slots in the cache/array are overwritten in a simple round robin fashion,
as that approach fits very well. Using this simple approach with only the
last 8 data extents seen is effective, as usually when multiple file
extent items point to the same data extent, their distance is within 8
slots. It also uses very little memory and the time to cache a result or
look it up is negligible.

The following test was run on a non-debug kernel (Debian's default kernel
config) to measure the impact in the case of COW writes (first example
given above), where we run fiemap after overwriting 33% of the blocks of
a file:

  $ cat test.sh
  #!/bin/bash

  DEV=/dev/sdi
  MNT=/mnt/sdi

  umount $DEV &> /dev/null
  mkfs.btrfs -f $DEV
  mount $DEV $MNT

  FILE_SIZE=$((1 * 1024 * 1024 * 1024))

  # Create the file full of 1M extents.
  xfs_io -f -s -c "pwrite -b 1M -S 0xab 0 $FILE_SIZE" $MNT/foobar

  block_count=$((FILE_SIZE / 4096))
  # Overwrite about 33% of the file blocks.
  overwrite_count=$((block_count / 3))

  echo -e "\nOverwriting $overwrite_count 4K blocks (out of $block_count)..."
  RANDOM=123
  for ((i = 1; i <= $overwrite_count; i++)); do
          off=$(((RANDOM % block_count) * 4096))
          xfs_io -c "pwrite -S 0xcd $off 4K" $MNT/foobar > /dev/null
          echo -ne "\r$i blocks overwritten..."
  done
  echo -e "\n"

  # Unmount and mount to clear all cached metadata.
  umount $MNT
  mount $DEV $MNT

  start=$(date +%s%N)
  filefrag $MNT/foobar
  end=$(date +%s%N)
  dur=$(( (end - start) / 1000000 ))
  echo "fiemap took $dur milliseconds"

  umount $MNT

Result before applying this patch:

  fiemap took 128 milliseconds

Result after applying this patch:

  fiemap took 92 milliseconds (-28.1%)

The test is somewhat limited in the sense the gains may be higher in
practice, because in the test the filesystem is small, so we have small
fs and extent trees, plus there's no concurrent access to the trees as
well, therefore no lock contention there.
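For reference, the essence of the cache described above fits in a few
lines of self-contained userspace C. The field and constant names mirror
the patch, but this is only an illustration of the round robin overwrite
policy and the linear lookup, not the kernel code:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define PREV_EXTENTS_SIZE 8

  struct prev_extent_entry {
          uint64_t bytenr;
          bool is_shared;
  };

  struct share_ctx {
          struct prev_extent_entry cache[PREV_EXTENTS_SIZE];
          int slot; /* next slot to overwrite, round robin */
  };

  /* Linear scan of the 8 entries; cheap because the array is tiny. */
  static bool cache_lookup(const struct share_ctx *ctx, uint64_t bytenr,
                           bool *is_shared)
  {
          for (int i = 0; i < PREV_EXTENTS_SIZE; i++) {
                  if (ctx->cache[i].bytenr == bytenr) {
                          *is_shared = ctx->cache[i].is_shared;
                          return true;
                  }
          }
          return false;
  }

  /* Overwrite the oldest slot and advance the round robin cursor. */
  static void cache_store(struct share_ctx *ctx, uint64_t bytenr,
                          bool is_shared)
  {
          ctx->cache[ctx->slot].bytenr = bytenr;
          ctx->cache[ctx->slot].is_shared = is_shared;
          ctx->slot = (ctx->slot + 1) % PREV_EXTENTS_SIZE;
  }

  int main(void)
  {
          struct share_ctx ctx = { 0 };
          bool shared;

          cache_store(&ctx, 4096, true);
          if (cache_lookup(&ctx, 4096, &shared))
                  printf("bytenr 4096 cached, shared=%d\n", shared);
          return 0;
  }

Note that a zero-initialized cache cannot produce a false hit in this
scheme, since sharedness checks are only done for data extents with a
non-zero bytenr (bytenr 0 denotes a hole).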
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 50 +++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/backref.h | 27 +++++++++++++++++++++++++ 2 files changed, 74 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 22ab13821ada..64bea9b30f6b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -137,7 +137,25 @@ struct preftrees { struct share_check { u64 root_objectid; u64 inum; + u64 data_bytenr; + /* + * Counts number of inodes that refer to an extent (different inodes in + * the same root or different roots) that we could find. The sharedness + * check typically stops once this counter gets greater than 1, so it + * may not reflect the total number of inodes. + */ int share_count; + /* + * The number of times we found our inode refers to the data extent we + * are determining the sharedness. In other words, how many file extent + * items we could find for our inode that point to our target data + * extent. The value we get here after finishing the extent sharedness + * check may be smaller than reality, but if it ends up being greater + * than 1, then we know for sure the inode has multiple file extent + * items that point to our inode, and we can safely assume it's useful + * to cache the sharedness check result. + */ + int self_ref_count; bool have_delayed_delete_refs; }; @@ -207,7 +225,7 @@ static int prelim_ref_compare(struct prelim_ref *ref1, } static void update_share_count(struct share_check *sc, int oldcount, - int newcount) + int newcount, struct prelim_ref *newref) { if ((!sc) || (oldcount == 0 && newcount < 1)) return; @@ -216,6 +234,11 @@ static void update_share_count(struct share_check *sc, int oldcount, sc->share_count--; else if (oldcount < 1 && newcount > 0) sc->share_count++; + + if (newref->root_id == sc->root_objectid && + newref->wanted_disk_byte == sc->data_bytenr && + newref->key_for_search.objectid == sc->inum) + sc->self_ref_count += newref->count; } /* @@ -266,14 +289,14 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info, * BTRFS_[ADD|DROP]_DELAYED_REF actions. */ update_share_count(sc, ref->count, - ref->count + newref->count); + ref->count + newref->count, newref); ref->count += newref->count; free_pref(newref); return; } } - update_share_count(sc, 0, newref->count); + update_share_count(sc, 0, newref->count, newref); preftree->count++; trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count); rb_link_node(&newref->rbnode, parent, p); @@ -1724,11 +1747,18 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, struct share_check shared = { .root_objectid = root->root_key.objectid, .inum = btrfs_ino(inode), + .data_bytenr = bytenr, .share_count = 0, + .self_ref_count = 0, .have_delayed_delete_refs = false, }; int level; + for (int i = 0; i < BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE; i++) { + if (ctx->prev_extents_cache[i].bytenr == bytenr) + return ctx->prev_extents_cache[i].is_shared; + } + ulist_init(&ctx->refs); trans = btrfs_join_transaction_nostart(root); @@ -1813,6 +1843,20 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, cond_resched(); } + /* + * Cache the sharedness result for the data extent if we know our inode + * has more than 1 file extent item that refers to the data extent. 
+	 */
+	if (ret >= 0 && shared.self_ref_count > 1) {
+		int slot = ctx->prev_extents_cache_slot;
+
+		ctx->prev_extents_cache[slot].bytenr = shared.data_bytenr;
+		ctx->prev_extents_cache[slot].is_shared = (ret == 1);
+
+		slot = (slot + 1) % BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE;
+		ctx->prev_extents_cache_slot = slot;
+	}
+
 	if (trans) {
 		btrfs_put_tree_mod_seq(fs_info, &elem);
 		btrfs_end_transaction(trans);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 5f468f0defda..fda78db50be6 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -23,6 +23,8 @@ struct btrfs_backref_shared_cache_entry {
 	bool is_shared;
 };
 
+#define BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE 8
+
 struct btrfs_backref_share_check_ctx {
 	/* Ulists used during backref walking. */
 	struct ulist refs;
@@ -32,6 +34,31 @@ struct btrfs_backref_share_check_ctx {
 	 */
 	struct btrfs_backref_shared_cache_entry path_cache_entries[BTRFS_MAX_LEVEL];
 	bool use_path_cache;
+	/*
+	 * Cache the sharedness result for the last few extents we have found,
+	 * but only for extents for which we have multiple file extent items
+	 * that point to them.
+	 * It's very common to have several file extent items that point to the
+	 * same extent (bytenr) but with different offsets and lengths. This
+	 * typically happens for COW writes, partial writes into prealloc
+	 * extents, NOCOW writes after snapshoting a root, hole punching or
+	 * reflinking within the same file (less common perhaps).
+	 * So keep a small cache with the lookup results for the extent pointed
+	 * by the last few file extent items. This cache is checked, with a
+	 * linear scan, whenever btrfs_is_data_extent_shared() is called, so
+	 * it must be small so that it does not negatively affect performance in
+	 * case we don't have multiple file extent items that point to the same
+	 * data extent.
+	 */
+	struct {
+		u64 bytenr;
+		bool is_shared;
+	} prev_extents_cache[BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE];
+	/*
+	 * The slot in the prev_extents_cache array that will be used for
+	 * storing the sharedness result of a new data extent.
+	 */
+	int prev_extents_cache_slot;
 };
 
 typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,

From 583f4ac56254c844cbbc9f23744e1f2efbf42fc7 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 11 Oct 2022 13:17:07 +0100
Subject: [PATCH 040/249] btrfs: move up backref sharedness cache store and lookup functions

Move the static functions that look up and store the sharedness check of
an extent buffer to a location above find_parent_nodes(), because in the
next patch the lookup function will be used by find_parent_nodes(). The
store function is also moved just because it's the counterpart to the
lookup function and it's best to have their definitions close together.

Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/backref.c | 236 ++++++++++++++++++++++-----------------------
 1 file changed, 118 insertions(+), 118 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 64bea9b30f6b..1b1575b3a7b0 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1198,6 +1198,124 @@ static int add_keyed_refs(struct btrfs_root *extent_root,
 	return ret;
 }
 
+/*
+ * The caller has joined a transaction or is holding a read lock on the
+ * fs_info->commit_root_sem semaphore, so no need to worry about the root's last
+ * snapshot field changing while updating or checking the cache.
+ */ +static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, + struct btrfs_root *root, + u64 bytenr, int level, bool *is_shared) +{ + struct btrfs_backref_shared_cache_entry *entry; + + if (!ctx->use_path_cache) + return false; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return false; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + entry = &ctx->path_cache_entries[level]; + + /* Unused cache entry or being used for some other extent buffer. */ + if (entry->bytenr != bytenr) + return false; + + /* + * We cached a false result, but the last snapshot generation of the + * root changed, so we now have a snapshot. Don't trust the result. + */ + if (!entry->is_shared && + entry->gen != btrfs_root_last_snapshot(&root->root_item)) + return false; + + /* + * If we cached a true result and the last generation used for dropping + * a root changed, we can not trust the result, because the dropped root + * could be a snapshot sharing this extent buffer. + */ + if (entry->is_shared && + entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) + return false; + + *is_shared = entry->is_shared; + /* + * If the node at this level is shared, than all nodes below are also + * shared. Currently some of the nodes below may be marked as not shared + * because we have just switched from one leaf to another, and switched + * also other nodes above the leaf and below the current level, so mark + * them as shared. + */ + if (*is_shared) { + for (int i = 0; i < level; i++) { + ctx->path_cache_entries[i].is_shared = true; + ctx->path_cache_entries[i].gen = entry->gen; + } + } + + return true; +} + +/* + * The caller has joined a transaction or is holding a read lock on the + * fs_info->commit_root_sem semaphore, so no need to worry about the root's last + * snapshot field changing while updating or checking the cache. + */ +static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, + struct btrfs_root *root, + u64 bytenr, int level, bool is_shared) +{ + struct btrfs_backref_shared_cache_entry *entry; + u64 gen; + + if (!ctx->use_path_cache) + return; + + if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) + return; + + /* + * Level -1 is used for the data extent, which is not reliable to cache + * because its reference count can increase or decrease without us + * realizing. We cache results only for extent buffers that lead from + * the root node down to the leaf with the file extent item. + */ + ASSERT(level >= 0); + + if (is_shared) + gen = btrfs_get_last_root_drop_gen(root->fs_info); + else + gen = btrfs_root_last_snapshot(&root->root_item); + + entry = &ctx->path_cache_entries[level]; + entry->bytenr = bytenr; + entry->is_shared = is_shared; + entry->gen = gen; + + /* + * If we found an extent buffer is shared, set the cache result for all + * extent buffers below it to true. As nodes in the path are COWed, + * their sharedness is moved to their children, and if a leaf is COWed, + * then the sharedness of a data extent becomes direct, the refcount of + * data extent is increased in the extent item at the extent tree. 
+ */ + if (is_shared) { + for (int i = 0; i < level; i++) { + entry = &ctx->path_cache_entries[i]; + entry->is_shared = is_shared; + entry->gen = gen; + } + } +} + /* * this adds all existing backrefs (inline backrefs, backrefs and delayed * refs) for the given bytenr to the refs list, merges duplicates and resolves @@ -1573,124 +1691,6 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return ret; } -/* - * The caller has joined a transaction or is holding a read lock on the - * fs_info->commit_root_sem semaphore, so no need to worry about the root's last - * snapshot field changing while updating or checking the cache. - */ -static bool lookup_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, - struct btrfs_root *root, - u64 bytenr, int level, bool *is_shared) -{ - struct btrfs_backref_shared_cache_entry *entry; - - if (!ctx->use_path_cache) - return false; - - if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) - return false; - - /* - * Level -1 is used for the data extent, which is not reliable to cache - * because its reference count can increase or decrease without us - * realizing. We cache results only for extent buffers that lead from - * the root node down to the leaf with the file extent item. - */ - ASSERT(level >= 0); - - entry = &ctx->path_cache_entries[level]; - - /* Unused cache entry or being used for some other extent buffer. */ - if (entry->bytenr != bytenr) - return false; - - /* - * We cached a false result, but the last snapshot generation of the - * root changed, so we now have a snapshot. Don't trust the result. - */ - if (!entry->is_shared && - entry->gen != btrfs_root_last_snapshot(&root->root_item)) - return false; - - /* - * If we cached a true result and the last generation used for dropping - * a root changed, we can not trust the result, because the dropped root - * could be a snapshot sharing this extent buffer. - */ - if (entry->is_shared && - entry->gen != btrfs_get_last_root_drop_gen(root->fs_info)) - return false; - - *is_shared = entry->is_shared; - /* - * If the node at this level is shared, than all nodes below are also - * shared. Currently some of the nodes below may be marked as not shared - * because we have just switched from one leaf to another, and switched - * also other nodes above the leaf and below the current level, so mark - * them as shared. - */ - if (*is_shared) { - for (int i = 0; i < level; i++) { - ctx->path_cache_entries[i].is_shared = true; - ctx->path_cache_entries[i].gen = entry->gen; - } - } - - return true; -} - -/* - * The caller has joined a transaction or is holding a read lock on the - * fs_info->commit_root_sem semaphore, so no need to worry about the root's last - * snapshot field changing while updating or checking the cache. - */ -static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx, - struct btrfs_root *root, - u64 bytenr, int level, bool is_shared) -{ - struct btrfs_backref_shared_cache_entry *entry; - u64 gen; - - if (!ctx->use_path_cache) - return; - - if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL)) - return; - - /* - * Level -1 is used for the data extent, which is not reliable to cache - * because its reference count can increase or decrease without us - * realizing. We cache results only for extent buffers that lead from - * the root node down to the leaf with the file extent item. 
- */ - ASSERT(level >= 0); - - if (is_shared) - gen = btrfs_get_last_root_drop_gen(root->fs_info); - else - gen = btrfs_root_last_snapshot(&root->root_item); - - entry = &ctx->path_cache_entries[level]; - entry->bytenr = bytenr; - entry->is_shared = is_shared; - entry->gen = gen; - - /* - * If we found an extent buffer is shared, set the cache result for all - * extent buffers below it to true. As nodes in the path are COWed, - * their sharedness is moved to their children, and if a leaf is COWed, - * then the sharedness of a data extent becomes direct, the refcount of - * data extent is increased in the extent item at the extent tree. - */ - if (is_shared) { - for (int i = 0; i < level; i++) { - entry = &ctx->path_cache_entries[i]; - entry->is_shared = is_shared; - entry->gen = gen; - } - } -} - struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void) { struct btrfs_backref_share_check_ctx *ctx; From 877c14767f106a5c8af271ebf1294c514eb76d0c Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:08 +0100 Subject: [PATCH 041/249] btrfs: avoid duplicated resolution of indirect backrefs during fiemap During fiemap, when determining if a data extent is shared or not, if we don't find the extent is directly shared, then we need to determine if it's shared through subtrees. For that we need to resolve the indirect reference we found in order to figure out the path in the inode's fs tree, which is a path starting at the fs tree's root node and going down to the leaf that contains the file extent item that points to the data extent. We then proceed to determine if any extent buffer in that path is shared with other trees or not. Currently whenever we find the data extent that a file extent item points to is not directly shared, we always resolve the path in the fs tree, and then check if any extent buffer in the path is shared. This is a lot of work and when we have file extent items that belong to the same leaf, we have the same path, so we only need to calculate it once. This change does that, it keeps track of the current and previous leaf, and when we find that a data extent is not directly shared, we try to compute the fs tree path only once and then use it for every other file extent item in the same leaf, using the existing cached path result for the leaf as long as the cache results are valid. This saves us from doing expensive b+tree searches in the fs tree of our target inode, as well as other minor work. The following test was run on a non-debug kernel (Debian's default kernel config): $ cat test-with-snapshots.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi umount $DEV &> /dev/null mkfs.btrfs -f $DEV # Use compression to quickly create files with a lot of extents # (each with a size of 128K). mount -o compress=lzo $DEV $MNT # 40G gives 327680 extents, each with a size of 128K. xfs_io -f -c "pwrite -S 0xab -b 1M 0 40G" $MNT/foobar # Add some more files to increase the size of the fs and extent # trees (in the real world there's a lot of files and extents # from other files). xfs_io -f -c "pwrite -S 0xcd -b 1M 0 20G" $MNT/file1 xfs_io -f -c "pwrite -S 0xef -b 1M 0 20G" $MNT/file2 xfs_io -f -c "pwrite -S 0x73 -b 1M 0 20G" $MNT/file3 # Create a snapshot so all the extents become indirectly shared # through subtrees, with a generation less than or equals to the # generation used to create the snapshot. 
btrfs subvolume snapshot -r $MNT $MNT/snap1 umount $MNT mount -o compress=lzo $DEV $MNT start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata not cached)" echo start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata cached)" umount $MNT Result before applying this patch: (...) /mnt/sdi/foobar: 327680 extents found fiemap took 1204 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 729 milliseconds (metadata cached) Result after applying this patch: (...) /mnt/sdi/foobar: 327680 extents found fiemap took 732 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 421 milliseconds (metadata cached) That's a -46.1% total reduction for the metadata not cached case, and a -42.2% reduction for the cached metadata case. The test is somewhat limited in the sense the gains may be higher in practice, because in the test the filesystem is small, so we have small fs and extent trees, plus there's no concurrent access to the trees as well, therefore no lock contention there. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 64 +++++++++++++++++++++++++++++++++++++------- fs/btrfs/backref.h | 13 +++++++++ fs/btrfs/extent_io.c | 2 ++ 3 files changed, 69 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 1b1575b3a7b0..94a3c6deafbb 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -16,8 +16,9 @@ #include "misc.h" #include "tree-mod-log.h" -/* Just an arbitrary number so we can be sure this happened */ -#define BACKREF_FOUND_SHARED 6 +/* Just arbitrary numbers so we can be sure one of these happened. */ +#define BACKREF_FOUND_SHARED 6 +#define BACKREF_FOUND_NOT_SHARED 7 struct extent_inode_elem { u64 inum; @@ -135,7 +136,8 @@ struct preftrees { * - decremented when a ref->count transitions to <1 */ struct share_check { - u64 root_objectid; + struct btrfs_backref_share_check_ctx *ctx; + struct btrfs_root *root; u64 inum; u64 data_bytenr; /* @@ -235,7 +237,7 @@ static void update_share_count(struct share_check *sc, int oldcount, else if (oldcount < 1 && newcount > 0) sc->share_count++; - if (newref->root_id == sc->root_objectid && + if (newref->root_id == sc->root->root_key.objectid && newref->wanted_disk_byte == sc->data_bytenr && newref->key_for_search.objectid == sc->inum) sc->self_ref_count += newref->count; @@ -742,7 +744,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; } - if (sc && ref->root_id != sc->root_objectid) { + if (sc && ref->root_id != sc->root->root_key.objectid) { free_pref(ref); ret = BACKREF_FOUND_SHARED; goto out; @@ -1469,6 +1471,44 @@ again: */ ASSERT(extent_is_shared(sc) == 0); + /* + * If we are here for a data extent and we have a share_check structure + * it means the data extent is not directly shared (does not have + * multiple reference items), so we have to check if a path in the fs + * tree (going from the root node down to the leaf that has the file + * extent item pointing to the data extent) is shared, that is, if any + * of the extent buffers in the path is referenced by other trees. 
+ */ + if (sc && bytenr == sc->data_bytenr) { + /* + * If we are only determining if a data extent is shared or not + * and the corresponding file extent item is located in the same + * leaf as the previous file extent item, we can skip resolving + * indirect references for a data extent, since the fs tree path + * is the same (same leaf, so same path). We skip as long as the + * cached result for the leaf is valid and only if there's only + * one file extent item pointing to the data extent, because in + * the case of multiple file extent items, they may be located + * in different leaves and therefore we have multiple paths. + */ + if (sc->ctx->curr_leaf_bytenr == sc->ctx->prev_leaf_bytenr && + sc->self_ref_count == 1) { + bool cached; + bool is_shared; + + cached = lookup_backref_shared_cache(sc->ctx, sc->root, + sc->ctx->curr_leaf_bytenr, + 0, &is_shared); + if (cached) { + if (is_shared) + ret = BACKREF_FOUND_SHARED; + else + ret = BACKREF_FOUND_NOT_SHARED; + goto out; + } + } + } + btrfs_release_path(path); ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0); @@ -1745,7 +1785,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem); int ret = 0; struct share_check shared = { - .root_objectid = root->root_key.objectid, + .ctx = ctx, + .root = root, .inum = btrfs_ino(inode), .data_bytenr = bytenr, .share_count = 0, @@ -1783,12 +1824,13 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, &ctx->refs, NULL, NULL, &shared, false); - if (ret == BACKREF_FOUND_SHARED) { - /* this is the only condition under which we return 1 */ - ret = 1; + if (ret == BACKREF_FOUND_SHARED || + ret == BACKREF_FOUND_NOT_SHARED) { + /* If shared must return 1, otherwise return 0. */ + ret = (ret == BACKREF_FOUND_SHARED) ? 1 : 0; if (level >= 0) store_backref_shared_cache(ctx, root, bytenr, - level, true); + level, ret == 1); break; } if (ret < 0 && ret != -ENOENT) @@ -1865,6 +1907,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, } out: ulist_release(&ctx->refs); + ctx->prev_leaf_bytenr = ctx->curr_leaf_bytenr; + return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index fda78db50be6..6dac462430b0 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -28,6 +28,19 @@ struct btrfs_backref_shared_cache_entry { struct btrfs_backref_share_check_ctx { /* Ulists used during backref walking. */ struct ulist refs; + /* + * The current leaf the caller of btrfs_is_data_extent_shared() is at. + * Typically the caller (at the moment only fiemap) tries to determine + * the sharedness of data extents point by file extent items from entire + * leaves. + */ + u64 curr_leaf_bytenr; + /* + * The previous leaf the caller was at in the previous call to + * btrfs_is_data_extent_shared(). This may be the same as the current + * leaf. On the first call it must be 0. + */ + u64 prev_leaf_bytenr; /* * A path from a root to a leaf that has a file extent item pointing to * a given data extent should never exceed the maximum b+tree height. 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e25e54d2216f..4e4f28387ace 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3966,6 +3966,8 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, if (extent_end <= lockstart) goto next_item; + backref_ctx->curr_leaf_bytenr = leaf->start; + /* We have in implicit hole (NO_HOLES feature enabled). */ if (prev_extent_end < key.offset) { const u64 range_end = min(key.offset, lockend) - 1; From 6976201f188f0a899fe5f0dfde32b9855d1d22cc Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Oct 2022 13:17:09 +0100 Subject: [PATCH 042/249] btrfs: avoid unnecessary resolution of indirect backrefs during fiemap During fiemap, when determining if a data extent is shared or not, if we don't find the extent is directly shared, then we need to determine if it's shared through subtrees. For that we need to resolve the indirect reference we found in order to figure out the path in the inode's fs tree, which is a path starting at the fs tree's root node and going down to the leaf that contains the file extent item that points to the data extent. We then proceed to determine if any extent buffer in that path is shared with other trees or not. However when the generation of the data extent is more recent than the last generation used to snapshot the root, we don't need to determine the path, since the data extent can not be shared through snapshots. For this case we currently still determine the leaf of that path (at find_parent_nodes(), but then stop determining the other nodes in the path (at btrfs_is_data_extent_shared()) as it's pointless. So do the check of the data extent's generation earlier, at find_parent_nodes(), before trying to resolve the indirect reference to determine the leaf in the path. This saves us from doing one expensive b+tree search in the fs tree of our target inode, as well as other minor work. The following test was run on a non-debug kernel (Debian's default kernel config): $ cat test-fiemap.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi umount $DEV &> /dev/null mkfs.btrfs -f $DEV # Use compression to quickly create files with a lot of extents # (each with a size of 128K). mount -o compress=lzo $DEV $MNT # 40G gives 327680 extents, each with a size of 128K. xfs_io -f -c "pwrite -S 0xab -b 1M 0 40G" $MNT/foobar # Add some more files to increase the size of the fs and extent # trees (in the real world there's a lot of files and extents # from other files). xfs_io -f -c "pwrite -S 0xcd -b 1M 0 20G" $MNT/file1 xfs_io -f -c "pwrite -S 0xef -b 1M 0 20G" $MNT/file2 xfs_io -f -c "pwrite -S 0x73 -b 1M 0 20G" $MNT/file3 umount $MNT mount -o compress=lzo $DEV $MNT start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata not cached)" echo start=$(date +%s%N) filefrag $MNT/foobar end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "fiemap took $dur milliseconds (metadata cached)" umount $MNT Before applying this patch: (...) /mnt/sdi/foobar: 327680 extents found fiemap took 1285 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 742 milliseconds (metadata cached) After applying this patch: (...) 
/mnt/sdi/foobar: 327680 extents found fiemap took 689 milliseconds (metadata not cached) /mnt/sdi/foobar: 327680 extents found fiemap took 393 milliseconds (metadata cached) That's a -46.4% total reduction for the metadata not cached case, and a -47.0% reduction for the cached metadata case. The test is somewhat limited in the sense the gains may be higher in practice, because in the test the filesystem is small, so we have small fs and extent trees, plus there's no concurrent access to the trees as well, therefore no lock contention there. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 94a3c6deafbb..232f49415ab9 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -140,6 +140,7 @@ struct share_check { struct btrfs_root *root; u64 inum; u64 data_bytenr; + u64 data_extent_gen; /* * Counts number of inodes that refer to an extent (different inodes in * the same root or different roots) that we could find. The sharedness @@ -1480,6 +1481,21 @@ again: * of the extent buffers in the path is referenced by other trees. */ if (sc && bytenr == sc->data_bytenr) { + /* + * If our data extent is from a generation more recent than the + * last generation used to snapshot the root, then we know that + * it can not be shared through subtrees, so we can skip + * resolving indirect references, there's no point in + * determining the extent buffers for the path from the fs tree + * root node down to the leaf that has the file extent item that + * points to the data extent. + */ + if (sc->data_extent_gen > + btrfs_root_last_snapshot(&sc->root->root_item)) { + ret = BACKREF_FOUND_NOT_SHARED; + goto out; + } + /* * If we are only determining if a data extent is shared or not * and the corresponding file extent item is located in the same @@ -1789,6 +1805,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, .root = root, .inum = btrfs_ino(inode), .data_bytenr = bytenr, + .data_extent_gen = extent_gen, .share_count = 0, .self_ref_count = 0, .have_delayed_delete_refs = false, @@ -1836,17 +1853,6 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, if (ret < 0 && ret != -ENOENT) break; ret = 0; - /* - * If our data extent is not shared through reflinks and it was - * created in a generation after the last one used to create a - * snapshot of the inode's root, then it can not be shared - * indirectly through subtrees, as that can only happen with - * snapshots. In this case bail out, no need to check for the - * sharedness of extent buffers. - */ - if (level == -1 && - extent_gen > btrfs_root_last_snapshot(&root->root_item)) - break; /* * If our data extent was not directly shared (without multiple From cc4804bfd6392bcb626a687689b36fd9cda87c11 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 13 Oct 2022 15:52:09 -0700 Subject: [PATCH 043/249] btrfs: skip reclaim if block_group is empty As we delete extents from a block group, at some deletion we cross below the reclaim threshold. It is possible we are still in the middle of deleting more extents and might soon hit 0. If the block group is empty by the time the reclaim worker runs, we will still relocate it. This works just fine, as relocating an empty block group ultimately results in properly deleting it. However, we have more direct ways of removing empty block groups in the cleaner thread. 
Those are either async discard or the unused_bgs list. In fact, when we decide whether to relocate a block group during extent deletion, we do check for emptiness and prefer the discard/unused_bgs mechanisms when possible. Not using relocation for this case reduces some modest overhead from empty bg relocation: - extra transactions - extra metadata use/churn for creating relocation metadata - trying to read the extent tree to look for extents (and in this case finding none) Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 3f8b1cbbbc43..684401aa014a 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1606,6 +1606,24 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) up_write(&space_info->groups_sem); goto next; } + if (bg->used == 0) { + /* + * It is possible that we trigger relocation on a block + * group as its extents are deleted and it first goes + * below the threshold, then shortly after goes empty. + * + * In this case, relocating it does delete it, but has + * some overhead in relocation specific metadata, looking + * for the non-existent extents and running some extra + * transactions, which we can avoid by using one of the + * other mechanisms for dealing with empty block groups. + */ + if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_mark_bg_unused(bg); + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + } spin_unlock(&bg->lock); /* Get out fast, in case we're unmounting the filesystem */ From 81531225e5bd50ce628816cc1445c8b52aa99db2 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Thu, 13 Oct 2022 15:52:10 -0700 Subject: [PATCH 044/249] btrfs: re-check reclaim condition in reclaim worker I have observed the following case play out and lead to unnecessary relocations: 1. write a file across multiple block groups 2. delete the file 3. several block groups fall below the reclaim threshold 4. reclaim the first, moving extents into the others 5. reclaim the others which are now actually very full, leading to poor reclaim behavior with lots of writing, allocating new block groups, etc. I believe the risk of missing some reasonable reclaims is worth it when traded off against the savings of avoiding overfull reclaims. Going forward, it could be interesting to make the check more advanced (zoned aware, fragmentation aware, etc...) so that it can be a really strong signal both at extent delete and reclaim time. 
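To make the tradeoff concrete, the check itself is tiny. The following
is a standalone userspace sketch of the same logic (div_factor_fine()
semantics and the "fake giant value" trick mirror the patch below; the
stub main() and the example numbers are illustrative only, not kernel
code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Same semantics as the kernel's div_factor_fine(): num * factor / 100. */
static uint64_t div_factor_fine(uint64_t num, int factor)
{
	return num * factor / 100;
}

static bool should_reclaim_block_group(uint64_t length, uint64_t used,
				       uint64_t bytes_freed, int reclaim_thresh)
{
	const uint64_t new_val = used;
	const uint64_t old_val = new_val + bytes_freed;
	uint64_t thresh;

	if (reclaim_thresh == 0)
		return false;

	thresh = div_factor_fine(length, reclaim_thresh);

	/* Already below the threshold before: likely a brand new block group. */
	if (old_val < thresh)
		return false;
	/* Still at or above the threshold: too full, not worth relocating. */
	if (new_val >= thresh)
		return false;
	return true;
}

int main(void)
{
	const uint64_t len = 1024 * 1024 * 1024;	/* 1GiB block group */

	/* Crossed below a 30% threshold by freeing extents: reclaim (1). */
	printf("%d\n", should_reclaim_block_group(len, 200 << 20, 200 << 20, 30));
	/* Re-check in the worker, bytes_freed == length: still 80% full (0). */
	printf("%d\n", should_reclaim_block_group(len, 800 << 20, len, 30));
	return 0;
}

Passing bg->length as bytes_freed makes old_val at least the threshold,
so only the new_val comparison can veto the reclaim, which is exactly
what the re-check in the worker wants.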
Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 65 ++++++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 684401aa014a..b1f4acdeafb9 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1539,6 +1539,30 @@ static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) return true; } +static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed) +{ + const struct btrfs_space_info *space_info = bg->space_info; + const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); + const u64 new_val = bg->used; + const u64 old_val = new_val + bytes_freed; + u64 thresh; + + if (reclaim_thresh == 0) + return false; + + thresh = div_factor_fine(bg->length, reclaim_thresh); + + /* + * If we were below the threshold before don't reclaim, we are likely a + * brand new block group and we don't want to relocate new block groups. + */ + if (old_val < thresh) + return false; + if (new_val >= thresh) + return false; + return true; +} + void btrfs_reclaim_bgs_work(struct work_struct *work) { struct btrfs_fs_info *fs_info = @@ -1623,6 +1647,22 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) spin_unlock(&bg->lock); up_write(&space_info->groups_sem); goto next; + + } + /* + * The block group might no longer meet the reclaim condition by + * the time we get around to reclaiming it, so to avoid + * reclaiming overly full block_groups, skip reclaiming them. + * + * Since the decision making process also depends on the amount + * being freed, pass in a fake giant value to skip that extra + * check, which is more meaningful when adding to the list in + * the first place. + */ + if (!should_reclaim_block_group(bg, bg->length)) { + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; } spin_unlock(&bg->lock); @@ -3241,31 +3281,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) return ret; } -static inline bool should_reclaim_block_group(struct btrfs_block_group *bg, - u64 bytes_freed) -{ - const struct btrfs_space_info *space_info = bg->space_info; - const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); - const u64 new_val = bg->used; - const u64 old_val = new_val + bytes_freed; - u64 thresh; - - if (reclaim_thresh == 0) - return false; - - thresh = div_factor_fine(bg->length, reclaim_thresh); - - /* - * If we were below the threshold before don't reclaim, we are likely a - * brand new block group and we don't want to relocate new block groups. - */ - if (old_val < thresh) - return false; - if (new_val >= thresh) - return false; - return true; -} - int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc) { From 879b2221983184cd61fe852f90f3c0d6dd647f67 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 13 Oct 2022 11:36:25 +0100 Subject: [PATCH 045/249] btrfs: switch GFP_ATOMIC to GFP_NOFS when fixing up low keys When fixing up the first key of each node above the current level, at fixup_low_keys(), we are doing a GFP_ATOMIC allocation for inserting an operation record for the tree mod log. However we can do just fine with GFP_NOFS nowadays. The need for GFP_ATOMIC was for the old days when we had custom locks with spinning behaviour for extent buffers and we were in spinning mode while at fixup_low_keys(). 
Now we use rw semaphores for extent buffer locks, so we can safely use GFP_NOFS. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5f461e05cc9c..e6182d530377 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2418,7 +2418,7 @@ static void fixup_low_keys(struct btrfs_path *path, break; t = path->nodes[i]; ret = btrfs_tree_mod_log_insert_key(t, tslot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC); + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); From 33cff222faffefc57f57326547e68133a7816e29 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 14 Oct 2022 14:44:33 +0100 Subject: [PATCH 046/249] btrfs: remove gfp_t flag from btrfs_tree_mod_log_insert_key() All callers of btrfs_tree_mod_log_insert_key() are now passing a GFP_NOFS flag to it, so remove the flag from it and from alloc_tree_mod_elem() and use it directly within alloc_tree_mod_elem(). Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 16 ++++++++-------- fs/btrfs/tree-mod-log.c | 19 +++++++++---------- fs/btrfs/tree-mod-log.h | 2 +- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e6182d530377..d645933ef135 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -489,7 +489,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); btrfs_tree_mod_log_insert_key(parent, parent_slot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, @@ -1018,7 +1018,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_disk_key right_key; btrfs_node_key(right, &right_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -1064,7 +1064,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -1166,7 +1166,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -1220,7 +1220,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, btrfs_node_key(right, &disk_key, 0); ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, - BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -2418,7 +2418,7 @@ static void fixup_low_keys(struct btrfs_path *path, break; t = path->nodes[i]; ret = btrfs_tree_mod_log_insert_key(t, tslot, - BTRFS_MOD_LOG_KEY_REPLACE, 
GFP_NOFS); + BTRFS_MOD_LOG_KEY_REPLACE); BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); @@ -2779,7 +2779,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans, } if (level) { ret = btrfs_tree_mod_log_insert_key(lower, slot, - BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); + BTRFS_MOD_LOG_KEY_ADD); BUG_ON(ret < 0); } btrfs_set_node_key(lower, key, slot); @@ -4219,7 +4219,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, (nritems - slot - 1)); } else if (level) { ret = btrfs_tree_mod_log_insert_key(parent, slot, - BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE); BUG_ON(ret < 0); } diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 8a3a14686d3e..18b4699bcb11 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -197,12 +197,11 @@ static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, int slot, - enum btrfs_mod_log_op op, - gfp_t flags) + enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; - tm = kzalloc(sizeof(*tm), flags); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) return NULL; @@ -220,7 +219,7 @@ static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, } int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, - enum btrfs_mod_log_op op, gfp_t flags) + enum btrfs_mod_log_op op) { struct tree_mod_elem *tm; int ret; @@ -228,7 +227,7 @@ int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, if (!tree_mod_need_log(eb->fs_info, eb)) return 0; - tm = alloc_tree_mod_elem(eb, slot, op, flags); + tm = alloc_tree_mod_elem(eb, slot, op); if (!tm) return -ENOMEM; @@ -276,7 +275,7 @@ int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, - BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -364,7 +363,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, } for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(old_root, i, - BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -502,14 +501,14 @@ int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, tm_list_rem = tm_list + nr_items; for (i = 0; i < nr_items; i++) { tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, - BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE); if (!tm_list_rem[i]) { ret = -ENOMEM; goto free_tms; } tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, - BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); + BTRFS_MOD_LOG_KEY_ADD); if (!tm_list_add[i]) { ret = -ENOMEM; goto free_tms; @@ -564,7 +563,7 @@ int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i, - BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h index 12605d19621b..8cffe0bc2a39 100644 --- a/fs/btrfs/tree-mod-log.h +++ b/fs/btrfs/tree-mod-log.h @@ -32,7 +32,7 @@ int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, struct extent_buffer *new_root, bool log_removal); int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, - 
				  enum btrfs_mod_log_op op, gfp_t flags);
+				  enum btrfs_mod_log_op op);
 int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb);
 struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
 						struct btrfs_path *path,

From 5565b8e0adcdd8ee32d02d82d98cdf6d966793c7 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Wed, 12 Oct 2022 17:22:35 +0800
Subject: [PATCH 047/249] btrfs: make module init/exit match their sequence

[BACKGROUND]
In theory init_btrfs_fs() and exit_btrfs_fs() should match each other's
sequence, thus normally they should look like this:

 init_btrfs_fs()       |  exit_btrfs_fs()
----------------------+------------------------
 init_A();            |
 init_B();            |
 init_C();            |
                      |  exit_C();
                      |  exit_B();
                      |  exit_A();

The same should hold for the error path of init_btrfs_fs().

But that's not the case: some exit functions don't match the sequence
of their init functions in init_btrfs_fs().

Furthermore, in init_btrfs_fs() we need a new error label for each init
function we add.  This does not scale well, especially as we may soon
add several new functions to init_btrfs_fs().

[ENHANCEMENT]
This patch introduces the following to improve the situation:

- struct init_sequence
  Just a wrapper of init and exit function pointers.

  The init function must use int type as return value, thus some init
  functions need to be updated to return 0.

  The exit function can be NULL, as some init steps only output a
  message.

- mod_init_seq[] array
  This is a const array, recording all the initialization we need to do
  in init_btrfs_fs(), in the same order as the old init_btrfs_fs().

- mod_init_result[] array
  This is a bool array, recording whether each entry in mod_init_seq[]
  has been initialized.

  The reason to split mod_init_seq[] and mod_init_result[] is to avoid
  a section mismatch reference.

  All init functions are in .init.text, but if mod_init_seq[] recorded
  an @initialized member it could no longer be const, would thus be put
  into the .data section, and would cause a modpost warning.

For init_btrfs_fs() we just call all init functions in the order of the
mod_init_seq[] array, and after each call set the corresponding
mod_init_result[] entry to true.

For exit_btrfs_fs() and the error handling path of init_btrfs_fs(), we
just iterate mod_init_seq[] in reverse order and skip all uninitialized
entries.

With this patch, init_btrfs_fs()/exit_btrfs_fs() will be much easier to
expand and will always follow the strict order.
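The pattern itself is not btrfs specific. Here is a minimal, compilable
userspace sketch of the same two-array scheme (init_a()/init_b() and
the main() driver are made up for illustration; in the kernel the
unwind loop is duplicated because calling exit_btrfs_fs() from the init
path would itself cause a section mismatch):

#include <stdbool.h>
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

struct init_sequence {
	int (*init_func)(void);
	void (*exit_func)(void);	/* may be NULL */
};

static int init_a(void) { puts("init A"); return 0; }
static void exit_a(void) { puts("exit A"); }
static int init_b(void) { puts("init B"); return 0; }
static void exit_b(void) { puts("exit B"); }

/* Const table of steps; the order defines both setup and teardown. */
static const struct init_sequence mod_init_seq[] = {
	{ .init_func = init_a, .exit_func = exit_a },
	{ .init_func = init_b, .exit_func = exit_b },
};

/* Kept separate so the table itself can stay const. */
static bool mod_init_result[ARRAY_SIZE(mod_init_seq)];

static void exit_all(void)
{
	for (int i = (int)ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) {
		if (!mod_init_result[i])
			continue;
		if (mod_init_seq[i].exit_func)
			mod_init_seq[i].exit_func();
		mod_init_result[i] = false;
	}
}

static int init_all(void)
{
	for (size_t i = 0; i < ARRAY_SIZE(mod_init_seq); i++) {
		int ret = mod_init_seq[i].init_func();

		if (ret < 0) {
			exit_all();	/* unwind only what succeeded */
			return ret;
		}
		mod_init_result[i] = true;
	}
	return 0;
}

int main(void)
{
	if (init_all() == 0)
		exit_all();
	return 0;
}

Adding a new init step is now a single table entry instead of a new
call site plus a new error label.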
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/compression.c | 3 +- fs/btrfs/compression.h | 2 +- fs/btrfs/props.c | 3 +- fs/btrfs/props.h | 2 +- fs/btrfs/super.c | 252 ++++++++++++++++++++--------------------- 5 files changed, 127 insertions(+), 135 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e6635fe70067..65a4e5087215 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -1243,12 +1243,13 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, return ret; } -void __init btrfs_init_compress(void) +int __init btrfs_init_compress(void) { btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE); btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB); btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO); zstd_init_workspace_manager(); + return 0; } void __cold btrfs_exit_compress(void) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 1aa02903de69..9da2343eeff5 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -77,7 +77,7 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level) return ((type_level & 0xF0) >> 4); } -void __init btrfs_init_compress(void); +int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 07f62e3ba6a5..e04289347775 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -454,7 +454,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, return 0; } -void __init btrfs_props_init(void) +int __init btrfs_props_init(void) { int i; @@ -464,5 +464,6 @@ void __init btrfs_props_init(void) hash_add(prop_handlers_ht, &p->node, h); } + return 0; } diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index ca9dd3df129b..6e283196e38a 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -8,7 +8,7 @@ #include "ctree.h" -void __init btrfs_props_init(void); +int __init btrfs_props_init(void); int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const char *value, size_t value_len, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0a93fbd29494..3a33dd9847d9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2694,7 +2694,7 @@ static __cold void btrfs_interface_exit(void) misc_deregister(&btrfs_misc); } -static void __init btrfs_print_mod_info(void) +static int __init btrfs_print_mod_info(void) { static const char options[] = "" #ifdef CONFIG_BTRFS_DEBUG @@ -2721,143 +2721,133 @@ static void __init btrfs_print_mod_info(void) #endif ; pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options); + return 0; +} + +static int register_btrfs(void) +{ + return register_filesystem(&btrfs_fs_type); +} + +static void unregister_btrfs(void) +{ + unregister_filesystem(&btrfs_fs_type); +} + +/* Helper structure for long init/exit functions. */ +struct init_sequence { + int (*init_func)(void); + /* Can be NULL if the init_func doesn't need cleanup. 
*/ + void (*exit_func)(void); +}; + +static const struct init_sequence mod_init_seq[] = { + { + .init_func = btrfs_props_init, + .exit_func = NULL, + }, { + .init_func = btrfs_init_sysfs, + .exit_func = btrfs_exit_sysfs, + }, { + .init_func = btrfs_init_compress, + .exit_func = btrfs_exit_compress, + }, { + .init_func = btrfs_init_cachep, + .exit_func = btrfs_destroy_cachep, + }, { + .init_func = btrfs_transaction_init, + .exit_func = btrfs_transaction_exit, + }, { + .init_func = btrfs_ctree_init, + .exit_func = btrfs_ctree_exit, + }, { + .init_func = btrfs_free_space_init, + .exit_func = btrfs_free_space_exit, + }, { + .init_func = extent_state_init_cachep, + .exit_func = extent_state_free_cachep, + }, { + .init_func = extent_buffer_init_cachep, + .exit_func = extent_buffer_free_cachep, + }, { + .init_func = btrfs_bioset_init, + .exit_func = btrfs_bioset_exit, + }, { + .init_func = extent_map_init, + .exit_func = extent_map_exit, + }, { + .init_func = ordered_data_init, + .exit_func = ordered_data_exit, + }, { + .init_func = btrfs_delayed_inode_init, + .exit_func = btrfs_delayed_inode_exit, + }, { + .init_func = btrfs_auto_defrag_init, + .exit_func = btrfs_auto_defrag_exit, + }, { + .init_func = btrfs_delayed_ref_init, + .exit_func = btrfs_delayed_ref_exit, + }, { + .init_func = btrfs_prelim_ref_init, + .exit_func = btrfs_prelim_ref_exit, + }, { + .init_func = btrfs_interface_init, + .exit_func = btrfs_interface_exit, + }, { + .init_func = btrfs_print_mod_info, + .exit_func = NULL, + }, { + .init_func = btrfs_run_sanity_tests, + .exit_func = NULL, + }, { + .init_func = register_btrfs, + .exit_func = unregister_btrfs, + } +}; + +static bool mod_init_result[ARRAY_SIZE(mod_init_seq)]; + +static void __exit exit_btrfs_fs(void) +{ + int i; + + for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) { + if (!mod_init_result[i]) + continue; + if (mod_init_seq[i].exit_func) + mod_init_seq[i].exit_func(); + mod_init_result[i] = false; + } } static int __init init_btrfs_fs(void) { - int err; - - btrfs_props_init(); - - err = btrfs_init_sysfs(); - if (err) - return err; - - btrfs_init_compress(); - - err = btrfs_init_cachep(); - if (err) - goto free_compress; - - err = btrfs_transaction_init(); - if (err) - goto free_cachep; - - err = btrfs_ctree_init(); - if (err) - goto free_transaction; - - err = btrfs_free_space_init(); - if (err) - goto free_ctree; - - err = extent_state_init_cachep(); - if (err) - goto free_free_space; - - err = extent_buffer_init_cachep(); - if (err) - goto free_extent_cachep; - - err = btrfs_bioset_init(); - if (err) - goto free_eb_cachep; - - err = extent_map_init(); - if (err) - goto free_bioset; - - err = ordered_data_init(); - if (err) - goto free_extent_map; - - err = btrfs_delayed_inode_init(); - if (err) - goto free_ordered_data; - - err = btrfs_auto_defrag_init(); - if (err) - goto free_delayed_inode; - - err = btrfs_delayed_ref_init(); - if (err) - goto free_auto_defrag; - - err = btrfs_prelim_ref_init(); - if (err) - goto free_delayed_ref; - - err = btrfs_interface_init(); - if (err) - goto free_prelim_ref; - - btrfs_print_mod_info(); - - err = btrfs_run_sanity_tests(); - if (err) - goto unregister_ioctl; - - err = register_filesystem(&btrfs_fs_type); - if (err) - goto unregister_ioctl; + int ret; + int i; + for (i = 0; i < ARRAY_SIZE(mod_init_seq); i++) { + ASSERT(!mod_init_result[i]); + ret = mod_init_seq[i].init_func(); + if (ret < 0) + goto error; + mod_init_result[i] = true; + } return 0; -unregister_ioctl: - btrfs_interface_exit(); -free_prelim_ref: - 
btrfs_prelim_ref_exit(); -free_delayed_ref: - btrfs_delayed_ref_exit(); -free_auto_defrag: - btrfs_auto_defrag_exit(); -free_delayed_inode: - btrfs_delayed_inode_exit(); -free_ordered_data: - ordered_data_exit(); -free_extent_map: - extent_map_exit(); -free_bioset: - btrfs_bioset_exit(); -free_eb_cachep: - extent_buffer_free_cachep(); -free_extent_cachep: - extent_state_free_cachep(); -free_free_space: - btrfs_free_space_exit(); -free_ctree: - btrfs_ctree_exit(); -free_transaction: - btrfs_transaction_exit(); -free_cachep: - btrfs_destroy_cachep(); -free_compress: - btrfs_exit_compress(); - btrfs_exit_sysfs(); - - return err; -} - -static void __exit exit_btrfs_fs(void) -{ - btrfs_free_space_exit(); - btrfs_ctree_exit(); - btrfs_transaction_exit(); - btrfs_destroy_cachep(); - btrfs_delayed_ref_exit(); - btrfs_auto_defrag_exit(); - btrfs_delayed_inode_exit(); - btrfs_prelim_ref_exit(); - ordered_data_exit(); - extent_map_exit(); - btrfs_bioset_exit(); - extent_state_free_cachep(); - extent_buffer_free_cachep(); - btrfs_interface_exit(); - unregister_filesystem(&btrfs_fs_type); - btrfs_exit_sysfs(); - btrfs_cleanup_fs_uuids(); - btrfs_exit_compress(); +error: + /* + * If we call exit_btrfs_fs() it would cause section mismatch. + * As init_btrfs_fs() belongs to .init.text, while exit_btrfs_fs() + * belongs to .exit.text. + */ + for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) { + if (!mod_init_result[i]) + continue; + if (mod_init_seq[i].exit_func) + mod_init_seq[i].exit_func(); + mod_init_result[i] = false; + } + return ret; } late_initcall(init_btrfs_fs); From d549ff7bdbe72f5e0e934895090fed2647e9813b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:20:25 +0200 Subject: [PATCH 048/249] btrfs: add helper for bit enumeration Define helper macro that can be used in enum {} to utilize the automatic increment to define all bits without directly defining the values or using additional linear bits. 1. capture the sequence value, N 2. use the value to define the given enum with N-th bit set 3. reset the sequence back to N Use for enums that do not require fixed values for symbolic names (like for on-disk structures): enum { ENUM_BIT(FIRST), ENUM_BIT(SECOND), ENUM_BIT(THIRD) }; Where the values would be 0x1, 0x2 and 0x4. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/misc.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index f9850edfd726..69826ccd6644 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -10,6 +10,14 @@ #define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) +/* + * Enumerate bits using enum autoincrement. Define the @name as the n-th bit. 
+ */ +#define ENUM_BIT(name) \ + __ ## name ## _BIT, \ + name = (1U << __ ## name ## _BIT), \ + __ ## name ## _SEQ = __ ## name ## _BIT + static inline void cond_wake_up(struct wait_queue_head *wq) { /* From c7321b76dfcc8a1b8d7dad3eff3595bad28252e2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:27:45 +0200 Subject: [PATCH 049/249] btrfs: convert BTRFS_ILOCK-* defines to enum bit Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c0c0dc8fa99f..5adf76d89f9b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -34,6 +34,7 @@ #include "async-thread.h" #include "block-rsv.h" #include "locking.h" +#include "misc.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -3131,9 +3132,11 @@ struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, extern const struct dentry_operations btrfs_dentry_operations; /* Inode locking type flags, by default the exclusive lock is taken */ -#define BTRFS_ILOCK_SHARED (1U << 0) -#define BTRFS_ILOCK_TRY (1U << 1) -#define BTRFS_ILOCK_MMAP (1U << 2) +enum btrfs_ilock_type { + ENUM_BIT(BTRFS_ILOCK_SHARED), + ENUM_BIT(BTRFS_ILOCK_TRY), + ENUM_BIT(BTRFS_ILOCK_MMAP), +}; int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); From fd8d2951f478c095f5e65c5f14f2be4d4724c5ea Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:31:38 +0200 Subject: [PATCH 050/249] btrfs: convert extent_io page op defines to enum bits Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent_io.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 7929f054dda3..a5ec1475988f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -9,6 +9,7 @@ #include #include "compression.h" #include "ulist.h" +#include "misc.h" enum { EXTENT_BUFFER_UPTODATE, @@ -29,13 +30,15 @@ enum { }; /* these are flags for __process_pages_contig */ -#define PAGE_UNLOCK (1 << 0) -/* Page starts writeback, clear dirty bit and set writeback bit */ -#define PAGE_START_WRITEBACK (1 << 1) -#define PAGE_END_WRITEBACK (1 << 2) -#define PAGE_SET_ORDERED (1 << 3) -#define PAGE_SET_ERROR (1 << 4) -#define PAGE_LOCK (1 << 5) +enum { + ENUM_BIT(PAGE_UNLOCK), + /* Page starts writeback, clear dirty bit and set writeback bit */ + ENUM_BIT(PAGE_START_WRITEBACK), + ENUM_BIT(PAGE_END_WRITEBACK), + ENUM_BIT(PAGE_SET_ORDERED), + ENUM_BIT(PAGE_SET_ERROR), + ENUM_BIT(PAGE_LOCK), +}; /* * page->private values. 
Every page that is controlled by the extent From d3b4d0fd5518fd000938c6cbd4817085c210dcbe Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:34:58 +0200 Subject: [PATCH 051/249] btrfs: convert EXTENT_* bits to enums Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 71 +++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index c71aa29f719d..673637ed2fa9 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -3,43 +3,48 @@ #ifndef BTRFS_EXTENT_IO_TREE_H #define BTRFS_EXTENT_IO_TREE_H +#include "misc.h" + struct extent_changeset; struct io_failure_record; /* Bits for the extent state */ -#define EXTENT_DIRTY (1U << 0) -#define EXTENT_UPTODATE (1U << 1) -#define EXTENT_LOCKED (1U << 2) -#define EXTENT_NEW (1U << 3) -#define EXTENT_DELALLOC (1U << 4) -#define EXTENT_DEFRAG (1U << 5) -#define EXTENT_BOUNDARY (1U << 6) -#define EXTENT_NODATASUM (1U << 7) -#define EXTENT_CLEAR_META_RESV (1U << 8) -#define EXTENT_NEED_WAIT (1U << 9) -#define EXTENT_NORESERVE (1U << 11) -#define EXTENT_QGROUP_RESERVED (1U << 12) -#define EXTENT_CLEAR_DATA_RESV (1U << 13) -/* - * Must be cleared only during ordered extent completion or on error paths if we - * did not manage to submit bios and create the ordered extents for the range. - * Should not be cleared during page release and page invalidation (if there is - * an ordered extent in flight), that is left for the ordered extent completion. - */ -#define EXTENT_DELALLOC_NEW (1U << 14) -/* - * When an ordered extent successfully completes for a region marked as a new - * delalloc range, use this flag when clearing a new delalloc range to indicate - * that the VFS' inode number of bytes should be incremented and the inode's new - * delalloc bytes decremented, in an atomic way to prevent races with stat(2). - */ -#define EXTENT_ADD_INODE_BYTES (1U << 15) - -/* - * Set during truncate when we're clearing an entire range and we just want the - * extent states to go away. - */ -#define EXTENT_CLEAR_ALL_BITS (1U << 16) +enum { + ENUM_BIT(EXTENT_DIRTY), + ENUM_BIT(EXTENT_UPTODATE), + ENUM_BIT(EXTENT_LOCKED), + ENUM_BIT(EXTENT_NEW), + ENUM_BIT(EXTENT_DELALLOC), + ENUM_BIT(EXTENT_DEFRAG), + ENUM_BIT(EXTENT_BOUNDARY), + ENUM_BIT(EXTENT_NODATASUM), + ENUM_BIT(EXTENT_CLEAR_META_RESV), + ENUM_BIT(EXTENT_NEED_WAIT), + ENUM_BIT(EXTENT_NORESERVE), + ENUM_BIT(EXTENT_QGROUP_RESERVED), + ENUM_BIT(EXTENT_CLEAR_DATA_RESV), + /* + * Must be cleared only during ordered extent completion or on error + * paths if we did not manage to submit bios and create the ordered + * extents for the range. Should not be cleared during page release + * and page invalidation (if there is an ordered extent in flight), + * that is left for the ordered extent completion. + */ + ENUM_BIT(EXTENT_DELALLOC_NEW), + /* + * When an ordered extent successfully completes for a region marked as + * a new delalloc range, use this flag when clearing a new delalloc + * range to indicate that the VFS' inode number of bytes should be + * incremented and the inode's new delalloc bytes decremented, in an + * atomic way to prevent races with stat(2). + */ + ENUM_BIT(EXTENT_ADD_INODE_BYTES), + /* + * Set during truncate when we're clearing an entire range and we just + * want the extent states to go away. 
+ */ + ENUM_BIT(EXTENT_CLEAR_ALL_BITS), +}; #define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ EXTENT_CLEAR_DATA_RESV) From e0a8b9a74767f748881b1e2408ed9afe7a26d9fd Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:37:35 +0200 Subject: [PATCH 052/249] btrfs: convert QGROUP_* defines to enum bits The defines/enums are used only for tracepoints and are not part of the on-disk format. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/qgroup.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 578c77e94200..3fb5459c9309 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -11,6 +11,7 @@ #include #include "ulist.h" #include "delayed-ref.h" +#include "misc.h" /* * Btrfs qgroup overview @@ -242,9 +243,11 @@ static inline u64 btrfs_qgroup_subvolid(u64 qgroupid) /* * For qgroup event trace points only */ -#define QGROUP_RESERVE (1<<0) -#define QGROUP_RELEASE (1<<1) -#define QGROUP_FREE (1<<2) +enum { + ENUM_BIT(QGROUP_RESERVE), + ENUM_BIT(QGROUP_RELEASE), + ENUM_BIT(QGROUP_FREE), +}; int btrfs_quota_enable(struct btrfs_fs_info *fs_info); int btrfs_quota_disable(struct btrfs_fs_info *fs_info); From cc37ea61920eca87316be1ae255e5e5e7b12f7b6 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 9 Sep 2022 17:43:06 +0200 Subject: [PATCH 053/249] btrfs: convert __TRANS_* defines to enum bits The base transaction bits can be defined as bits in a contiguous sequence, although right now there's a hole from bit 1 to 8. The bits are used for btrfs_trans_handle::type, and there's another set of TRANS_STATE_* defines that are for btrfs_transaction::state. They are mutually exclusive though the hole in the sequence looks like was made for the states. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/transaction.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 1332f193b800..b39ebf98af60 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -10,6 +10,7 @@ #include "btrfs_inode.h" #include "delayed-ref.h" #include "ctree.h" +#include "misc.h" enum btrfs_trans_state { TRANS_STATE_RUNNING, @@ -98,14 +99,15 @@ struct btrfs_transaction { struct list_head releasing_ebs; }; -#define __TRANS_FREEZABLE (1U << 0) - -#define __TRANS_START (1U << 9) -#define __TRANS_ATTACH (1U << 10) -#define __TRANS_JOIN (1U << 11) -#define __TRANS_JOIN_NOLOCK (1U << 12) -#define __TRANS_DUMMY (1U << 13) -#define __TRANS_JOIN_NOSTART (1U << 14) +enum { + ENUM_BIT(__TRANS_FREEZABLE), + ENUM_BIT(__TRANS_START), + ENUM_BIT(__TRANS_ATTACH), + ENUM_BIT(__TRANS_JOIN), + ENUM_BIT(__TRANS_JOIN_NOLOCK), + ENUM_BIT(__TRANS_DUMMY), + ENUM_BIT(__TRANS_JOIN_NOSTART), +}; #define TRANS_START (__TRANS_START | __TRANS_FREEZABLE) #define TRANS_ATTACH (__TRANS_ATTACH) From 7248e0cebbefaba94c0c37f708a134dad3acba0e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 9 Sep 2022 14:45:22 +0800 Subject: [PATCH 054/249] btrfs: skip update of block group item if used bytes are the same [BACKGROUND] When committing a transaction, we will update block group items for all dirty block groups. But in fact, dirty block groups don't always need to update their block group items. It's pretty common to have a metadata block group which experienced several COW operations, but still have the same amount of used bytes. In that case, we may unnecessarily COW a tree block doing nothing. 
[ENHANCEMENT]
This patch will introduce a btrfs_block_group::commit_used member to
remember the last used bytes, and use that new member to skip
unnecessary block group item updates.

This would be more common for large filesystems, where a metadata block
group can be as large as 1GiB, containing at most 64K metadata items.

In that case, if COW added and then deleted one metadata item near the
end of the block group, then it's completely possible we don't need to
touch the block group item at all.

[BENCHMARK]
The change itself can have quite a high chance (20~80%) to skip block
group item updates in a lot of workloads.

As a result, it leads to shorter time spent on
btrfs_write_dirty_block_groups(), and overall reduces the execution
time of the critical section of btrfs_commit_transaction().

Here comes a fio command, which will do random writes in 4K block size,
causing very heavy metadata updates:

fio --filename=$mnt/file --size=512M --rw=randwrite --direct=1 --bs=4k \
	--ioengine=libaio --iodepth=64 --runtime=300 --numjobs=4 \
	--name=random_write --fallocate=none --time_based --fsync_on_close=1

The file size (512M) and number of threads (4) mean 2GiB of file data
in total, but during the full 300s run time, my dedicated SATA SSD is
able to write around 20~25GiB, which is over 10 times the file size.

Thus after we fill the initial 2G, we should not cause many block group
item updates.

Please note, the fio numbers by themselves don't change much, but if we
look deeper, there is some reduced execution time, especially for the
critical section of btrfs_commit_transaction().

I added extra trace_printk() to measure the following per-transaction
execution times:

- Critical section of btrfs_commit_transaction()
  By re-using the existing update_commit_stats() function, which has
  already calculated the interval correctly.

- The while() loop for btrfs_write_dirty_block_groups()
  Although this includes the execution time of btrfs_run_delayed_refs(),
  it should still be representative overall.

Both results involve transid 7~30, i.e. the same number of committed
transactions.

The result looks like this:

                      |      Before       |     After      |  Diff
----------------------+-------------------+----------------+--------
Transaction interval  |    229247198.5    |   215016933.6  | -6.2%
Block group interval  |    23133.33333    |   18970.83333  | -18.0%

The change in block group item updates is more obvious, as skipped
block group item updates also mean fewer delayed refs.

And the overall execution time for that block group update loop is
pretty small, thus we can assume the extent tree is already mostly
cached.  If we could skip an uncached tree block, it would cause a more
obvious change.

Unfortunately the overall reduction in the commit transaction critical
section is much smaller, as the block group item update loop is not
really the major part, at least not for the above fio script.

But still we have an observable reduction in the critical section.
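Stripped of the btree plumbing, the new logic is a remember-and-compare
pattern. A small userspace sketch follows (locking elided and
write_item() standing in for the real block group item update; both are
illustrative assumptions, see the diff below for the real code):

#include <stdint.h>
#include <stdio.h>

struct block_group {
	uint64_t used;		/* live counter, changes as extents come and go */
	uint64_t commit_used;	/* value last written to the block group item */
};

/* Stand-in for the real btree update; pretend it always succeeds. */
static int write_item(uint64_t used)
{
	printf("writing block group item, used=%llu\n",
	       (unsigned long long)used);
	return 0;
}

static int update_block_group_item(struct block_group *bg)
{
	uint64_t old_commit_used = bg->commit_used;
	uint64_t used = bg->used;	/* sampled under bg->lock in the kernel */
	int ret;

	/* No change in used bytes since the last commit: skip the write. */
	if (bg->commit_used == used)
		return 0;
	bg->commit_used = used;

	ret = write_item(used);
	/* On failure, revert so the next commit retries the update. */
	if (ret < 0)
		bg->commit_used = old_commit_used;
	return ret;
}

int main(void)
{
	struct block_group bg = { .used = 4096, .commit_used = 0 };

	update_block_group_item(&bg);	/* writes the item */
	update_block_group_item(&bg);	/* skipped: nothing changed */
	return 0;
}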
Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 28 +++++++++++++++++++++++++++- fs/btrfs/block-group.h | 6 ++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index b1f4acdeafb9..47461fd21975 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2071,6 +2071,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->length = key->offset; cache->used = btrfs_stack_block_group_used(bgi); + cache->commit_used = cache->used; cache->flags = btrfs_stack_block_group_flags(bgi); cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); @@ -2762,6 +2763,25 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_block_group_item bgi; struct btrfs_key key; + u64 old_commit_used; + u64 used; + + /* + * Block group items update can be triggered out of commit transaction + * critical section, thus we need a consistent view of used bytes. + * We cannot use cache->used directly outside of the spin lock, as it + * may be changed. + */ + spin_lock(&cache->lock); + old_commit_used = cache->commit_used; + used = cache->used; + /* No change in used bytes, can safely skip it. */ + if (cache->commit_used == used) { + spin_unlock(&cache->lock); + return 0; + } + cache->commit_used = used; + spin_unlock(&cache->lock); key.objectid = cache->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; @@ -2776,7 +2796,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; bi = btrfs_item_ptr_offset(leaf, path->slots[0]); - btrfs_set_stack_block_group_used(&bgi, cache->used); + btrfs_set_stack_block_group_used(&bgi, used); btrfs_set_stack_block_group_chunk_objectid(&bgi, cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); @@ -2784,6 +2804,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); fail: btrfs_release_path(path); + /* We didn't update the block group item, need to revert @commit_used. */ + if (ret < 0) { + spin_lock(&cache->lock); + cache->commit_used = old_commit_used; + spin_unlock(&cache->lock); + } return ret; } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 39e79bed10ae..6c970a486b68 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -99,6 +99,12 @@ struct btrfs_block_group { u64 cache_generation; u64 global_root_id; + /* + * The last committed used bytes of this block group, if the above @used + * is still the same as @commit_used, we don't need to update block + * group item of this block group. + */ + u64 commit_used; /* * If the free space extent count exceeds this number, convert the block * group to bitmaps. From 48acc47d7813a0e650754845161f04b0b27ff8ac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2022 10:00:39 -0400 Subject: [PATCH 055/249] btrfs: do not use GFP_ATOMIC in the read endio We have done read endio in an async thread for a very, very long time, which makes the use of GFP_ATOMIC and unlock_extent_atomic() unneeded in our read endio path. We've noticed under heavy memory pressure in our fleet that we can fail these allocations, and then often trip a BUG_ON(!allocation), which isn't an ideal outcome. Begin to address this by simply not using GFP_ATOMIC, which will allow us to do things like actually allocate a extent state when doing set_extent_bits(UPTODATE) in the endio handler. 
End io handlers are not called in atomic context, besides we have been allocating failrec with GFP_NOFS so we'd notice there's a problem. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4e4f28387ace..78d7ea10621d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -897,9 +897,9 @@ static void end_sector_io(struct page *page, u64 offset, bool uptodate) end_page_read(page, uptodate, offset, sectorsize); if (uptodate) set_extent_uptodate(&inode->io_tree, offset, - offset + sectorsize - 1, &cached, GFP_ATOMIC); - unlock_extent_atomic(&inode->io_tree, offset, offset + sectorsize - 1, - &cached); + offset + sectorsize - 1, &cached, GFP_NOFS); + unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, + &cached); } static void submit_data_read_repair(struct inode *inode, @@ -1103,7 +1103,7 @@ static void endio_readpage_release_extent(struct processed_extent *processed, * Now we don't have range contiguous to the processed range, release * the processed range now. */ - unlock_extent_atomic(tree, processed->start, processed->end, &cached); + unlock_extent(tree, processed->start, processed->end, &cached); update: /* Update processed to current range */ From da2a071b6f142f47072ea7b37e03d3ef317b8a4e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2022 10:00:40 -0400 Subject: [PATCH 056/249] btrfs: remove unused unlock_extent_atomic As of "btrfs: do not use GFP_ATOMIC in the read endio" we no longer have any users of unlock_extent_atomic, remove it. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 673637ed2fa9..d73ef24bad2e 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -145,13 +145,6 @@ static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, GFP_NOFS, NULL); } -static inline int unlock_extent_atomic(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, - GFP_ATOMIC, NULL); -} - static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits) { From 5a75034e71ef5ec0fce983afcb6c9cb0147cd5b9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Oct 2022 10:00:41 -0400 Subject: [PATCH 057/249] btrfs: do not panic if we can't allocate a prealloc extent state We sometimes have to allocate new extent states when clearing or setting new bits in an extent io tree. Generally we preallocate this before taking the tree spin lock, but we can use this preallocated extent state sometimes and then need to try to do a GFP_ATOMIC allocation under the lock. Unfortunately sometimes this fails, and then we hit the BUG_ON() and bring the box down. This happens roughly 20 times a week in our fleet. However the vast majority of callers use GFP_NOFS, which means that if this GFP_ATOMIC allocation fails, we could simply drop the spin lock, go back and allocate a new extent state with our given gfp mask, and begin again from where we left off. For the remaining callers that do not use GFP_NOFS, they are generally using GFP_NOWAIT, which still allows for some reclaim. 
So allow these allocations to attempt to happen outside of the spin lock so we don't need to rely on GFP_ATOMIC allocations. This in essence creates an infinite loop for anything that isn't GFP_NOFS. To address this we may want to migrate to using mempools for extent states so that we will always have emergency reserves in order to make our allocations. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index a630c771d25c..599db7b15574 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -572,7 +572,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY)) clear = 1; again: - if (!prealloc && gfpflags_allow_blocking(mask)) { + if (!prealloc) { /* * Don't care for allocation failure here because we might end * up not needing the pre-allocated extent state at all, which @@ -636,7 +636,8 @@ hit_next: if (state->start < start) { prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; err = split_state(tree, state, prealloc, start); if (err) extent_io_tree_panic(tree, err); @@ -657,7 +658,8 @@ hit_next: */ if (state->start <= end && state->end > end) { prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; err = split_state(tree, state, prealloc, end + 1); if (err) extent_io_tree_panic(tree, err); @@ -987,7 +989,7 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, else ASSERT(failed_start == NULL && failed_state == NULL); again: - if (!prealloc && gfpflags_allow_blocking(mask)) { + if (!prealloc) { /* * Don't care for allocation failure here because we might end * up not needing the pre-allocated extent state at all, which @@ -1012,7 +1014,8 @@ again: state = tree_search_for_insert(tree, start, &p, &parent); if (!state) { prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; prealloc->start = start; prealloc->end = end; insert_state_fast(tree, prealloc, p, parent, bits, changeset); @@ -1085,7 +1088,8 @@ hit_next: } prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; err = split_state(tree, state, prealloc, start); if (err) extent_io_tree_panic(tree, err); @@ -1122,7 +1126,8 @@ hit_next: this_end = last_start - 1; prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; /* * Avoid to free 'prealloc' if it can be merged with the later @@ -1154,7 +1159,8 @@ hit_next: } prealloc = alloc_extent_state_atomic(prealloc); - BUG_ON(!prealloc); + if (!prealloc) + goto search_again; err = split_state(tree, state, prealloc, end + 1); if (err) extent_io_tree_panic(tree, err); From 467761f90405004e58795116aa7d81f3d298dcfb Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 30 Sep 2022 17:23:01 +0200 Subject: [PATCH 058/249] btrfs: sysfs: convert remaining scnprintf to sysfs_emit The sysfs_emit is the safe API for writing to the sysfs files, previously converted from scnprintf, there's one left to do in btrfs_read_policy_show. 
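One reason this conversion is mechanical: sysfs_emit_at() bounds-checks the write offset against PAGE_SIZE internally, so callers no longer have to thread "PAGE_SIZE - ret" through every call. A hypothetical show callback using the same accumulate-and-emit pattern:

  #include <linux/sysfs.h>
  #include <linux/kobject.h>

  static ssize_t example_show(struct kobject *kobj,
                              struct kobj_attribute *attr, char *buf)
  {
          ssize_t ret = 0;
          int i;

          /* Emit a space-separated list, then a trailing newline. */
          for (i = 0; i < 3; i++)
                  ret += sysfs_emit_at(buf, ret, "%soption%d",
                                       (ret == 0 ? "" : " "), i);
          ret += sysfs_emit_at(buf, ret, "\n");

          return ret;
  }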
Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 74fef1f49c35..910702df7767 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1160,16 +1160,16 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj, for (i = 0; i < BTRFS_NR_READ_POLICY; i++) { if (fs_devices->read_policy == i) - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s[%s]", + ret += sysfs_emit_at(buf, ret, "%s[%s]", (ret == 0 ? "" : " "), btrfs_read_policy_name[i]); else - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s", + ret += sysfs_emit_at(buf, ret, "%s%s", (ret == 0 ? "" : " "), btrfs_read_policy_name[i]); } - ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n"); + ret += sysfs_emit_at(buf, ret, "\n"); return ret; } From 63a7cb13071842966c1ce931edacbc23573aada5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 26 Jul 2022 20:54:10 +0200 Subject: [PATCH 059/249] btrfs: auto enable discard=async when possible There's a request to automatically enable async discard for capable devices. We can do that, the async mode is designed to wait for larger freed extents and is not intrusive, with limits to iops, kbps or latency. The status and tunables will be exported in /sys/fs/btrfs/FSID/discard . The automatic selection is done if there's at least one discard capable device in the filesystem (not capable devices are skipped). Mounting with any other discard option will honor that option, notably mounting with nodiscard will keep it disabled. Link: https://lore.kernel.org/linux-btrfs/CAEg-Je_b1YtdsCR0zS5XZ_SbvJgN70ezwvRwLiCZgDGLbeMB=w@mail.gmail.com/ Reviewed-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/disk-io.c | 14 ++++++++++++++ fs/btrfs/super.c | 2 ++ fs/btrfs/volumes.c | 3 +++ fs/btrfs/volumes.h | 2 ++ 5 files changed, 22 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5adf76d89f9b..3485f60efa33 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1396,6 +1396,7 @@ enum { BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), + BTRFS_MOUNT_NODISCARD = (1UL << 31), }; #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5705335c80c8..cf59d8f14a05 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3746,6 +3746,20 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations"); } + /* + * For devices supporting discard turn on discard=async automatically, + * unless it's already set or disabled. This could be turned off by + * nodiscard for the same mount. 
+ */ + if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_test_opt(fs_info, DISCARD_ASYNC) || + btrfs_test_opt(fs_info, NODISCARD)) && + fs_info->fs_devices->discardable) { + btrfs_set_and_info(fs_info, DISCARD_ASYNC, + "auto enabling async discard"); + btrfs_clear_opt(fs_info->mount_opt, NODISCARD); + } + /* * Mount does not set all options immediately, we can do it now and do * not have to wait for transaction commit diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3a33dd9847d9..42e0d2fcc407 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -918,12 +918,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, ret = -EINVAL; goto out; } + btrfs_clear_opt(info->mount_opt, NODISCARD); break; case Opt_nodiscard: btrfs_clear_and_info(info, DISCARD_SYNC, "turning off discard"); btrfs_clear_and_info(info, DISCARD_ASYNC, "turning off async discard"); + btrfs_set_opt(info->mount_opt, NODISCARD); break; case Opt_space_cache: case Opt_space_cache_version: diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 635f45f1a2ef..796e9f5ff8f8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -641,6 +641,9 @@ static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices, if (!bdev_nonrot(bdev)) fs_devices->rotating = true; + if (bdev_max_discard_sectors(bdev)) + fs_devices->discardable = true; + device->bdev = bdev; clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); device->mode = flags; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 099def5613b8..a20ee7d57831 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -354,6 +354,8 @@ struct btrfs_fs_devices { * nonrot flag set */ bool rotating; + /* Devices support TRIM/discard commands */ + bool discardable; struct btrfs_fs_info *fs_info; /* sysfs kobjects */ From b307f06d37ca12495e4cfff8d456cd92f045f947 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 18 Oct 2022 16:06:38 +0200 Subject: [PATCH 060/249] btrfs: simplify generation check in btrfs_get_dentry Callers that pass non-zero generation always want to perform the generation check, we can simply encode that in one parameter and drop check_generation. Add function documentation. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/export.c | 23 +++++++++++++++++------ fs/btrfs/export.h | 3 +-- fs/btrfs/ioctl.c | 2 +- 3 files changed, 19 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index fab7eb76e53b..a51a5dfa737c 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -57,9 +57,20 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, return type; } +/* + * Read dentry of inode with @objectid from filesystem root @root_objectid. + * + * @sb: the filesystem super block + * @objectid: inode objectid + * @root_objectid: object id of the subvolume root where to look up the inode + * @generation: optional, if not zero, verify that the found inode + * generation matches + * + * Return dentry alias for the inode, otherwise an error. In case the + * generation does not match return ESTALE. 
+ */ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u64 generation, - int check_generation) + u64 root_objectid, u64 generation) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; @@ -77,7 +88,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, if (IS_ERR(inode)) return ERR_CAST(inode); - if (check_generation && generation != inode->i_generation) { + if (generation != 0 && generation != inode->i_generation) { iput(inode); return ERR_PTR(-ESTALE); } @@ -106,7 +117,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, objectid = fid->parent_objectid; generation = fid->parent_gen; - return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + return btrfs_get_dentry(sb, objectid, root_objectid, generation); } static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, @@ -128,7 +139,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, root_objectid = fid->root_objectid; generation = fid->gen; - return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1); + return btrfs_get_dentry(sb, objectid, root_objectid, generation); } struct dentry *btrfs_get_parent(struct dentry *child) @@ -188,7 +199,7 @@ struct dentry *btrfs_get_parent(struct dentry *child) if (found_key.type == BTRFS_ROOT_BACKREF_KEY) { return btrfs_get_dentry(fs_info->sb, key.objectid, - found_key.offset, 0, 0); + found_key.offset, 0); } return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root)); diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h index 5afb7ca42828..eba6bc4f5a61 100644 --- a/fs/btrfs/export.h +++ b/fs/btrfs/export.h @@ -19,8 +19,7 @@ struct btrfs_fid { } __attribute__ ((packed)); struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, - u64 root_objectid, u64 generation, - int check_generation); + u64 root_objectid, u64 generation); struct dentry *btrfs_get_parent(struct dentry *child); #endif diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5ba2e810dc6e..a482739c1d82 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3274,7 +3274,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, dentry = btrfs_get_dentry(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, - vol_args2->subvolid, 0, 0); + vol_args2->subvolid, 0); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out_drop_write; From 875c627c5f2045e2b5f0ad6692799e8fb931f3cb Mon Sep 17 00:00:00 2001 From: Wang Yugui Date: Wed, 19 Oct 2022 16:10:01 +0800 Subject: [PATCH 061/249] btrfs: send add define for v2 buffer size Add a define for the data buffer size (though the maximum size is not limited by it) BTRFS_SEND_BUF_SIZE_V2 so it's more visible. 
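As a worked example of what the new define evaluates to (assuming the usual constants, which the patch does not restate): with BTRFS_MAX_COMPRESSED at 128 KiB and 4 KiB pages, BTRFS_SEND_BUF_SIZE_V2 = ALIGN(SZ_16K + SZ_128K, PAGE_SIZE) = ALIGN(147456, 4096) = 147456 bytes, i.e. 144 KiB, which is already page aligned: room for one maximally sized compressed extent plus a 16 KiB command header.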
Signed-off-by: Wang Yugui Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 2 +- fs/btrfs/send.h | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 1c4b693ee4a3..80104b5af32f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7901,7 +7901,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) if (sctx->proto >= 2) { u32 send_buf_num_pages; - sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE); + sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2; sctx->send_buf = vmalloc(sctx->send_max_size); if (!sctx->send_buf) { ret = -ENOMEM; diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index f7585cfa7e52..4f5509cb1803 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -18,10 +18,12 @@ #endif /* - * In send stream v1, no command is larger than 64K. In send stream v2, no limit - * should be assumed. + * In send stream v1, no command is larger than 64K. In send stream v2, no + * limit should be assumed, the buffer size is set to be a header with + * compressed extent size. */ #define BTRFS_SEND_BUF_SIZE_V1 SZ_64K +#define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE) struct inode; struct btrfs_ioctl_send_args; From c7f13d428ea1bfe883f2741a9b5a5352d595eb09 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:47 -0400 Subject: [PATCH 062/249] btrfs: move fs wide helpers out of ctree.h We have several fs wide related helpers in ctree.h. The bulk of these are the incompat flag test helpers, but there are things such as btrfs_fs_closing() and the read only helpers that also aren't directly related to the ctree code. Move these into a fs.h header, which will serve as the location for file system wide related helpers. 
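A sketch of what a consumer looks like after the split. The helpers are real (they appear in the fs.h hunk below), while the calling function is hypothetical; note that at this stage fs.h has no includes of its own, so it still relies on ctree.h being included first:

  #include "ctree.h"
  #include "fs.h"

  static void example_fs_wide_checks(struct btrfs_fs_info *fs_info)
  {
          /* Returns 1 once closing has started, 2 once it is done. */
          if (btrfs_fs_closing(fs_info))
                  return;

          if (btrfs_fs_incompat(fs_info, RAID56))
                  pr_info("btrfs example: raid56 feature is set\n");
  }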
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/backref.c | 1 + fs/btrfs/block-group.c | 1 + fs/btrfs/ctree.h | 164 ------------------------------------- fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 1 + fs/btrfs/file-item.c | 1 + fs/btrfs/file.c | 1 + fs/btrfs/free-space-tree.c | 1 + fs/btrfs/fs.c | 92 +++++++++++++++++++++ fs/btrfs/fs.h | 85 +++++++++++++++++++ fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/props.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/scrub.c | 1 + fs/btrfs/space-info.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/tree-checker.c | 1 + fs/btrfs/tree-log.c | 1 + fs/btrfs/uuid-tree.c | 1 + fs/btrfs/verity.c | 1 + fs/btrfs/volumes.c | 1 + fs/btrfs/zoned.c | 1 + 26 files changed, 200 insertions(+), 165 deletions(-) create mode 100644 fs/btrfs/fs.c create mode 100644 fs/btrfs/fs.h diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index fa9ddcc9eb0b..eebb45c06485 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o extent-io-tree.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 232f49415ab9..f76db317c437 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -15,6 +15,7 @@ #include "locking.h" #include "misc.h" #include "tree-mod-log.h" +#include "fs.h" /* Just arbitrary numbers so we can be sure one of these happened. */ #define BACKREF_FOUND_SHARED 6 diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 47461fd21975..bf8d1e110136 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -17,6 +17,7 @@ #include "discard.h" #include "raid56.h" #include "zoned.h" +#include "fs.h" #ifdef CONFIG_BTRFS_DEBUG int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3485f60efa33..1b2a1dcb60dd 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2857,44 +2857,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *node, struct extent_buffer *parent); -static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) -{ - /* - * Do it this way so we only ever do one test_bit in the normal case. - */ - if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { - if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) - return 2; - return 1; - } - return 0; -} - -/* - * If we remount the fs to be R/O or umount the fs, the cleaner needn't do - * anything except sleeping. This function is used to check the status of - * the fs. - * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount, - * since setting and checking for SB_RDONLY in the superblock's flags is not - * atomic. 
- */ -static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) -{ - return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || - btrfs_fs_closing(fs_info); -} - -static inline void btrfs_set_sb_rdonly(struct super_block *sb) -{ - sb->s_flags |= SB_RDONLY; - set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); -} - -static inline void btrfs_clear_sb_rdonly(struct super_block *sb) -{ - sb->s_flags &= ~SB_RDONLY; - clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); -} /* root-item.c */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, @@ -3529,132 +3491,6 @@ do { \ } while (0) -/* compatibility and incompatibility defines */ - -#define btrfs_set_fs_incompat(__fs_info, opt) \ - __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ - #opt) - -static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag, const char* name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_incompat_flags(disk_super); - if (!(features & flag)) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_incompat_flags(disk_super); - if (!(features & flag)) { - features |= flag; - btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, - "setting incompat feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_clear_fs_incompat(__fs_info, opt) \ - __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, \ - #opt) - -static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, - u64 flag, const char* name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_incompat_flags(disk_super); - if (features & flag) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_incompat_flags(disk_super); - if (features & flag) { - features &= ~flag; - btrfs_set_super_incompat_flags(disk_super, features); - btrfs_info(fs_info, - "clearing incompat feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_fs_incompat(fs_info, opt) \ - __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) - -static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) -{ - struct btrfs_super_block *disk_super; - disk_super = fs_info->super_copy; - return !!(btrfs_super_incompat_flags(disk_super) & flag); -} - -#define btrfs_set_fs_compat_ro(__fs_info, opt) \ - __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ - #opt) - -static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag, const char *name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - disk_super = fs_info->super_copy; - features = btrfs_super_compat_ro_flags(disk_super); - if (!(features & flag)) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_compat_ro_flags(disk_super); - if (!(features & flag)) { - features |= flag; - btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, - "setting compat-ro feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_clear_fs_compat_ro(__fs_info, opt) \ - __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, \ - #opt) - -static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, - u64 flag, const char *name) -{ - struct btrfs_super_block *disk_super; - u64 features; - - 
disk_super = fs_info->super_copy; - features = btrfs_super_compat_ro_flags(disk_super); - if (features & flag) { - spin_lock(&fs_info->super_lock); - features = btrfs_super_compat_ro_flags(disk_super); - if (features & flag) { - features &= ~flag; - btrfs_set_super_compat_ro_flags(disk_super, features); - btrfs_info(fs_info, - "clearing compat-ro feature flag for %s (0x%llx)", - name, flag); - } - spin_unlock(&fs_info->super_lock); - } -} - -#define btrfs_fs_compat_ro(fs_info, opt) \ - __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) - -static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) -{ - struct btrfs_super_block *disk_super; - disk_super = fs_info->super_copy; - return !!(btrfs_super_compat_ro_flags(disk_super) & flag); -} - /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index cf59d8f14a05..5e20b191b108 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -43,6 +43,7 @@ #include "space-info.h" #include "zoned.h" #include "subpage.h" +#include "fs.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2801c991814f..bc010dbcb6b1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -36,6 +36,7 @@ #include "rcu-string.h" #include "zoned.h" #include "dev-replace.h" +#include "fs.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 6bb9fa961a6a..824ff54d8155 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -16,6 +16,7 @@ #include "volumes.h" #include "print-tree.h" #include "compression.h" +#include "fs.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 19b41b5fe6c0..a352c7cacc99 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -30,6 +30,7 @@ #include "delalloc-space.h" #include "reflink.h" #include "subpage.h" +#include "fs.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 367bcfcf68f5..bfc21eb8ec63 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -11,6 +11,7 @@ #include "free-space-tree.h" #include "transaction.h" #include "block-group.h" +#include "fs.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c new file mode 100644 index 000000000000..d4ba948eba56 --- /dev/null +++ b/fs/btrfs/fs.c @@ -0,0 +1,92 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "ctree.h" +#include "fs.h" + +void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, + "setting incompat feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 
features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, + "clearing incompat feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, + "setting compat-ro feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, + "clearing compat-ro feature flag for %s (0x%llx)", + name, flag); + } + spin_unlock(&fs_info->super_lock); + } +} diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h new file mode 100644 index 000000000000..8eda9ce0a904 --- /dev/null +++ b/fs/btrfs/fs.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FS_H +#define BTRFS_FS_H + +/* Compatibility and incompatibility defines */ +void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); +void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, + const char *name); + +#define btrfs_set_fs_incompat(__fs_info, opt) \ + __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) + +#define btrfs_clear_fs_incompat(__fs_info, opt) \ + __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) + +#define btrfs_fs_incompat(fs_info, opt) \ + __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +#define btrfs_set_fs_compat_ro(__fs_info, opt) \ + __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt) + +#define btrfs_clear_fs_compat_ro(__fs_info, opt) \ + __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt) + +#define btrfs_fs_compat_ro(fs_info, opt) \ + __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) +{ + struct btrfs_super_block *disk_super; + disk_super = fs_info->super_copy; + return !!(btrfs_super_incompat_flags(disk_super) & flag); +} + +static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) +{ + struct btrfs_super_block *disk_super; + disk_super = fs_info->super_copy; + return !!(btrfs_super_compat_ro_flags(disk_super) & flag); +} + +static inline int btrfs_fs_closing(struct 
btrfs_fs_info *fs_info) +{ + /* Do it this way so we only ever do one test_bit in the normal case. */ + if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) { + if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags)) + return 2; + return 1; + } + return 0; +} + +/* + * If we remount the fs to be R/O or umount the fs, the cleaner needn't do + * anything except sleeping. This function is used to check the status of + * the fs. + * We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount, + * since setting and checking for SB_RDONLY in the superblock's flags is not + * atomic. + */ +static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) +{ + return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) || + btrfs_fs_closing(fs_info); +} + +static inline void btrfs_set_sb_rdonly(struct super_block *sb) +{ + sb->s_flags |= SB_RDONLY; + set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + +static inline void btrfs_clear_sb_rdonly(struct super_block *sb) +{ + sb->s_flags &= ~SB_RDONLY; + clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + +#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 50584b93a66f..60b3162c035d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -55,6 +55,7 @@ #include "zoned.h" #include "subpage.h" #include "inode-item.h" +#include "fs.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a482739c1d82..564a4ae9c285 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -50,6 +50,7 @@ #include "delalloc-space.h" #include "block-group.h" #include "subpage.h" +#include "fs.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index e04289347775..d2c699c9aa51 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -11,6 +11,7 @@ #include "xattr.h" #include "compression.h" #include "space-info.h" +#include "fs.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index b74105a10f16..e87a2f066f4d 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -24,6 +24,7 @@ #include "block-group.h" #include "sysfs.h" #include "tree-mod-log.h" +#include "fs.h" /* * Helpers to access qgroup reservation diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 216a4485d914..977afbb4cd45 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -28,6 +28,7 @@ #include "zoned.h" #include "inode-item.h" #include "space-info.h" +#include "fs.h" /* * Relocation overview diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index cdda4b2f20f2..6871c1c54f8c 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -21,6 +21,7 @@ #include "raid56.h" #include "block-group.h" #include "zoned.h" +#include "fs.h" /* * This is only the first step towards a full-features scrub. 
It reads all diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index af2e133aaa5c..af1538c2685d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -10,6 +10,7 @@ #include "transaction.h" #include "block-group.h" #include "zoned.h" +#include "fs.h" /* * HOW DOES SPACE RESERVATION WORK diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 42e0d2fcc407..ef646a1d9bc2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -49,6 +49,7 @@ #include "discard.h" #include "qgroup.h" #include "raid56.h" +#include "fs.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index ae7d4aca771d..bae77fb05e2b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -23,6 +23,7 @@ #include "block-group.h" #include "space-info.h" #include "zoned.h" +#include "fs.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 43f905ab0a18..862d67798de5 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -25,6 +25,7 @@ #include "volumes.h" #include "misc.h" #include "btrfs_inode.h" +#include "fs.h" /* * Error message should follow the following format: diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c3cf3dabe0b1..e294c38f9b19 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -21,6 +21,7 @@ #include "space-info.h" #include "zoned.h" #include "inode-item.h" +#include "fs.h" #define MAX_CONFLICT_INODES 10 diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index b458452a1aaf..2d7eb290fb9c 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -9,6 +9,7 @@ #include "transaction.h" #include "disk-io.h" #include "print-tree.h" +#include "fs.h" static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index ee00e33c309e..ab0b39badbbe 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -15,6 +15,7 @@ #include "transaction.h" #include "disk-io.h" #include "locking.h" +#include "fs.h" /* * Implementation of the interface defined in struct fsverity_operations. diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 796e9f5ff8f8..d65d5d7835fe 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -33,6 +33,7 @@ #include "block-group.h" #include "discard.h" #include "zoned.h" +#include "fs.h" static struct bio_set btrfs_bioset; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index c9e2b0c85309..2a7d856c232c 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -15,6 +15,7 @@ #include "transaction.h" #include "dev-replace.h" #include "space-info.h" +#include "fs.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 From e118578a8df7941a9bbc568851997852e5bc7338 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:48 -0400 Subject: [PATCH 063/249] btrfs: move assert helpers out of ctree.h These call functions that aren't defined in, or will be moved out of, ctree.h Move them to super.c where the other assert/error message code is defined. 
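One detail of the ASSERT() definitions being moved is worth spelling out: the assertions-off fallback is (void)(expr) rather than an empty statement, so the expression is still evaluated and variables referenced only inside assertions do not trigger unused-variable warnings in non-debug builds. A hypothetical caller:

  static void example(struct extent_buffer *eb)
  {
          const int refs = atomic_read(&eb->refs);

          /*
           * With CONFIG_BTRFS_ASSERT unset this still "uses" @refs; with
           * it set, a failure calls btrfs_assertfail() and BUGs.
           */
          ASSERT(refs > 0);
  }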
Drop the __noreturn attribute for btrfs_assertfail as objtool does not like it and fails with warnings like fs/btrfs/dir-item.o: warning: objtool: .text.unlikely: unexpected end of section fs/btrfs/xattr.o: warning: objtool: btrfs_setxattr() falls through to next function btrfs_setxattr_trans.cold() fs/btrfs/xattr.o: warning: objtool: .text.unlikely: unexpected end of section Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 18 +++--------------- fs/btrfs/super.c | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1b2a1dcb60dd..e5e4079c92d4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3333,18 +3333,11 @@ do { \ } while (0) #ifdef CONFIG_BTRFS_ASSERT -__cold __noreturn -static inline void assertfail(const char *expr, const char *file, int line) -{ - pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); - BUG(); -} +void __cold btrfs_assertfail(const char *expr, const char *file, int line); #define ASSERT(expr) \ - (likely(expr) ? (void)0 : assertfail(#expr, __FILE__, __LINE__)) - + (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) #else -static inline void assertfail(const char *expr, const char* file, int line) { } #define ASSERT(expr) (void)(expr) #endif @@ -3404,12 +3397,7 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -__cold -static inline void btrfs_print_v0_err(struct btrfs_fs_info *fs_info) -{ - btrfs_err(fs_info, -"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); -} +void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info); __printf(5, 6) __cold diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ef646a1d9bc2..06661bcb991b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -305,6 +305,20 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, } #endif +#ifdef CONFIG_BTRFS_ASSERT +void __cold btrfs_assertfail(const char *expr, const char *file, int line) +{ + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); + BUG(); +} +#endif + +void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) +{ + btrfs_err(fs_info, +"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); +} + #if BITS_PER_LONG == 32 void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) { From 9b569ea0be6fb27a4985acc9325896a3edc95ede Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:49 -0400 Subject: [PATCH 064/249] btrfs: move the printk helpers out of ctree.h We have a bunch of printk helpers that are in ctree.h. These have nothing to do with ctree.c, so move them into their own header. Subsequent patches will cleanup the printk helpers. 
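After the move, a minimal consumer of the new header might look like this (hypothetical function; the macros and their (fs_info, fmt, args...) signatures are exactly the ones relocated in the diff below):

  #include "messages.h"

  static void report_example(struct btrfs_fs_info *fs_info, u64 logical)
  {
          /* Plain and ratelimited variants both come from messages.h now. */
          btrfs_err(fs_info, "bad extent at logical %llu", logical);
          btrfs_info_rl(fs_info, "extent %llu details are ratelimited", logical);
  }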
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.h | 1 + fs/btrfs/check-integrity.c | 1 + fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 249 ---------------------------------- fs/btrfs/delalloc-space.c | 1 + fs/btrfs/delayed-inode.c | 1 + fs/btrfs/delayed-ref.c | 1 + fs/btrfs/dir-item.c | 1 + fs/btrfs/extent-io-tree.c | 1 + fs/btrfs/extent_map.c | 1 + fs/btrfs/file-item.c | 1 + fs/btrfs/free-space-cache.c | 1 + fs/btrfs/free-space-tree.c | 1 + fs/btrfs/fs.c | 1 + fs/btrfs/inode-item.c | 1 + fs/btrfs/lzo.c | 1 + fs/btrfs/messages.h | 259 ++++++++++++++++++++++++++++++++++++ fs/btrfs/ordered-data.c | 1 + fs/btrfs/print-tree.c | 1 + fs/btrfs/props.c | 1 + fs/btrfs/raid56.c | 1 + fs/btrfs/ref-verify.c | 1 + fs/btrfs/reflink.c | 1 + fs/btrfs/root-tree.c | 1 + fs/btrfs/struct-funcs.c | 1 + fs/btrfs/subpage.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/sysfs.c | 2 +- fs/btrfs/tree-checker.c | 1 + fs/btrfs/tree-log.h | 1 + fs/btrfs/tree-mod-log.c | 1 + fs/btrfs/ulist.c | 1 + fs/btrfs/uuid-tree.c | 1 + fs/btrfs/verity.c | 1 + fs/btrfs/xattr.c | 1 + fs/btrfs/zoned.h | 1 + 36 files changed, 293 insertions(+), 250 deletions(-) create mode 100644 fs/btrfs/messages.h diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 6dac462430b0..fa3c9cbf9ae7 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -7,6 +7,7 @@ #define BTRFS_BACKREF_H #include +#include "messages.h" #include "ulist.h" #include "disk-io.h" #include "extent_io.h" diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 98c6e5feab19..e8e1a92b30ac 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -82,6 +82,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d645933ef135..f423b86f5262 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -8,6 +8,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e5e4079c92d4..d27d059e4d59 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3168,179 +3168,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); -static inline __printf(2, 3) __cold -void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) -{ -} - -#ifdef CONFIG_PRINTK_INDEX - -#define btrfs_printk(fs_info, fmt, args...) \ -do { \ - printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ - _btrfs_printk(fs_info, fmt, ##args); \ -} while (0) - -__printf(2, 3) -__cold -void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); - -#elif defined(CONFIG_PRINTK) - -#define btrfs_printk(fs_info, fmt, args...) \ - _btrfs_printk(fs_info, fmt, ##args) - -__printf(2, 3) -__cold -void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); - -#else - -#define btrfs_printk(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, fmt, ##args) -#endif - -#define btrfs_emerg(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err(fs_info, fmt, args...) 
\ - btrfs_printk(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info(fs_info, fmt, args...) \ - btrfs_printk(fs_info, KERN_INFO fmt, ##args) - -/* - * Wrappers that use printk_in_rcu - */ -#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) - -/* - * Wrappers that use a ratelimited printk_in_rcu - */ -#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) - -/* - * Wrappers that use a ratelimited printk - */ -#define btrfs_emerg_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) -#define btrfs_alert_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) -#define btrfs_crit_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) -#define btrfs_err_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) -#define btrfs_warn_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) -#define btrfs_notice_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) -#define btrfs_info_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) - -#if defined(CONFIG_DYNAMIC_DEBUG) -#define btrfs_debug(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ - fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ - fs_info, KERN_DEBUG fmt, ##args) -#elif defined(DEBUG) -#define btrfs_debug(fs_info, fmt, args...) 
\ - btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) -#else -#define btrfs_debug(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ - btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) -#define btrfs_debug_rl(fs_info, fmt, args...) \ - btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) -#endif - -#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_printk(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_no_printk(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ -do { \ - static DEFINE_RATELIMIT_STATE(_rs, \ - DEFAULT_RATELIMIT_INTERVAL, \ - DEFAULT_RATELIMIT_BURST); \ - if (__ratelimit(&_rs)) \ - btrfs_printk(fs_info, fmt, ##args); \ -} while (0) - -#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ -do { \ - rcu_read_lock(); \ - btrfs_printk_ratelimited(fs_info, fmt, ##args); \ - rcu_read_unlock(); \ -} while (0) - -#ifdef CONFIG_BTRFS_ASSERT -void __cold btrfs_assertfail(const char *expr, const char *file, int line); - -#define ASSERT(expr) \ - (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) -#else -#define ASSERT(expr) (void)(expr) -#endif - #if BITS_PER_LONG == 32 #define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) /* @@ -3397,88 +3224,12 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info); - -__printf(5, 6) -__cold -void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); - -const char * __attribute_const__ btrfs_decode_error(int errno); - -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit); - -bool __cold abort_should_print_stack(int errno); - -/* - * Call btrfs_abort_transaction as early as possible when an error condition is - * detected, that way the exact stack trace is reported for some errors. - */ -#define btrfs_abort_transaction(trans, errno) \ -do { \ - bool first = false; \ - /* Report first abort since mount */ \ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ - &((trans)->fs_info->fs_state))) { \ - first = true; \ - if (WARN(abort_should_print_stack(errno), \ - KERN_DEBUG \ - "BTRFS: Transaction aborted (error %d)\n", \ - (errno))) { \ - /* Stack trace printed. */ \ - } else { \ - btrfs_debug((trans)->fs_info, \ - "Transaction aborted (error %d)", \ - (errno)); \ - } \ - } \ - __btrfs_abort_transaction((trans), __func__, \ - __LINE__, (errno), first); \ -} while (0) - -#ifdef CONFIG_PRINTK_INDEX - -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) 
\ -do { \ - printk_index_subsys_emit( \ - "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ - KERN_CRIT, fmt); \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ -} while (0) - -#else - -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args) - -#endif - #define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ &(fs_info)->fs_state))) #define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ &(fs_info)->fs_state))) -__printf(5, 6) -__cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...); -/* - * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic - * will panic(). Otherwise we BUG() here. - */ -#define btrfs_panic(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ - BUG(); \ -} while (0) - - /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 118b2e20b2e1..045545145a2b 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include "messages.h" #include "ctree.h" #include "delalloc-space.h" #include "block-rsv.h" diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index a411f04a7b97..8cf5ee646147 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -6,6 +6,7 @@ #include #include +#include "messages.h" #include "misc.h" #include "delayed-inode.h" #include "disk-io.h" diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 36a3debe9493..c775ff4f1cb1 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -6,6 +6,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "delayed-ref.h" #include "transaction.h" diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 72fb2c518a2b..be5c1c2a8da5 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -3,6 +3,7 @@ * Copyright (C) 2007 Oracle. All rights reserved. 
*/ +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 599db7b15574..2cdff74ff033 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -2,6 +2,7 @@ #include #include +#include "messages.h" #include "ctree.h" #include "extent-io-tree.h" #include "btrfs_inode.h" diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 715979807ae1..f97508afb659 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -3,6 +3,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "volumes.h" #include "extent_map.h" diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 824ff54d8155..675987e2d652 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -9,6 +9,7 @@ #include #include #include +#include "messages.h" #include "misc.h" #include "ctree.h" #include "disk-io.h" diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index bd68fafcccc7..83d866f5ab6c 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,6 +11,7 @@ #include #include #include +#include "messages.h" #include "misc.h" #include "ctree.h" #include "free-space-cache.h" diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index bfc21eb8ec63..026214d74a02 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -5,6 +5,7 @@ #include #include +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "locking.h" diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index d4ba948eba56..a59504b59435 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include "messages.h" #include "ctree.h" #include "fs.h" diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 366f3a788c6a..b301d8e3df87 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -3,6 +3,7 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ +#include "messages.h" #include "ctree.h" #include "inode-item.h" #include "disk-io.h" diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 89bc5f825e0a..6751874a3e69 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -13,6 +13,7 @@ #include #include #include +#include "messages.h" #include "compression.h" #include "ctree.h" diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h new file mode 100644 index 000000000000..ace5bb02820a --- /dev/null +++ b/fs/btrfs/messages.h @@ -0,0 +1,259 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_MESSAGES_H +#define BTRFS_MESSAGES_H + +#include + +struct btrfs_fs_info; +struct btrfs_trans_handle; + +static inline __printf(2, 3) __cold +void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ +} + +#ifdef CONFIG_PRINTK_INDEX + +#define btrfs_printk(fs_info, fmt, args...) \ +do { \ + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ + _btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + +__printf(2, 3) +__cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + +#elif defined(CONFIG_PRINTK) + +#define btrfs_printk(fs_info, fmt, args...) \ + _btrfs_printk(fs_info, fmt, ##args) + +__printf(2, 3) +__cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + +#else + +#define btrfs_printk(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, fmt, ##args) +#endif + +#define btrfs_emerg(fs_info, fmt, args...) 
\ + btrfs_printk(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use printk_in_rcu + */ +#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk_in_rcu + */ +#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args) + +/* + * Wrappers that use a ratelimited printk + */ +#define btrfs_emerg_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args) +#define btrfs_alert_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args) +#define btrfs_crit_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args) +#define btrfs_err_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args) +#define btrfs_warn_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args) +#define btrfs_notice_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args) +#define btrfs_info_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args) + +#if defined(CONFIG_DYNAMIC_DEBUG) +#define btrfs_debug(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) 
\ + _dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \ + fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + _dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \ + fs_info, KERN_DEBUG fmt, ##args) +#elif defined(DEBUG) +#define btrfs_debug(fs_info, fmt, args...) \ + btrfs_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args) +#else +#define btrfs_debug(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_in_rcu(fs_info, fmt, args...) \ + btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \ + btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args) +#define btrfs_debug_rl(fs_info, fmt, args...) \ + btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args) +#endif + +#define btrfs_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_no_printk(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#define btrfs_printk_ratelimited(fs_info, fmt, args...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + if (__ratelimit(&_rs)) \ + btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + +#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \ +do { \ + rcu_read_lock(); \ + btrfs_printk_ratelimited(fs_info, fmt, ##args); \ + rcu_read_unlock(); \ +} while (0) + +#ifdef CONFIG_BTRFS_ASSERT +void __cold btrfs_assertfail(const char *expr, const char *file, int line); + +#define ASSERT(expr) \ + (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) +#else +#define ASSERT(expr) (void)(expr) +#endif + +void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info); + +__printf(5, 6) +__cold +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); + +const char * __attribute_const__ btrfs_decode_error(int errno); + +__cold +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit); + +bool __cold abort_should_print_stack(int errno); + +/* + * Call btrfs_abort_transaction as early as possible when an error condition is + * detected, that way the exact stack trace is reported for some errors. + */ +#define btrfs_abort_transaction(trans, errno) \ +do { \ + bool first = false; \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((trans)->fs_info->fs_state))) { \ + first = true; \ + if (WARN(abort_should_print_stack(errno), \ + KERN_DEBUG \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno))) { \ + /* Stack trace printed. */ \ + } else { \ + btrfs_debug((trans)->fs_info, \ + "Transaction aborted (error %d)", \ + (errno)); \ + } \ + } \ + __btrfs_abort_transaction((trans), __func__, \ + __LINE__, (errno), first); \ +} while (0) + +#ifdef CONFIG_PRINTK_INDEX + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) 
\ +do { \ + printk_index_subsys_emit( \ + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ + KERN_CRIT, fmt); \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ +} while (0) + +#else + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args) + +#endif + +__printf(5, 6) +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...); +/* + * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic + * will panic(). Otherwise we BUG() here. + */ +#define btrfs_panic(fs_info, errno, fmt, args...) \ +do { \ + __btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \ + BUG(); \ +} while (0) + +#endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index de2b716d3e7b..1cbaacdc50da 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -7,6 +7,7 @@ #include #include #include +#include "messages.h" #include "misc.h" #include "ctree.h" #include "transaction.h" diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index dd8777872143..708facaede2c 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -3,6 +3,7 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "print-tree.h" diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index d2c699c9aa51..6d88bf0a9b17 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -4,6 +4,7 @@ */ #include +#include "messages.h" #include "props.h" #include "btrfs_inode.h" #include "transaction.h" diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index c009c0a2081e..349270537c2e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -13,6 +13,7 @@ #include #include #include +#include "messages.h" #include "misc.h" #include "ctree.h" #include "disk-io.h" diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index a248f46cfe72..f7535b8b62f5 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -5,6 +5,7 @@ #include #include +#include "messages.h" #include "ctree.h" #include "disk-io.h" #include "locking.h" diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f50586ff85c8..6179864de6e7 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -2,6 +2,7 @@ #include #include +#include "messages.h" #include "compression.h" #include "ctree.h" #include "delalloc-space.h" diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index e1f599d7a916..44c8c8ad0a16 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -5,6 +5,7 @@ #include #include +#include "messages.h" #include "ctree.h" #include "transaction.h" #include "disk-io.h" diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 12455b2b41de..6ba16c018d7f 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -5,6 +5,7 @@ #include +#include "messages.h" #include "ctree.h" static bool check_setget_bounds(const struct extent_buffer *eb, diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 9a176af847d7..dd46b978ac2c 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include "messages.h" #include "ctree.h" #include "subpage.h" #include "btrfs_inode.h" diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 06661bcb991b..7debbcfc5d47 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -26,6 +26,7 @@ #include #include #include +#include "messages.h" 
#include "delayed-inode.h" #include "ctree.h" #include "disk-io.h" diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 910702df7767..0b6bea00742c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -10,7 +10,7 @@ #include #include #include - +#include "messages.h" #include "ctree.h" #include "discard.h" #include "disk-io.h" diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 862d67798de5..fa9536110d69 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -18,6 +18,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "tree-checker.h" #include "disk-io.h" diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index aed1e05e9879..f5770829d075 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -6,6 +6,7 @@ #ifndef BTRFS_TREE_LOG_H #define BTRFS_TREE_LOG_H +#include "messages.h" #include "ctree.h" #include "transaction.h" diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 18b4699bcb11..41e5d9f789dc 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 +#include "messages.h" #include "tree-mod-log.h" #include "disk-io.h" diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 3374c9e9be67..f2f20c8d84aa 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -5,6 +5,7 @@ */ #include +#include "messages.h" #include "ulist.h" #include "ctree.h" diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 2d7eb290fb9c..190f752a2e10 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -5,6 +5,7 @@ #include #include +#include "messages.h" #include "ctree.h" #include "transaction.h" #include "disk-io.h" diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index ab0b39badbbe..35445855df4d 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -10,6 +10,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 5bb8d8c86311..d12903f01f83 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -12,6 +12,7 @@ #include #include #include +#include "messages.h" #include "ctree.h" #include "btrfs_inode.h" #include "transaction.h" diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 8bd16d40b7c6..f43990985d80 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -5,6 +5,7 @@ #include #include +#include "messages.h" #include "volumes.h" #include "disk-io.h" #include "block-group.h" From bbde07a40a13cc5a3483eaeb52e9bb3b61d2cf57 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:50 -0400 Subject: [PATCH 065/249] btrfs: push printk index code into their respective helpers The printk index work can be pushed into the printk helpers themselves, this allows us to further sanitize messages.h, removing the last include in the header itself. Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/messages.h | 29 +---------------------------- fs/btrfs/super.c | 10 ++++++++++ 2 files changed, 11 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index ace5bb02820a..3772358f8d30 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -13,19 +13,7 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } -#ifdef CONFIG_PRINTK_INDEX - -#define btrfs_printk(fs_info, fmt, args...) 
\ -do { \ - printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ - _btrfs_printk(fs_info, fmt, ##args); \ -} while (0) - -__printf(2, 3) -__cold -void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); - -#elif defined(CONFIG_PRINTK) +#ifdef CONFIG_PRINTK #define btrfs_printk(fs_info, fmt, args...) \ _btrfs_printk(fs_info, fmt, ##args) @@ -223,25 +211,10 @@ do { \ __LINE__, (errno), first); \ } while (0) -#ifdef CONFIG_PRINTK_INDEX - -#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ -do { \ - printk_index_subsys_emit( \ - "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ - KERN_CRIT, fmt); \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ -} while (0) - -#else - #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args) -#endif - __printf(5, 6) __cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 7debbcfc5d47..3aeb2106e119 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -181,6 +181,12 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function const char *errstr; #endif +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit( + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", + KERN_CRIT, fmt); +#endif + /* * Special case: if the error is EROFS, and we're already * under SB_RDONLY, then it is safe here. @@ -273,6 +279,10 @@ void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, const char *type = logtypes[4]; struct ratelimit_state *ratelimit = &printk_limits[4]; +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); +#endif + va_start(args, fmt); while ((kern_level = printk_get_level(fmt)) != 0) { From ec8eb376e271ed2b8724bf488f4c74d2746d5446 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:51 -0400 Subject: [PATCH 066/249] btrfs: move BTRFS_FS_STATE* definitions and helpers to fs.h We're going to use fs.h to hold fs wide related helpers and definitions, move the FS_STATE enum and related helpers to fs.h, and then update all files that need these definitions to include fs.h. 
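For context, a minimal hedged sketch of how these bits are consumed once they
live in fs.h: fs_state is a plain unsigned long bitmap in struct btrfs_fs_info
and the BTRFS_FS_STATE_* values are bit indexes, so callers go through the
standard atomic bitops, the same pattern as btrfs_is_testing() and
BTRFS_FS_ERROR() in the hunks below. The helper name here is hypothetical:

static bool example_fs_dev_replacing(struct btrfs_fs_info *fs_info)
{
	/* Bit test on the fs_state bitmap, as BTRFS_FS_ERROR() does. */
	return test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
}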
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 1 + fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 46 --------------------------------- fs/btrfs/delalloc-space.c | 1 + fs/btrfs/delayed-inode.c | 3 ++- fs/btrfs/dev-replace.c | 1 + fs/btrfs/extent_io.c | 1 + fs/btrfs/free-space-cache.c | 3 ++- fs/btrfs/fs.h | 49 ++++++++++++++++++++++++++++++++++++ fs/btrfs/inode-item.c | 3 ++- fs/btrfs/reflink.c | 3 ++- fs/btrfs/root-tree.c | 3 ++- fs/btrfs/sysfs.c | 1 + fs/btrfs/tests/btrfs-tests.c | 1 + fs/btrfs/xattr.c | 3 ++- 15 files changed, 68 insertions(+), 52 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 65a4e5087215..c0615af0434f 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -23,6 +23,7 @@ #include #include "misc.h" #include "ctree.h" +#include "fs.h" #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f423b86f5262..7ecb658500ce 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -18,6 +18,7 @@ #include "qgroup.h" #include "tree-mod-log.h" #include "tree-checker.h" +#include "fs.h" static struct kmem_cache *btrfs_path_cachep; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index d27d059e4d59..6f1eefbe3691 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -67,37 +67,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) sizeof(struct btrfs_stripe) * (num_stripes - 1); } -/* - * Runtime (in-memory) states of filesystem - */ -enum { - /* Global indicator of serious filesystem errors */ - BTRFS_FS_STATE_ERROR, - /* - * Filesystem is being remounted, allow to skip some operations, like - * defrag - */ - BTRFS_FS_STATE_REMOUNTING, - /* Filesystem in RO mode */ - BTRFS_FS_STATE_RO, - /* Track if a transaction abort has been reported on this filesystem */ - BTRFS_FS_STATE_TRANS_ABORTED, - /* - * Bio operations should be blocked on this filesystem because a source - * or target device is being destroyed as part of a device replace - */ - BTRFS_FS_STATE_DEV_REPLACING, - /* The btrfs_fs_info created for self-tests */ - BTRFS_FS_STATE_DUMMY_FS_INFO, - - BTRFS_FS_STATE_NO_CSUMS, - - /* Indicates there was an error cleaning up a log tree. 
*/ - BTRFS_FS_STATE_LOG_CLEANUP_ERROR, - - BTRFS_FS_STATE_COUNT -}; - #define BTRFS_SUPER_INFO_OFFSET SZ_64K #define BTRFS_SUPER_INFO_SIZE 4096 static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); @@ -3224,12 +3193,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ - &(fs_info)->fs_state))) -#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ - (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ - &(fs_info)->fs_state))) - /* acl.c */ #ifdef CONFIG_BTRFS_FS_POSIX_ACL struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); @@ -3327,15 +3290,6 @@ static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf, /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); -static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) -{ - return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); -} -#else -static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) -{ - return 0; -} #endif static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 045545145a2b..605d8874a446 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -9,6 +9,7 @@ #include "transaction.h" #include "qgroup.h" #include "block-group.h" +#include "fs.h" /* * HOW DOES THIS WORK diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 8cf5ee646147..2f68570fbb53 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -6,12 +6,13 @@ #include #include +#include "ctree.h" +#include "fs.h" #include "messages.h" #include "misc.h" #include "delayed-inode.h" #include "disk-io.h" #include "transaction.h" -#include "ctree.h" #include "qgroup.h" #include "locking.h" #include "inode-item.h" diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 61e58066b5fd..348aef453e69 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -23,6 +23,7 @@ #include "sysfs.h" #include "zoned.h" #include "block-group.h" +#include "fs.h" /* * Device replace overview diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 78d7ea10621d..aa2d60f683bb 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -30,6 +30,7 @@ #include "zoned.h" #include "block-group.h" #include "compression.h" +#include "fs.h" static struct kmem_cache *extent_buffer_cache; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 83d866f5ab6c..703902156f97 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,9 +11,10 @@ #include #include #include +#include "ctree.h" +#include "fs.h" #include "messages.h" #include "misc.h" -#include "ctree.h" #include "free-space-cache.h" #include "transaction.h" #include "disk-io.h" diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 8eda9ce0a904..25487af14717 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,37 @@ #ifndef BTRFS_FS_H #define BTRFS_FS_H +/* + * Runtime (in-memory) states of filesystem + */ +enum { + /* Global indicator of serious filesystem errors */ + BTRFS_FS_STATE_ERROR, + /* + * Filesystem is being remounted, allow to skip some operations, like + * defrag + */ + BTRFS_FS_STATE_REMOUNTING, + /* Filesystem in RO mode */ + BTRFS_FS_STATE_RO, + /* Track if a transaction abort has been reported on this filesystem */ + 
BTRFS_FS_STATE_TRANS_ABORTED, + /* + * Bio operations should be blocked on this filesystem because a source + * or target device is being destroyed as part of a device replace + */ + BTRFS_FS_STATE_DEV_REPLACING, + /* The btrfs_fs_info created for self-tests */ + BTRFS_FS_STATE_DUMMY_FS_INFO, + + BTRFS_FS_STATE_NO_CSUMS, + + /* Indicates there was an error cleaning up a log tree. */ + BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + + BTRFS_FS_STATE_COUNT +}; + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); @@ -82,4 +113,22 @@ static inline void btrfs_clear_sb_rdonly(struct super_block *sb) clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); } +#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ + &(fs_info)->fs_state))) +#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ + (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ + &(fs_info)->fs_state))) + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +{ + return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); +} +#else +static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) +{ + return 0; +} +#endif + #endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index b301d8e3df87..25e9f1d65067 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -3,8 +3,9 @@ * Copyright (C) 2007 Oracle. All rights reserved. */ -#include "messages.h" #include "ctree.h" +#include "fs.h" +#include "messages.h" #include "inode-item.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 6179864de6e7..daf65bfad30e 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -2,9 +2,10 @@ #include #include +#include "ctree.h" +#include "fs.h" #include "messages.h" #include "compression.h" -#include "ctree.h" #include "delalloc-space.h" #include "disk-io.h" #include "reflink.h" diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 44c8c8ad0a16..112b4bf3c3b8 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -5,8 +5,9 @@ #include #include -#include "messages.h" #include "ctree.h" +#include "fs.h" +#include "messages.h" #include "transaction.h" #include "disk-io.h" #include "print-tree.h" diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 0b6bea00742c..31fb6cb389b1 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -22,6 +22,7 @@ #include "block-group.h" #include "qgroup.h" #include "misc.h" +#include "fs.h" /* * Structure name Path diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index d43cb5242fec..669fa7133a2f 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -16,6 +16,7 @@ #include "../disk-io.h" #include "../qgroup.h" #include "../block-group.h" +#include "../fs.h" static struct vfsmount *test_mnt = NULL; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index d12903f01f83..b26c869f0226 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -12,8 +12,9 @@ #include #include #include -#include "messages.h" #include "ctree.h" +#include "fs.h" +#include "messages.h" #include "btrfs_inode.h" #include "transaction.h" #include "xattr.h" From 0d3a9cf8c3068136145b90841778e3f1ac13f22a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:52 -0400 Subject: [PATCH 067/249] btrfs: convert incompat and compat flag test helpers to macros These helpers use functions not defined in fs.h, they're simply accessors of the super 
block in fs_info, convert them to macros so that we don't have a weird dependency between fs.h and accessors.h. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.h | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 25487af14717..682542ed964f 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -44,6 +44,12 @@ void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, const char *name); +#define __btrfs_fs_incompat(fs_info, flags) \ + (!!(btrfs_super_incompat_flags((fs_info)->super_copy) & (flags))) + +#define __btrfs_fs_compat_ro(fs_info, flags) \ + (!!(btrfs_super_compat_ro_flags((fs_info)->super_copy) & (flags))) + #define btrfs_set_fs_incompat(__fs_info, opt) \ __btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt) @@ -62,20 +68,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, #define btrfs_fs_compat_ro(fs_info, opt) \ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) -static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) -{ - struct btrfs_super_block *disk_super; - disk_super = fs_info->super_copy; - return !!(btrfs_super_incompat_flags(disk_super) & flag); -} - -static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) -{ - struct btrfs_super_block *disk_super; - disk_super = fs_info->super_copy; - return !!(btrfs_super_compat_ro_flags(disk_super) & flag); -} - static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) { /* Do it this way so we only ever do one test_bit in the normal case. */ From fc97a410bd781d00634194bfcddfd2be4185d954 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:53 -0400 Subject: [PATCH 068/249] btrfs: move mount option definitions to fs.h These are fs wide definitions and helpers, move them out of ctree.h and into fs.h. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 1 + fs/btrfs/ctree.h | 63 ------------------------------------------ fs/btrfs/delayed-ref.c | 1 + fs/btrfs/discard.c | 1 + fs/btrfs/fs.h | 63 ++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ref-verify.c | 1 + 6 files changed, 67 insertions(+), 63 deletions(-) diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 89e3e7d1bff6..a48fa7ac74cc 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -7,6 +7,7 @@ #include "transaction.h" #include "block-group.h" #include "disk-io.h" +#include "fs.h" /* * HOW DO BLOCK RESERVES WORK diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6f1eefbe3691..86af9d36fdf7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1328,69 +1328,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -/* - * Flags for mount options. 
- * - * Note: don't forget to add new options to btrfs_show_options() - */ -enum { - BTRFS_MOUNT_NODATASUM = (1UL << 0), - BTRFS_MOUNT_NODATACOW = (1UL << 1), - BTRFS_MOUNT_NOBARRIER = (1UL << 2), - BTRFS_MOUNT_SSD = (1UL << 3), - BTRFS_MOUNT_DEGRADED = (1UL << 4), - BTRFS_MOUNT_COMPRESS = (1UL << 5), - BTRFS_MOUNT_NOTREELOG = (1UL << 6), - BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), - BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), - BTRFS_MOUNT_NOSSD = (1UL << 9), - BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), - BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), - BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), - BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), - BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), - BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), - BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), - BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), - BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), - BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), - BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), - BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), - BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), - BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), - BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), - BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), - BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), - BTRFS_MOUNT_REF_VERIFY = (1UL << 27), - BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), - BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), - BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), - BTRFS_MOUNT_NODISCARD = (1UL << 31), -}; - -#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) -#define BTRFS_DEFAULT_MAX_INLINE (2048) - -#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) -#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) -#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) -#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ - BTRFS_MOUNT_##opt) - -#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (!btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_set_opt(fs_info->mount_opt, opt); \ -} while (0) - -#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_clear_opt(fs_info->mount_opt, opt); \ -} while (0) - /* * Requests for changes that need to be done during transaction commit. * diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index c775ff4f1cb1..010cc16297d8 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -13,6 +13,7 @@ #include "qgroup.h" #include "space-info.h" #include "tree-mod-log.h" +#include "fs.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index e1b7bd927d69..51f0ef386046 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -11,6 +11,7 @@ #include "block-group.h" #include "discard.h" #include "free-space-cache.h" +#include "fs.h" /* * This contains the logic to handle async discard. diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 682542ed964f..4799812a7351 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -34,6 +34,49 @@ enum { BTRFS_FS_STATE_COUNT }; +/* + * Flags for mount options. 
+ * + * Note: don't forget to add new options to btrfs_show_options() + */ +enum { + BTRFS_MOUNT_NODATASUM = (1UL << 0), + BTRFS_MOUNT_NODATACOW = (1UL << 1), + BTRFS_MOUNT_NOBARRIER = (1UL << 2), + BTRFS_MOUNT_SSD = (1UL << 3), + BTRFS_MOUNT_DEGRADED = (1UL << 4), + BTRFS_MOUNT_COMPRESS = (1UL << 5), + BTRFS_MOUNT_NOTREELOG = (1UL << 6), + BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7), + BTRFS_MOUNT_SSD_SPREAD = (1UL << 8), + BTRFS_MOUNT_NOSSD = (1UL << 9), + BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10), + BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11), + BTRFS_MOUNT_SPACE_CACHE = (1UL << 12), + BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13), + BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14), + BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15), + BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16), + BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17), + BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18), + BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19), + BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20), + BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21), + BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22), + BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23), + BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24), + BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25), + BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26), + BTRFS_MOUNT_REF_VERIFY = (1UL << 27), + BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28), + BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29), + BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30), + BTRFS_MOUNT_NODISCARD = (1UL << 31), +}; + +#define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_DEFAULT_MAX_INLINE (2048) + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); @@ -68,6 +111,26 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, #define btrfs_fs_compat_ro(fs_info, opt) \ __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) +#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) +#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt) +#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ + BTRFS_MOUNT_##opt) + +#define btrfs_set_and_info(fs_info, opt, fmt, args...) \ +do { \ + if (!btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_set_opt(fs_info->mount_opt, opt); \ +} while (0) + +#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ +do { \ + if (btrfs_test_opt(fs_info, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ + btrfs_clear_opt(fs_info->mount_opt, opt); \ +} while (0) + static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) { /* Do it this way so we only ever do one test_bit in the normal case. */ diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index f7535b8b62f5..9b09dc50ba14 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -11,6 +11,7 @@ #include "locking.h" #include "delayed-ref.h" #include "ref-verify.h" +#include "fs.h" /* * Used to keep track the roots and number of refs each root has for a given From 7966a6b5959bbeb38b35d12b7a533c1dee8c432c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:54 -0400 Subject: [PATCH 069/249] btrfs: move fs_info::flags enum to fs.h These definitions are fs wide, take them out of ctree.h and put them in fs.h. 
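Several of these bits pair an atomic set/clear with bit waitqueues;
btrfs_wake_unfinished_drop(), moved along with the enum below, clears
BTRFS_FS_UNFINISHED_DROPS via clear_and_wake_up_bit(). A hedged sketch of the
matching waiter side, with a hypothetical helper name and wait_on_bit() taken
from the generic wait-bit API:

static void example_wait_unfinished_drops(struct btrfs_fs_info *fs_info)
{
	/* Sleep until btrfs_wake_unfinished_drop() clears the bit. */
	wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS,
		    TASK_UNINTERRUPTIBLE);
}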
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 68 ----------------------------------- fs/btrfs/fs.h | 68 +++++++++++++++++++++++++++++++++++ fs/btrfs/tests/qgroup-tests.c | 1 + fs/btrfs/tree-mod-log.c | 1 + 4 files changed, 70 insertions(+), 68 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 86af9d36fdf7..149c185f4314 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -280,69 +280,6 @@ struct btrfs_discard_ctl { atomic64_t discard_bytes_saved; }; -enum { - BTRFS_FS_CLOSING_START, - BTRFS_FS_CLOSING_DONE, - BTRFS_FS_LOG_RECOVERING, - BTRFS_FS_OPEN, - BTRFS_FS_QUOTA_ENABLED, - BTRFS_FS_UPDATE_UUID_TREE_GEN, - BTRFS_FS_CREATING_FREE_SPACE_TREE, - BTRFS_FS_BTREE_ERR, - BTRFS_FS_LOG1_ERR, - BTRFS_FS_LOG2_ERR, - BTRFS_FS_QUOTA_OVERRIDE, - /* Used to record internally whether fs has been frozen */ - BTRFS_FS_FROZEN, - /* - * Indicate that balance has been set up from the ioctl and is in the - * main phase. The fs_info::balance_ctl is initialized. - */ - BTRFS_FS_BALANCE_RUNNING, - - /* - * Indicate that relocation of a chunk has started, it's set per chunk - * and is toggled between chunks. - */ - BTRFS_FS_RELOC_RUNNING, - - /* Indicate that the cleaner thread is awake and doing something. */ - BTRFS_FS_CLEANER_RUNNING, - - /* - * The checksumming has an optimized version and is considered fast, - * so we don't need to offload checksums to workqueues. - */ - BTRFS_FS_CSUM_IMPL_FAST, - - /* Indicate that the discard workqueue can service discards. */ - BTRFS_FS_DISCARD_RUNNING, - - /* Indicate that we need to cleanup space cache v1 */ - BTRFS_FS_CLEANUP_SPACE_CACHE_V1, - - /* Indicate that we can't trust the free space tree for caching yet */ - BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, - - /* Indicate whether there are any tree modification log users */ - BTRFS_FS_TREE_MOD_LOG_USERS, - - /* Indicate that we want the transaction kthread to commit right now. */ - BTRFS_FS_COMMIT_TRANS, - - /* Indicate we have half completed snapshot deletions pending. */ - BTRFS_FS_UNFINISHED_DROPS, - - /* Indicate we have to finish a zone to do next allocation. */ - BTRFS_FS_NEED_ZONE_FINISH, - -#if BITS_PER_LONG == 32 - /* Indicate if we have error/warn message printed on 32bit systems */ - BTRFS_FS_32BIT_ERROR, - BTRFS_FS_32BIT_WARN, -#endif -}; - /* * Exclusive operations (device replace, resize, device add/remove, balance) */ @@ -1037,11 +974,6 @@ enum btrfs_lockdep_trans_states { &lock##_key, 0); \ } while (0) -static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) -{ - clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); -} - /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 4799812a7351..7337707d3939 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -34,6 +34,69 @@ enum { BTRFS_FS_STATE_COUNT }; +enum { + BTRFS_FS_CLOSING_START, + BTRFS_FS_CLOSING_DONE, + BTRFS_FS_LOG_RECOVERING, + BTRFS_FS_OPEN, + BTRFS_FS_QUOTA_ENABLED, + BTRFS_FS_UPDATE_UUID_TREE_GEN, + BTRFS_FS_CREATING_FREE_SPACE_TREE, + BTRFS_FS_BTREE_ERR, + BTRFS_FS_LOG1_ERR, + BTRFS_FS_LOG2_ERR, + BTRFS_FS_QUOTA_OVERRIDE, + /* Used to record internally whether fs has been frozen */ + BTRFS_FS_FROZEN, + /* + * Indicate that balance has been set up from the ioctl and is in the + * main phase. The fs_info::balance_ctl is initialized. 
+ */ + BTRFS_FS_BALANCE_RUNNING, + + /* + * Indicate that relocation of a chunk has started, it's set per chunk + * and is toggled between chunks. + */ + BTRFS_FS_RELOC_RUNNING, + + /* Indicate that the cleaner thread is awake and doing something. */ + BTRFS_FS_CLEANER_RUNNING, + + /* + * The checksumming has an optimized version and is considered fast, + * so we don't need to offload checksums to workqueues. + */ + BTRFS_FS_CSUM_IMPL_FAST, + + /* Indicate that the discard workqueue can service discards. */ + BTRFS_FS_DISCARD_RUNNING, + + /* Indicate that we need to cleanup space cache v1 */ + BTRFS_FS_CLEANUP_SPACE_CACHE_V1, + + /* Indicate that we can't trust the free space tree for caching yet */ + BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, + + /* Indicate whether there are any tree modification log users */ + BTRFS_FS_TREE_MOD_LOG_USERS, + + /* Indicate that we want the transaction kthread to commit right now. */ + BTRFS_FS_COMMIT_TRANS, + + /* Indicate we have half completed snapshot deletions pending. */ + BTRFS_FS_UNFINISHED_DROPS, + + /* Indicate we have to finish a zone to do next allocation. */ + BTRFS_FS_NEED_ZONE_FINISH, + +#if BITS_PER_LONG == 32 + /* Indicate if we have error/warn message printed on 32bit systems */ + BTRFS_FS_32BIT_ERROR, + BTRFS_FS_32BIT_WARN, +#endif +}; + /* * Flags for mount options. * @@ -168,6 +231,11 @@ static inline void btrfs_clear_sb_rdonly(struct super_block *sb) clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); } +static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); +} + #define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ &(fs_info)->fs_state))) #define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 63676ea19f29..fd0e0ade8154 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -10,6 +10,7 @@ #include "../disk-io.h" #include "../qgroup.h" #include "../backref.h" +#include "../fs.h" static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid) diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 41e5d9f789dc..69083889a9b8 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -3,6 +3,7 @@ #include "messages.h" #include "tree-mod-log.h" #include "disk-io.h" +#include "fs.h" struct tree_mod_root { u64 logical; From c52cc7b7acfb3290a0268538b82ac7cf18df7ca4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:55 -0400 Subject: [PATCH 070/249] btrfs: add a BTRFS_FS_NEED_TRANS_COMMIT flag Currently we are only using fs_info->pending_changes to indicate that we need a transaction commit. The original users for this were removed years ago and we don't have more usage in sight, so this is the only remaining reason to have this field. Add a flag so we can remove this code. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/fs.h | 3 +++ fs/btrfs/super.c | 3 ++- fs/btrfs/sysfs.c | 4 ++-- fs/btrfs/transaction.c | 2 ++ 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 7337707d3939..bc5de4d598ad 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -90,6 +90,9 @@ enum { /* Indicate we have to finish a zone to do next allocation. 
*/ BTRFS_FS_NEED_ZONE_FINISH, + /* Indicate that we want to commit the transaction. */ + BTRFS_FS_NEED_TRANS_COMMIT, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3aeb2106e119..0839e4a1cfac 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1535,7 +1535,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) * Exit unless we have some pending changes * that need to go through commit */ - if (fs_info->pending_changes == 0) + if (!test_bit(BTRFS_FS_NEED_TRANS_COMMIT, + &fs_info->flags)) return 0; /* * A non-blocking test if the fs is frozen. We must not diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 31fb6cb389b1..47b221372dcf 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -249,7 +249,7 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj, /* * We don't want to do full transaction commit from inside sysfs */ - btrfs_set_pending(fs_info, COMMIT); + set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); return count; @@ -960,7 +960,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj, /* * We don't want to do full transaction commit from inside sysfs */ - btrfs_set_pending(fs_info, COMMIT); + set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); wake_up_process(fs_info->transaction_kthread); return len; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bae77fb05e2b..7b6b68ab089a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2104,6 +2104,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ASSERT(refcount_read(&trans->use_count) == 1); btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START); + clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags); + /* Stop the commit early if ->aborted is set */ if (TRANS_ABORTED(cur_trans)) { ret = cur_trans->aborted; From 55e5cfd36da5d71e21b72a5922c9b6c349744c2a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:56 -0400 Subject: [PATCH 071/249] btrfs: remove fs_info::pending_changes and related code Now that we're not using this code anywhere we can remove it as well as the member from fs_info. We don't have any mount options or on/off features that would utilize the pending infrastructure, the last one was inode_cache. There was a patchset [1] to enable some features from sysfs that would break things if it would be set immediately. In case we'll need that kind of logic again the patch can be reverted, but for the current use it can be replaced by the single state bit to do the commit. [1] https://lore.kernel.org/linux-btrfs/1422609654-19519-1-git-send-email-quwenruo@cn.fujitsu.com/ Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ add note ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 23 +---------------------- fs/btrfs/disk-io.c | 6 ------ fs/btrfs/transaction.c | 25 ------------------------- fs/btrfs/transaction.h | 1 - 4 files changed, 1 insertion(+), 54 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 149c185f4314..c5f2f387989e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -373,11 +373,7 @@ struct btrfs_fs_info { */ u64 last_trans_log_full_commit; unsigned long mount_opt; - /* - * Track requests for actions that need to be done during transaction - * commit (like for some mount options). 
- */ - unsigned long pending_changes; + unsigned long compress_type:4; unsigned int compress_level; u32 commit_interval; @@ -1260,23 +1256,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -/* - * Requests for changes that need to be done during transaction commit. - * - * Internal mount options that are used for special handling of the real - * mount options (eg. cannot be set during remount and have to be set during - * transaction commit) - */ - -#define BTRFS_PENDING_COMMIT (0) - -#define btrfs_test_pending(info, opt) \ - test_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) -#define btrfs_set_pending(info, opt) \ - set_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) -#define btrfs_clear_pending(info, opt) \ - clear_bit(BTRFS_PENDING_##opt, &(info)->pending_changes) - struct btrfs_map_token { struct extent_buffer *eb; char *kaddr; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5e20b191b108..2c38b4eebd8f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3761,12 +3761,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_clear_opt(fs_info->mount_opt, NODISCARD); } - /* - * Mount does not set all options immediately, we can do it now and do - * not have to wait for transaction commit - */ - btrfs_apply_pending_changes(fs_info); - #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) { ret = btrfsic_mount(fs_info, fs_devices, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7b6b68ab089a..37d0baaa41d8 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -2359,12 +2359,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) if (ret) goto unlock_reloc; - /* - * Since the transaction is done, we can apply the pending changes - * before the next transaction. - */ - btrfs_apply_pending_changes(fs_info); - /* commit_fs_roots gets rid of all the tree log roots, it is now * safe to free the root of tree log roots */ @@ -2587,25 +2581,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) return (ret < 0) ? 
0 : 1; } -void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info) -{ - unsigned long prev; - unsigned long bit; - - prev = xchg(&fs_info->pending_changes, 0); - if (!prev) - return; - - bit = 1 << BTRFS_PENDING_COMMIT; - if (prev & bit) - btrfs_debug(fs_info, "pending commit done"); - prev &= ~bit; - - if (prev) - btrfs_warn(fs_info, - "unknown pending changes left 0x%lx, ignoring", prev); -} - int __init btrfs_transaction_init(void) { btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle", diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index b39ebf98af60..97f6c39f59c8 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -233,7 +233,6 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark); int btrfs_transaction_blocked(struct btrfs_fs_info *info); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); void btrfs_put_transaction(struct btrfs_transaction *transaction); -void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info); void btrfs_add_dropped_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); From d83eb482b727e7d9fdf6f729db6da3642c4d8a58 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:57 -0400 Subject: [PATCH 072/249] btrfs: move the compat/incompat flag masks to fs.h This is fs wide information, move it out of ctree.h into fs.h. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 57 ------------------------------------------------ fs/btrfs/fs.h | 57 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 57 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c5f2f387989e..af6b72c7e33f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -78,63 +78,6 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); */ #define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) -/* - * Compat flags that we support. 
If any incompat flags are set other than the - * ones specified below then we will fail to mount - */ -#define BTRFS_FEATURE_COMPAT_SUPP 0ULL -#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL -#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL - -#define BTRFS_FEATURE_COMPAT_RO_SUPP \ - (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ - BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ - BTRFS_FEATURE_COMPAT_RO_VERITY | \ - BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE) - -#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL -#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL - -#ifdef CONFIG_BTRFS_DEBUG -/* - * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG - */ -#define BTRFS_FEATURE_INCOMPAT_SUPP \ - (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ - BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ - BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ - BTRFS_FEATURE_INCOMPAT_RAID56 | \ - BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ - BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ - BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ - BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ - BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED | \ - BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) -#else -#define BTRFS_FEATURE_INCOMPAT_SUPP \ - (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ - BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ - BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ - BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ - BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ - BTRFS_FEATURE_INCOMPAT_RAID56 | \ - BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ - BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ - BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ - BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ - BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ - BTRFS_FEATURE_INCOMPAT_ZONED) -#endif - -#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ - (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) -#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL - /* Read ahead values for struct btrfs_path.reada */ enum { READA_NONE, diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index bc5de4d598ad..af356feec0c7 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -140,6 +140,63 @@ enum { BTRFS_MOUNT_NODISCARD = (1UL << 31), }; +/* + * Compat flags that we support. 
If any incompat flags are set other than the + * ones specified below then we will fail to mount + */ +#define BTRFS_FEATURE_COMPAT_SUPP 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL + +#define BTRFS_FEATURE_COMPAT_RO_SUPP \ + (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \ + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \ + BTRFS_FEATURE_COMPAT_RO_VERITY | \ + BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE) + +#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL +#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL + +#ifdef CONFIG_BTRFS_DEBUG +/* + * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG + */ +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) +#else +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED) +#endif + +#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ + (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) +#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL + #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) From 818fe33aed42ddd5052171328a3f708e98357e10 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:58 -0400 Subject: [PATCH 073/249] btrfs: rename struct-funcs.c to accessors.c Rename struct-funcs.c to accessors.c so we can move the item accessors out of ctree.h. accessors.c is a better description of the code that is contained in these files. 
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/{struct-funcs.c => accessors.c} | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) rename fs/btrfs/{struct-funcs.c => accessors.c} (99%) diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index eebb45c06485..76f90dcfb14d 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -24,7 +24,7 @@ obj-$(CONFIG_BTRFS_FS) := btrfs.o btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o disk-io.o \ transaction.o inode.o file.o tree-defrag.o \ - extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ + extent_map.o sysfs.o accessors.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/accessors.c similarity index 99% rename from fs/btrfs/struct-funcs.c rename to fs/btrfs/accessors.c index 6ba16c018d7f..118bfd1c0e3e 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/accessors.c @@ -4,7 +4,6 @@ */ #include - #include "messages.h" #include "ctree.h" From ad1ac5012c2beccc9de4d3e2cc1382fd250e5540 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:50:59 -0400 Subject: [PATCH 074/249] btrfs: move btrfs_map_token to accessors This is specific to the item-accessor code, move it out of ctree.h into accessor.h/.c and then update the users to include the new header file. This un-inlines btrfs_init_map_token, however this is only called once per function so it's not critical to be inlined. This also saves 904 bytes of code on a release build. 
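For context, a small hedged usage sketch of the token being moved:
btrfs_init_map_token() caches page_address() of the buffer's first page, so a
run of btrfs_token_*() accessor calls (generated by BTRFS_SETGET_FUNCS(),
moved in the next patch) can reuse the cached mapping instead of re-deriving
it for every field. The helper name and leaf/slot handling are illustrative
only:

static u64 example_read_inode_size(struct extent_buffer *leaf, int slot)
{
	struct btrfs_map_token token;
	struct btrfs_inode_item *ii;

	ii = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
	/* One page_address() lookup shared by the token accessors. */
	btrfs_init_map_token(&token, leaf);
	return btrfs_token_inode_size(&token, ii);
}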
Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/accessors.c | 8 ++++++++ fs/btrfs/accessors.h | 14 ++++++++++++++ fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 16 ++-------------- fs/btrfs/inode.c | 1 + fs/btrfs/tree-log.c | 1 + 6 files changed, 27 insertions(+), 14 deletions(-) create mode 100644 fs/btrfs/accessors.h diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 118bfd1c0e3e..7a7b7d263102 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -6,6 +6,7 @@ #include #include "messages.h" #include "ctree.h" +#include "accessors.h" static bool check_setget_bounds(const struct extent_buffer *eb, const void *ptr, unsigned off, int size) @@ -23,6 +24,13 @@ static bool check_setget_bounds(const struct extent_buffer *eb, return true; } +void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb) +{ + token->eb = eb; + token->kaddr = page_address(eb->pages[0]); + token->offset = 0; +} + /* * Macro templates that define helpers to read/write extent buffer data of a * given size, that are also used via ctree.h for access to item members by diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h new file mode 100644 index 000000000000..a47b0f9d30ca --- /dev/null +++ b/fs/btrfs/accessors.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ACCESSORS_H +#define BTRFS_ACCESSORS_H + +struct btrfs_map_token { + struct extent_buffer *eb; + char *kaddr; + unsigned long offset; +}; + +void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); + +#endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7ecb658500ce..1283ca46c5bc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -19,6 +19,7 @@ #include "tree-mod-log.h" #include "tree-checker.h" #include "fs.h" +#include "accessors.h" static struct kmem_cache *btrfs_path_cachep; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index af6b72c7e33f..296ca4de8d52 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -52,6 +52,8 @@ struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; +struct btrfs_map_token; + #define BTRFS_OLDEST_GENERATION 0ULL #define BTRFS_EMPTY_DIR_SIZE 0 @@ -1199,23 +1201,9 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); } -struct btrfs_map_token { - struct extent_buffer *eb; - char *kaddr; - unsigned long offset; -}; - #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sectorsize_bits) -static inline void btrfs_init_map_token(struct btrfs_map_token *token, - struct extent_buffer *eb) -{ - token->eb = eb; - token->kaddr = page_address(eb->pages[0]); - token->offset = 0; -} - /* some macros to generate set/get functions for the struct fields. 
This * assumes there is a lefoo_to_cpu for every type, so lets make a simple * one for u8: diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 60b3162c035d..cb2a6d7f6252 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -56,6 +56,7 @@ #include "subpage.h" #include "inode-item.h" #include "fs.h" +#include "accessors.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e294c38f9b19..dc49f0aae1fb 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -22,6 +22,7 @@ #include "zoned.h" #include "inode-item.h" #include "fs.h" +#include "accessors.h" #define MAX_CONFLICT_INODES 10 From 07e81dc94474eb62705c6f96d9ab1a5a797b8703 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:51:00 -0400 Subject: [PATCH 075/249] btrfs: move accessor helpers into accessors.h This is a large patch, but because they're all macros it's impossible to split up. Simply copy all of the item accessors in ctree.h and paste them in accessors.h, and then update any files to include the header so everything compiles. Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ reformat comments, style fixups ] Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 1068 +++++++++++++++++++++++ fs/btrfs/backref.c | 1 + fs/btrfs/block-group.c | 1 + fs/btrfs/block-rsv.c | 1 + fs/btrfs/check-integrity.c | 1 + fs/btrfs/ctree.h | 1090 ------------------------ fs/btrfs/delayed-inode.c | 1 + fs/btrfs/dev-replace.c | 1 + fs/btrfs/dir-item.c | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/export.c | 1 + fs/btrfs/extent-tree.c | 1 + fs/btrfs/extent_io.c | 1 + fs/btrfs/file-item.c | 1 + fs/btrfs/file.c | 1 + fs/btrfs/free-space-cache.c | 1 + fs/btrfs/free-space-tree.c | 1 + fs/btrfs/fs.c | 1 + fs/btrfs/inode-item.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/locking.c | 1 + fs/btrfs/print-tree.c | 1 + fs/btrfs/props.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/ref-verify.c | 1 + fs/btrfs/reflink.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/root-tree.c | 1 + fs/btrfs/scrub.c | 1 + fs/btrfs/send.c | 1 + fs/btrfs/space-info.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/sysfs.c | 1 + fs/btrfs/tests/extent-buffer-tests.c | 1 + fs/btrfs/tests/free-space-tree-tests.c | 1 + fs/btrfs/tests/inode-tests.c | 1 + fs/btrfs/tests/qgroup-tests.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/tree-checker.c | 1 + fs/btrfs/tree-defrag.c | 1 + fs/btrfs/tree-mod-log.c | 1 + fs/btrfs/uuid-tree.c | 2 +- fs/btrfs/verity.c | 1 + fs/btrfs/volumes.c | 1 + fs/btrfs/xattr.c | 1 + fs/btrfs/zoned.c | 1 + 46 files changed, 1112 insertions(+), 1091 deletions(-) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index a47b0f9d30ca..e8bbd4d80161 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -9,6 +9,1074 @@ struct btrfs_map_token { unsigned long offset; }; +#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) + void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); +/* + * Some macros to generate set/get functions for the struct fields. 
This + * assumes there is a lefoo_to_cpu for every type, so lets make a simple one + * for u8: + */ +#define le8_to_cpu(v) (v) +#define cpu_to_le8(v) (v) +#define __le8 u8 + +static inline u8 get_unaligned_le8(const void *p) +{ + return *(u8 *)p; +} + +static inline void put_unaligned_le8(u8 val, void *p) +{ + *(u8 *)p = val; +} + +#define read_eb_member(eb, ptr, type, member, result) (\ + read_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define write_eb_member(eb, ptr, type, member, result) (\ + write_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define DECLARE_BTRFS_SETGET_BITS(bits) \ +u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off); \ +void btrfs_set_token_##bits(struct btrfs_map_token *token, \ + const void *ptr, unsigned long off, \ + u##bits val); \ +u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ + const void *ptr, unsigned long off); \ +void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ + unsigned long off, u##bits val); + +DECLARE_BTRFS_SETGET_BITS(8) +DECLARE_BTRFS_SETGET_BITS(16) +DECLARE_BTRFS_SETGET_BITS(32) +DECLARE_BTRFS_SETGET_BITS(64) + +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ + const type *s) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + return btrfs_get_##bits(eb, s, offsetof(type, member)); \ +} \ +static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ + u##bits val) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + btrfs_set_##bits(eb, s, offsetof(type, member), val); \ +} \ +static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ + const type *s) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + return btrfs_get_token_##bits(token, s, offsetof(type, member));\ +} \ +static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ + type *s, u##bits val) \ +{ \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ + btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ +} + +#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ +{ \ + const type *p = page_address(eb->pages[0]) + \ + offset_in_page(eb->start); \ + return get_unaligned_le##bits(&p->member); \ +} \ +static inline void btrfs_set_##name(const struct extent_buffer *eb, \ + u##bits val) \ +{ \ + type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ + put_unaligned_le##bits(val, &p->member); \ +} + +#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(const type *s) \ +{ \ + return get_unaligned_le##bits(&s->member); \ +} \ +static inline void btrfs_set_##name(type *s, u##bits val) \ +{ \ + put_unaligned_le##bits(val, &s->member); \ +} + +static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, + struct btrfs_dev_item *s) +{ + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item *)0))->total_bytes); + return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, + total_bytes)); +} +static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, + struct btrfs_dev_item *s, + u64 val) +{ + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item 
*)0)->total_bytes)); + WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); + btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); +} + +BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); +BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); +BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); +BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, start_offset, 64); +BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); +BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); +BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); +BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, + seek_speed, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, + bandwidth, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, + generation, 64); + +static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) +{ + return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); +} + +static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) +{ + return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); +} + +BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); +BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); +BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); +BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); +BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); + +static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) +{ + return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); +} + +BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, + stripe_len, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, + num_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, + sub_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); + +static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, int nr) +{ + unsigned long offset = (unsigned long)c; + + offset += offsetof(struct btrfs_chunk, stripe); + offset += nr * sizeof(struct btrfs_stripe); + return (struct btrfs_stripe *)offset; +} + +static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); +} + +/* struct btrfs_block_group_item */ +BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); + +BTRFS_SETGET_FUNCS(block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, + struct btrfs_block_group_item, flags, 64); + +/* struct btrfs_free_space_info */ +BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, + extent_count, 32); +BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); + +/* struct btrfs_inode_ref */ +BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); + +/* struct btrfs_inode_extref */ +BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, + parent_objectid, 64); +BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, + name_len, 16); +BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); + +/* struct btrfs_inode_item */ +BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); +BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); +BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); +BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); 
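+
+/*
+ * Usage sketch (illustrative only; 'leaf' and 'slot' are assumed to be an
+ * extent buffer and an item slot, they are not defined by this header).
+ * The helpers generated above read and write little-endian on-disk fields
+ * through the extent buffer pages:
+ *
+ *	struct btrfs_inode_item *ii;
+ *	u64 size;
+ *
+ *	ii = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item);
+ *	size = btrfs_inode_size(leaf, ii);
+ *	btrfs_set_inode_nlink(leaf, ii, 1);
+ *
+ * The stack_ variants that follow take a pointer to a plain in-memory copy
+ * of the item instead of an extent buffer.
+ */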
+BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, + sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, + transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, + block_group, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); +BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); + +/* struct btrfs_dev_extent */ +BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, chunk_tree, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); +BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64); +BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); + +BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); + +static inline void btrfs_tree_block_key(const struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, + struct btrfs_tree_block_info *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); +} + +BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, root, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, + objectid, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, + offset, 64); +BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, count, 32); + +BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, count, 32); + +BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, + type, 8); +BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, + offset, 64); + +static inline u32 btrfs_extent_inline_ref_size(int type) +{ + if (type == BTRFS_TREE_BLOCK_REF_KEY || + type == BTRFS_SHARED_BLOCK_REF_KEY) + return sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_SHARED_DATA_REF_KEY) + return sizeof(struct btrfs_shared_data_ref) + + sizeof(struct btrfs_extent_inline_ref); + if (type == BTRFS_EXTENT_DATA_REF_KEY) + return sizeof(struct btrfs_extent_data_ref) + + offsetof(struct btrfs_extent_inline_ref, offset); + return 0; +} + +/* struct 
btrfs_node */ +BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, + generation, 64); + +static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline unsigned long btrfs_node_key_ptr_offset(int nr) +{ + return offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; +} + +void btrfs_node_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr); + +static inline void btrfs_set_node_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr; + + ptr = btrfs_node_key_ptr_offset(nr); + write_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} + +/* struct btrfs_item */ +BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); + +static inline unsigned long btrfs_item_nr_offset(int nr) +{ + return offsetof(struct btrfs_leaf, items) + + sizeof(struct btrfs_item) * nr; +} + +static inline struct btrfs_item *btrfs_item_nr(int nr) +{ + return (struct btrfs_item *)btrfs_item_nr_offset(nr); +} + +#define BTRFS_ITEM_SETGET_FUNCS(member) \ +static inline u32 btrfs_item_##member(const struct extent_buffer *eb, int slot) \ +{ \ + return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \ +} \ +static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ + int slot, u32 val) \ +{ \ + btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \ +} \ +static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ + int slot) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(slot); \ + return btrfs_token_raw_item_##member(token, item); \ +} \ +static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ + int slot, u32 val) \ +{ \ + struct btrfs_item *item = btrfs_item_nr(slot); \ + btrfs_set_token_raw_item_##member(token, item, val); \ +} + +BTRFS_ITEM_SETGET_FUNCS(offset) +BTRFS_ITEM_SETGET_FUNCS(size); + +static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) +{ + return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); +} + +static inline void 
btrfs_item_key(const struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(nr); + + read_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +static inline void btrfs_set_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(nr); + + write_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); + +/* struct btrfs_root_ref */ +BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); + +/* struct btrfs_dir_item */ +BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64); + +static inline void btrfs_dir_item_key(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + const struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, + num_entries, 64); +BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, + num_bitmaps, 64); +BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, + generation, 64); + +static inline void btrfs_free_space_key(const struct extent_buffer *eb, + const struct btrfs_free_space_header *h, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +static inline void btrfs_set_free_space_key(struct extent_buffer *eb, + struct btrfs_free_space_header *h, + const struct btrfs_disk_key *key) +{ + write_eb_member(eb, h, struct btrfs_free_space_header, location, key); +} + +/* struct btrfs_disk_key */ +BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, objectid, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); + +#ifdef __LITTLE_ENDIAN + +/* + * Optimized helpers for little-endian architectures where CPU and on-disk + * structures have the same endianness and we can skip conversions. 
+ */ + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key, + const struct btrfs_disk_key *disk_key) +{ + memcpy(cpu_key, disk_key, sizeof(struct btrfs_key)); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key, + const struct btrfs_key *cpu_key) +{ + memcpy(disk_key, cpu_key, sizeof(struct btrfs_key)); +} + +static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *cpu_key, int nr) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_node_key(eb, disk_key, nr); +} + +static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *cpu_key, int nr) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_item_key(eb, disk_key, nr); +} + +static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_key *cpu_key) +{ + struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; + + btrfs_dir_item_key(eb, item, disk_key); +} + +#else + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, + const struct btrfs_disk_key *disk) +{ + cpu->offset = le64_to_cpu(disk->offset); + cpu->type = disk->type; + cpu->objectid = le64_to_cpu(disk->objectid); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, + const struct btrfs_key *cpu) +{ + disk->offset = cpu_to_le64(cpu->offset); + disk->type = cpu->type; + disk->objectid = cpu_to_le64(cpu->objectid); +} + +static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + + btrfs_node_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + + btrfs_item_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, + const struct btrfs_dir_item *item, + struct btrfs_key *key) +{ + struct btrfs_disk_key disk_key; + + btrfs_dir_item_key(eb, item, &disk_key); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +#endif + +/* struct btrfs_header */ +BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); +BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, generation, 64); +BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); +BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); +BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); + +static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) +{ + return (btrfs_header_flags(eb) & flag) == flag; +} + +static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + + btrfs_set_header_flags(eb, flags | flag); +} + +static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + + btrfs_set_header_flags(eb, 
flags & ~flag); +} + +static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) +{ + u64 flags = btrfs_header_flags(eb); + + return flags >> BTRFS_BACKREF_REV_SHIFT; +} + +static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, int rev) +{ + u64 flags = btrfs_header_flags(eb); + + flags &= ~BTRFS_BACKREF_REV_MASK; + flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; + btrfs_set_header_flags(eb, flags); +} + +static inline int btrfs_is_leaf(const struct extent_buffer *eb) +{ + return btrfs_header_level(eb) == 0; +} + +/* struct btrfs_root_item */ +BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, generation, 64); +BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); + +BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8); +BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); +BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); +BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); +BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, + last_snapshot, 64); +BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, + generation_v2, 64); +BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, ctransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, otransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, stransid, 64); +BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64); + +static inline bool btrfs_root_readonly(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; +} + +static inline bool btrfs_root_dead(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; +} + +static inline u64 btrfs_root_id(const struct btrfs_root *root) +{ + return root->root_key.objectid; +} + +/* struct btrfs_root_backup */ +BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, + tree_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, + tree_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, + tree_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, + chunk_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, + chunk_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, + extent_root_level, 8); + 
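+/*
+ * Sketch of the stack_ accessors in use (illustrative; the slot index 0 and
+ * the use of super_for_commit are assumptions). struct btrfs_root_backup
+ * lives inside the in-memory super block copy rather than in an extent
+ * buffer, which is why only stack_ style helpers are generated for it:
+ *
+ *	struct btrfs_super_block *sb = fs_info->super_for_commit;
+ *	struct btrfs_root_backup *backup = &sb->super_roots[0];
+ *
+ *	btrfs_set_backup_tree_root(backup, fs_info->tree_root->node->start);
+ *	btrfs_set_backup_tree_root_gen(backup,
+ *			btrfs_header_generation(fs_info->tree_root->node));
+ */
+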
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, + fs_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, + fs_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, + fs_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, + dev_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, + dev_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, + dev_root_level, 8); + +BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, + csum_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, + csum_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, + csum_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, + num_devices, 64); + +/* struct btrfs_balance_item */ +BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); + +static inline void btrfs_balance_data(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_set_balance_data(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); +} + +static inline void btrfs_balance_meta(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_set_balance_meta(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); +} + +static inline void btrfs_balance_sys(const struct extent_buffer *eb, + const struct btrfs_balance_item *bi, + struct btrfs_disk_balance_args *ba) +{ + read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_set_balance_sys(struct extent_buffer *eb, + struct btrfs_balance_item *bi, + const struct btrfs_disk_balance_args *ba) +{ + write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); +} + +static inline void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, + const struct btrfs_disk_balance_args *disk) +{ + memset(cpu, 0, sizeof(*cpu)); + + cpu->profiles = le64_to_cpu(disk->profiles); + cpu->usage = le64_to_cpu(disk->usage); + cpu->devid = le64_to_cpu(disk->devid); + cpu->pstart = le64_to_cpu(disk->pstart); + cpu->pend = le64_to_cpu(disk->pend); + cpu->vstart = le64_to_cpu(disk->vstart); + cpu->vend = le64_to_cpu(disk->vend); + cpu->target = le64_to_cpu(disk->target); + cpu->flags = le64_to_cpu(disk->flags); + cpu->limit = le64_to_cpu(disk->limit); + cpu->stripes_min = le32_to_cpu(disk->stripes_min); + cpu->stripes_max = le32_to_cpu(disk->stripes_max); +} + +static inline void btrfs_cpu_balance_args_to_disk( + struct btrfs_disk_balance_args *disk, + const struct btrfs_balance_args *cpu) +{ + memset(disk, 0, sizeof(*disk)); + + disk->profiles = cpu_to_le64(cpu->profiles); + disk->usage = cpu_to_le64(cpu->usage); + disk->devid = cpu_to_le64(cpu->devid); + disk->pstart = 
cpu_to_le64(cpu->pstart); + disk->pend = cpu_to_le64(cpu->pend); + disk->vstart = cpu_to_le64(cpu->vstart); + disk->vend = cpu_to_le64(cpu->vend); + disk->target = cpu_to_le64(cpu->target); + disk->flags = cpu_to_le64(cpu->flags); + disk->limit = cpu_to_le64(cpu->limit); + disk->stripes_min = cpu_to_le32(cpu->stripes_min); + disk->stripes_max = cpu_to_le32(cpu->stripes_max); +} + +/* struct btrfs_super_block */ +BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); +BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, + struct btrfs_super_block, sys_chunk_array_size, 32); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, + struct btrfs_super_block, chunk_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, + root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, + chunk_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, + log_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, + sectorsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, + nodesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, + stripesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, + root_dir_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, + num_devices, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, + compat_ro_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, + incompat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, + csum_type, 16); +BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, + cache_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); +BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, + uuid_tree_generation, 64); + +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); +const char *btrfs_super_csum_driver(u16 csum_type); +size_t __attribute_const__ btrfs_get_num_csums(void); + +/* + * The leaf data grows from end-to-front in the node. this returns the address + * of the start of the last item, which is the stop of the leaf data stack. 
+ */ +static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); + return btrfs_item_offset(leaf, nr - 1); +} + +/* struct btrfs_file_extent_item */ +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, + type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, + struct btrfs_file_extent_item, disk_bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, + struct btrfs_file_extent_item, offset, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, + struct btrfs_file_extent_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, + struct btrfs_file_extent_item, num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes, + struct btrfs_file_extent_item, ram_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, + struct btrfs_file_extent_item, disk_num_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, + struct btrfs_file_extent_item, compression, 8); + +static inline unsigned long btrfs_file_extent_inline_start( + const struct btrfs_file_extent_item *e) +{ + return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; +} + +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); +BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, + disk_bytenr, 64); +BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, + disk_num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, + offset, 64); +BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, + num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, + ram_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, + compression, 8); +BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, + encryption, 8); +BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + other_encoding, 16); + +/* + * Returns the number of bytes used by the item on disk, minus the size of any + * extent headers. If a file is compressed on disk, this is the compressed + * size. 
+ */ +static inline u32 btrfs_file_extent_inline_item_len( + const struct extent_buffer *eb, + int nr) +{ + return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +/* btrfs_qgroup_status_item */ +BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, + version, 64); +BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, + flags, 64); +BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, + rescan, 64); + +/* btrfs_qgroup_info_item */ +BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, + generation, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, + rfer_cmpr, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); +BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, + excl_cmpr, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, + struct btrfs_qgroup_info_item, generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, + rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, + struct btrfs_qgroup_info_item, rfer_cmpr, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, + excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, + struct btrfs_qgroup_info_item, excl_cmpr, 64); + +/* btrfs_qgroup_limit_item */ +BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, flags, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, + max_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, + max_excl, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, + rsv_rfer, 64); +BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, + rsv_excl, 64); + +/* btrfs_dev_replace_item */ +BTRFS_SETGET_FUNCS(dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, + 64); +BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, + replace_state, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, + time_started, 64); +BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, + time_stopped, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, + num_write_errors, 64); +BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, num_uncorrectable_read_errors, + 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, + cursor_left, 64); +BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, + cursor_right, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, + struct btrfs_dev_replace_item, src_devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, + struct btrfs_dev_replace_item, + cont_reading_from_srcdev_mode, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, + struct btrfs_dev_replace_item, replace_state, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, + struct btrfs_dev_replace_item, time_started, 64); 
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, + struct btrfs_dev_replace_item, time_stopped, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, + struct btrfs_dev_replace_item, num_write_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, + struct btrfs_dev_replace_item, + num_uncorrectable_read_errors, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, + struct btrfs_dev_replace_item, cursor_left, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, + struct btrfs_dev_replace_item, cursor_right, 64); + +/* btrfs_verity_descriptor_item */ +BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, + encryption, 8); +BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, + size, 64); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, + struct btrfs_verity_descriptor_item, encryption, 8); +BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, + struct btrfs_verity_descriptor_item, size, 64); + +/* Cast into the data area of the leaf. */ +#define btrfs_item_ptr(leaf, slot, type) \ + ((type *)(BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset(leaf, slot))) + +#define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset(leaf, slot))) + #endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f76db317c437..41caa6fc6301 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -16,6 +16,7 @@ #include "misc.h" #include "tree-mod-log.h" #include "fs.h" +#include "accessors.h" /* Just arbitrary numbers so we can be sure one of these happened. */ #define BACKREF_FOUND_SHARED 6 diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index bf8d1e110136..d1f8d792b184 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -18,6 +18,7 @@ #include "raid56.h" #include "zoned.h" #include "fs.h" +#include "accessors.h" #ifdef CONFIG_BTRFS_DEBUG int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index a48fa7ac74cc..069ea9d6a8d4 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -8,6 +8,7 @@ #include "block-group.h" #include "disk-io.h" #include "fs.h" +#include "accessors.h" /* * HOW DO BLOCK RESERVES WORK diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index e8e1a92b30ac..7ff0703ef3e4 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -93,6 +93,7 @@ #include "check-integrity.h" #include "rcu-string.h" #include "compression.h" +#include "accessors.h" #define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000 #define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000 diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 296ca4de8d52..20034c23abb2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1176,8 +1176,6 @@ static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) return info->nodesize - sizeof(struct btrfs_header); } -#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) - static inline u32 BTRFS_MAX_ITEM_SIZE(const struct btrfs_fs_info *info) { return BTRFS_LEAF_DATA_SIZE(info) - sizeof(struct btrfs_item); @@ -1204,1094 +1202,6 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_BYTES_TO_BLKS(fs_info, bytes) \ ((bytes) >> (fs_info)->sectorsize_bits) -/* some macros to generate set/get functions for the struct fields. 
This - * assumes there is a lefoo_to_cpu for every type, so lets make a simple - * one for u8: - */ -#define le8_to_cpu(v) (v) -#define cpu_to_le8(v) (v) -#define __le8 u8 - -static inline u8 get_unaligned_le8(const void *p) -{ - return *(u8 *)p; -} - -static inline void put_unaligned_le8(u8 val, void *p) -{ - *(u8 *)p = val; -} - -#define read_eb_member(eb, ptr, type, member, result) (\ - read_extent_buffer(eb, (char *)(result), \ - ((unsigned long)(ptr)) + \ - offsetof(type, member), \ - sizeof(((type *)0)->member))) - -#define write_eb_member(eb, ptr, type, member, result) (\ - write_extent_buffer(eb, (char *)(result), \ - ((unsigned long)(ptr)) + \ - offsetof(type, member), \ - sizeof(((type *)0)->member))) - -#define DECLARE_BTRFS_SETGET_BITS(bits) \ -u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off); \ -void btrfs_set_token_##bits(struct btrfs_map_token *token, \ - const void *ptr, unsigned long off, \ - u##bits val); \ -u##bits btrfs_get_##bits(const struct extent_buffer *eb, \ - const void *ptr, unsigned long off); \ -void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \ - unsigned long off, u##bits val); - -DECLARE_BTRFS_SETGET_BITS(8) -DECLARE_BTRFS_SETGET_BITS(16) -DECLARE_BTRFS_SETGET_BITS(32) -DECLARE_BTRFS_SETGET_BITS(64) - -#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ - const type *s) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0)->member)); \ - return btrfs_get_##bits(eb, s, offsetof(type, member)); \ -} \ -static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ - u##bits val) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0)->member)); \ - btrfs_set_##bits(eb, s, offsetof(type, member), val); \ -} \ -static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ - const type *s) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0)->member)); \ - return btrfs_get_token_##bits(token, s, offsetof(type, member));\ -} \ -static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ - type *s, u##bits val) \ -{ \ - static_assert(sizeof(u##bits) == sizeof(((type *)0)->member)); \ - btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ -} - -#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ -{ \ - const type *p = page_address(eb->pages[0]) + \ - offset_in_page(eb->start); \ - return get_unaligned_le##bits(&p->member); \ -} \ -static inline void btrfs_set_##name(const struct extent_buffer *eb, \ - u##bits val) \ -{ \ - type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \ - put_unaligned_le##bits(val, &p->member); \ -} - -#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ -static inline u##bits btrfs_##name(const type *s) \ -{ \ - return get_unaligned_le##bits(&s->member); \ -} \ -static inline void btrfs_set_##name(type *s, u##bits val) \ -{ \ - put_unaligned_le##bits(val, &s->member); \ -} - -static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, - struct btrfs_dev_item *s) -{ - static_assert(sizeof(u64) == - sizeof(((struct btrfs_dev_item *)0)->total_bytes)); - return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, - total_bytes)); -} -static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, - struct btrfs_dev_item *s, - u64 val) -{ - static_assert(sizeof(u64) == - sizeof(((struct btrfs_dev_item
*)0)->total_bytes)); - WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); - btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); -} - - -BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); -BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); -BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); -BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); -BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, - start_offset, 64); -BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); -BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); -BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); -BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); -BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); -BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, - io_align, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, - io_width, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, - sector_size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, - dev_group, 32); -BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, - seek_speed, 8); -BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, - bandwidth, 8); -BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, - generation, 64); - -static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d) -{ - return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid); -} - -static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d) -{ - return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid); -} - -BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); -BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); -BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); -BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); -BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); -BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); -BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); -BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); -BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); -BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); -BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); - -static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) -{ - return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); -} - -BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, - stripe_len, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, - io_align, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, - io_width, 32); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, - sector_size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, - num_stripes, 16); -BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, - sub_stripes, 16); -BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); - -static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, - int nr) -{ - unsigned long offset = (unsigned long)c; - offset += offsetof(struct btrfs_chunk, stripe); - offset += nr * sizeof(struct btrfs_stripe); - return (struct btrfs_stripe *)offset; -} - -static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); -} - -static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, - struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); -} - -static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, - struct btrfs_chunk *c, int nr) -{ - return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); -} - -/* struct btrfs_block_group_item */ -BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, - used, 64); -BTRFS_SETGET_FUNCS(block_group_used, struct btrfs_block_group_item, - used, 64); -BTRFS_SETGET_STACK_FUNCS(stack_block_group_chunk_objectid, - struct btrfs_block_group_item, chunk_objectid, 64); - -BTRFS_SETGET_FUNCS(block_group_chunk_objectid, - struct btrfs_block_group_item, chunk_objectid, 64); -BTRFS_SETGET_FUNCS(block_group_flags, - struct btrfs_block_group_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(stack_block_group_flags, - struct btrfs_block_group_item, flags, 64); - -/* struct btrfs_free_space_info */ -BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, - extent_count, 32); -BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); - -/* struct btrfs_inode_ref */ -BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); -BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); - -/* struct btrfs_inode_extref */ -BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, - parent_objectid, 64); -BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref, - name_len, 16); -BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64); - -/* struct btrfs_inode_item */ -BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); -BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); -BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); -BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); -BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); -BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); -BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); -BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); -BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); -BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); -BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); -BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); 
-BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item, - sequence, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item, - transid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item, - nbytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item, - block_group, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); -BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); -BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); -BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); -BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); -BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); -BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32); - -/* struct btrfs_dev_extent */ -BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, - chunk_tree, 64); -BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, - chunk_objectid, 64); -BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, - chunk_offset, 64); -BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); -BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); -BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, - generation, 64); -BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); - -BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8); - -static inline void btrfs_tree_block_key(const struct extent_buffer *eb, - struct btrfs_tree_block_info *item, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, item, struct btrfs_tree_block_info, key, key); -} - -static inline void btrfs_set_tree_block_key(const struct extent_buffer *eb, - struct btrfs_tree_block_info *item, - struct btrfs_disk_key *key) -{ - write_eb_member(eb, item, struct btrfs_tree_block_info, key, key); -} - -BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref, - root, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref, - objectid, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref, - offset, 64); -BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref, - count, 32); - -BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref, - count, 32); - -BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref, - type, 8); -BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref, - offset, 64); - -static inline u32 btrfs_extent_inline_ref_size(int type) -{ - if (type == BTRFS_TREE_BLOCK_REF_KEY || - type == BTRFS_SHARED_BLOCK_REF_KEY) - return sizeof(struct btrfs_extent_inline_ref); - if (type == BTRFS_SHARED_DATA_REF_KEY) - return sizeof(struct btrfs_shared_data_ref) + - sizeof(struct btrfs_extent_inline_ref); - if (type == BTRFS_EXTENT_DATA_REF_KEY) - return sizeof(struct btrfs_extent_data_ref) + - offsetof(struct btrfs_extent_inline_ref, offset); - return 0; -} - -/* 
struct btrfs_node */ -BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); -BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr, - blockptr, 64); -BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr, - generation, 64); - -static inline u64 btrfs_node_blockptr(const struct extent_buffer *eb, int nr) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); -} - -static inline void btrfs_set_node_blockptr(const struct extent_buffer *eb, - int nr, u64 val) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); -} - -static inline u64 btrfs_node_ptr_generation(const struct extent_buffer *eb, int nr) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); -} - -static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, - int nr, u64 val) -{ - unsigned long ptr; - ptr = offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; - btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); -} - -static inline unsigned long btrfs_node_key_ptr_offset(int nr) -{ - return offsetof(struct btrfs_node, ptrs) + - sizeof(struct btrfs_key_ptr) * nr; -} - -void btrfs_node_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr); - -static inline void btrfs_set_node_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - unsigned long ptr; - ptr = btrfs_node_key_ptr_offset(nr); - write_eb_member(eb, (struct btrfs_key_ptr *)ptr, - struct btrfs_key_ptr, key, disk_key); -} - -/* struct btrfs_item */ -BTRFS_SETGET_FUNCS(raw_item_offset, struct btrfs_item, offset, 32); -BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); -BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); -BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); - -static inline unsigned long btrfs_item_nr_offset(int nr) -{ - return offsetof(struct btrfs_leaf, items) + - sizeof(struct btrfs_item) * nr; -} - -static inline struct btrfs_item *btrfs_item_nr(int nr) -{ - return (struct btrfs_item *)btrfs_item_nr_offset(nr); -} - -#define BTRFS_ITEM_SETGET_FUNCS(member) \ -static inline u32 btrfs_item_##member(const struct extent_buffer *eb, \ - int slot) \ -{ \ - return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \ -} \ -static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ - int slot, u32 val) \ -{ \ - btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \ -} \ -static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ - int slot) \ -{ \ - struct btrfs_item *item = btrfs_item_nr(slot); \ - return btrfs_token_raw_item_##member(token, item); \ -} \ -static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ - int slot, u32 val) \ -{ \ - struct btrfs_item *item = btrfs_item_nr(slot); \ - btrfs_set_token_raw_item_##member(token, item, val); \ -} - -BTRFS_ITEM_SETGET_FUNCS(offset) -BTRFS_ITEM_SETGET_FUNCS(size); - -static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) -{ - return btrfs_item_offset(eb, nr) + btrfs_item_size(eb, nr); -} - -static inline void 
btrfs_item_key(const struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - struct btrfs_item *item = btrfs_item_nr(nr); - read_eb_member(eb, item, struct btrfs_item, key, disk_key); -} - -static inline void btrfs_set_item_key(struct extent_buffer *eb, - struct btrfs_disk_key *disk_key, int nr) -{ - struct btrfs_item *item = btrfs_item_nr(nr); - write_eb_member(eb, item, struct btrfs_item, key, disk_key); -} - -BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); - -/* - * struct btrfs_root_ref - */ -BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); -BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); -BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); - -/* struct btrfs_dir_item */ -BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); -BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); -BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); -BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); -BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, - data_len, 16); -BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, - name_len, 16); -BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, - transid, 64); - -static inline void btrfs_dir_item_key(const struct extent_buffer *eb, - const struct btrfs_dir_item *item, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, item, struct btrfs_dir_item, location, key); -} - -static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, - struct btrfs_dir_item *item, - const struct btrfs_disk_key *key) -{ - write_eb_member(eb, item, struct btrfs_dir_item, location, key); -} - -BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header, - num_entries, 64); -BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header, - num_bitmaps, 64); -BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header, - generation, 64); - -static inline void btrfs_free_space_key(const struct extent_buffer *eb, - const struct btrfs_free_space_header *h, - struct btrfs_disk_key *key) -{ - read_eb_member(eb, h, struct btrfs_free_space_header, location, key); -} - -static inline void btrfs_set_free_space_key(struct extent_buffer *eb, - struct btrfs_free_space_header *h, - const struct btrfs_disk_key *key) -{ - write_eb_member(eb, h, struct btrfs_free_space_header, location, key); -} - -/* struct btrfs_disk_key */ -BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, - objectid, 64); -BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); -BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); - -#ifdef __LITTLE_ENDIAN - -/* - * Optimized helpers for little-endian architectures where CPU and on-disk - * structures have the same endianness and we can skip conversions. 
- */ - -static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu_key, - const struct btrfs_disk_key *disk_key) -{ - memcpy(cpu_key, disk_key, sizeof(struct btrfs_key)); -} - -static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk_key, - const struct btrfs_key *cpu_key) -{ - memcpy(disk_key, cpu_key, sizeof(struct btrfs_key)); -} - -static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *cpu_key, int nr) -{ - struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; - - btrfs_node_key(eb, disk_key, nr); -} - -static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *cpu_key, int nr) -{ - struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; - - btrfs_item_key(eb, disk_key, nr); -} - -static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, - const struct btrfs_dir_item *item, - struct btrfs_key *cpu_key) -{ - struct btrfs_disk_key *disk_key = (struct btrfs_disk_key *)cpu_key; - - btrfs_dir_item_key(eb, item, disk_key); -} - -#else - -static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, - const struct btrfs_disk_key *disk) -{ - cpu->offset = le64_to_cpu(disk->offset); - cpu->type = disk->type; - cpu->objectid = le64_to_cpu(disk->objectid); -} - -static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, - const struct btrfs_key *cpu) -{ - disk->offset = cpu_to_le64(cpu->offset); - disk->type = cpu->type; - disk->objectid = cpu_to_le64(cpu->objectid); -} - -static inline void btrfs_node_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *key, int nr) -{ - struct btrfs_disk_key disk_key; - btrfs_node_key(eb, &disk_key, nr); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -static inline void btrfs_item_key_to_cpu(const struct extent_buffer *eb, - struct btrfs_key *key, int nr) -{ - struct btrfs_disk_key disk_key; - btrfs_item_key(eb, &disk_key, nr); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -static inline void btrfs_dir_item_key_to_cpu(const struct extent_buffer *eb, - const struct btrfs_dir_item *item, - struct btrfs_key *key) -{ - struct btrfs_disk_key disk_key; - btrfs_dir_item_key(eb, item, &disk_key); - btrfs_disk_key_to_cpu(key, &disk_key); -} - -#endif - -/* struct btrfs_header */ -BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); -BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, - generation, 64); -BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); -BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); -BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); -BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); -BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64); -BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header, - nritems, 32); -BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64); - -static inline int btrfs_header_flag(const struct extent_buffer *eb, u64 flag) -{ - return (btrfs_header_flags(eb) & flag) == flag; -} - -static inline void btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) -{ - u64 flags = btrfs_header_flags(eb); - btrfs_set_header_flags(eb, flags | flag); -} - -static inline void btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) -{ - u64 flags = btrfs_header_flags(eb); - btrfs_set_header_flags(eb, flags & 
~flag); -} - -static inline int btrfs_header_backref_rev(const struct extent_buffer *eb) -{ - u64 flags = btrfs_header_flags(eb); - return flags >> BTRFS_BACKREF_REV_SHIFT; -} - -static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb, - int rev) -{ - u64 flags = btrfs_header_flags(eb); - flags &= ~BTRFS_BACKREF_REV_MASK; - flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT; - btrfs_set_header_flags(eb, flags); -} - -static inline int btrfs_is_leaf(const struct extent_buffer *eb) -{ - return btrfs_header_level(eb) == 0; -} - -/* struct btrfs_root_item */ -BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, - generation, 64); -BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); -BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); -BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); - -BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(root_drop_level, struct btrfs_root_item, drop_level, 8); -BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); -BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); -BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); -BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); -BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); -BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, - last_snapshot, 64); -BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item, - generation_v2, 64); -BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item, - ctransid, 64); -BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, - otransid, 64); -BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, - stransid, 64); -BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, - rtransid, 64); - -static inline bool btrfs_root_readonly(const struct btrfs_root *root) -{ - /* Byte-swap the constant at compile time, root_item::flags is LE */ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; -} - -static inline bool btrfs_root_dead(const struct btrfs_root *root) -{ - /* Byte-swap the constant at compile time, root_item::flags is LE */ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; -} - -static inline u64 btrfs_root_id(const struct btrfs_root *root) -{ - return root->root_key.objectid; -} - -/* struct btrfs_root_backup */ -BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, - tree_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup, - tree_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup, - tree_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup, - chunk_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup, - chunk_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup, - chunk_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup, - extent_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup, - extent_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup, - extent_root_level, 8); - 
-BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup, - fs_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup, - fs_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup, - fs_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup, - dev_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup, - dev_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup, - dev_root_level, 8); - -BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup, - csum_root, 64); -BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup, - csum_root_gen, 64); -BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup, - csum_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, - num_devices, 64); - -/* struct btrfs_balance_item */ -BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); - -static inline void btrfs_balance_data(const struct extent_buffer *eb, - const struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, data, ba); -} - -static inline void btrfs_set_balance_data(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - const struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, data, ba); -} - -static inline void btrfs_balance_meta(const struct extent_buffer *eb, - const struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); -} - -static inline void btrfs_set_balance_meta(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - const struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba); -} - -static inline void btrfs_balance_sys(const struct extent_buffer *eb, - const struct btrfs_balance_item *bi, - struct btrfs_disk_balance_args *ba) -{ - read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); -} - -static inline void btrfs_set_balance_sys(struct extent_buffer *eb, - struct btrfs_balance_item *bi, - const struct btrfs_disk_balance_args *ba) -{ - write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba); -} - -static inline void -btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu, - const struct btrfs_disk_balance_args *disk) -{ - memset(cpu, 0, sizeof(*cpu)); - - cpu->profiles = le64_to_cpu(disk->profiles); - cpu->usage = le64_to_cpu(disk->usage); - cpu->devid = le64_to_cpu(disk->devid); - cpu->pstart = le64_to_cpu(disk->pstart); - cpu->pend = le64_to_cpu(disk->pend); - cpu->vstart = le64_to_cpu(disk->vstart); - cpu->vend = le64_to_cpu(disk->vend); - cpu->target = le64_to_cpu(disk->target); - cpu->flags = le64_to_cpu(disk->flags); - cpu->limit = le64_to_cpu(disk->limit); - cpu->stripes_min = le32_to_cpu(disk->stripes_min); - cpu->stripes_max = le32_to_cpu(disk->stripes_max); -} - -static inline void -btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk, - const struct btrfs_balance_args *cpu) -{ - memset(disk, 0, sizeof(*disk)); - - disk->profiles = cpu_to_le64(cpu->profiles); - disk->usage = cpu_to_le64(cpu->usage); - disk->devid = cpu_to_le64(cpu->devid); - disk->pstart = 
cpu_to_le64(cpu->pstart); - disk->pend = cpu_to_le64(cpu->pend); - disk->vstart = cpu_to_le64(cpu->vstart); - disk->vend = cpu_to_le64(cpu->vend); - disk->target = cpu_to_le64(cpu->target); - disk->flags = cpu_to_le64(cpu->flags); - disk->limit = cpu_to_le64(cpu->limit); - disk->stripes_min = cpu_to_le32(cpu->stripes_min); - disk->stripes_max = cpu_to_le32(cpu->stripes_max); -} - -/* struct btrfs_super_block */ -BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, - generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); -BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, - struct btrfs_super_block, sys_chunk_array_size, 32); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, - struct btrfs_super_block, chunk_root_generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, - root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, - chunk_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, - chunk_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, - log_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, - log_root_level, 8); -BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, - total_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, - bytes_used, 64); -BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, - sectorsize, 32); -BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, - nodesize, 32); -BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, - stripesize, 32); -BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, - root_dir_objectid, 64); -BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, - num_devices, 64); -BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, - compat_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, - compat_ro_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, - incompat_flags, 64); -BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, - csum_type, 16); -BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, - cache_generation, 64); -BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); -BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, - uuid_tree_generation, 64); - -int btrfs_super_csum_size(const struct btrfs_super_block *s); -const char *btrfs_super_csum_name(u16 csum_type); -const char *btrfs_super_csum_driver(u16 csum_type); -size_t __attribute_const__ btrfs_get_num_csums(void); - - -/* - * The leaf data grows from end-to-front in the node. 
- * this returns the address of the start of the last item, - * which is the stop of the leaf data stack - */ -static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) -{ - u32 nr = btrfs_header_nritems(leaf); - - if (nr == 0) - return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); - return btrfs_item_offset(leaf, nr - 1); -} - -/* struct btrfs_file_extent_item */ -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, - type, 8); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr, - struct btrfs_file_extent_item, disk_bytenr, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset, - struct btrfs_file_extent_item, offset, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation, - struct btrfs_file_extent_item, generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes, - struct btrfs_file_extent_item, num_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_ram_bytes, - struct btrfs_file_extent_item, ram_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, - struct btrfs_file_extent_item, disk_num_bytes, 64); -BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, - struct btrfs_file_extent_item, compression, 8); - -static inline unsigned long -btrfs_file_extent_inline_start(const struct btrfs_file_extent_item *e) -{ - return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - -static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) -{ - return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; -} - -BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); -BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, - disk_bytenr, 64); -BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, - generation, 64); -BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, - disk_num_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, - offset, 64); -BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, - num_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, - ram_bytes, 64); -BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, - compression, 8); -BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, - encryption, 8); -BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, - other_encoding, 16); - -/* - * this returns the number of bytes used by the item on disk, minus the - * size of any extent headers. 
If a file is compressed on disk, this is - * the compressed size - */ -static inline u32 btrfs_file_extent_inline_item_len( - const struct extent_buffer *eb, - int nr) -{ - return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - -/* btrfs_qgroup_status_item */ -BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, - generation, 64); -BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item, - version, 64); -BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item, - flags, 64); -BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item, - rescan, 64); - -/* btrfs_qgroup_info_item */ -BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item, - generation, 64); -BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64); -BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item, - rfer_cmpr, 64); -BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64); -BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item, - excl_cmpr, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation, - struct btrfs_qgroup_info_item, generation, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item, - rfer, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr, - struct btrfs_qgroup_info_item, rfer_cmpr, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item, - excl, 64); -BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr, - struct btrfs_qgroup_info_item, excl_cmpr, 64); - -/* btrfs_qgroup_limit_item */ -BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item, - flags, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item, - max_rfer, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item, - max_excl, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, - rsv_rfer, 64); -BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, - rsv_excl, 64); - -/* btrfs_dev_replace_item */ -BTRFS_SETGET_FUNCS(dev_replace_src_devid, - struct btrfs_dev_replace_item, src_devid, 64); -BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode, - struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode, - 64); -BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item, - replace_state, 64); -BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item, - time_started, 64); -BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item, - time_stopped, 64); -BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item, - num_write_errors, 64); -BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors, - struct btrfs_dev_replace_item, num_uncorrectable_read_errors, - 64); -BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item, - cursor_left, 64); -BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item, - cursor_right, 64); - -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid, - struct btrfs_dev_replace_item, src_devid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode, - struct btrfs_dev_replace_item, - cont_reading_from_srcdev_mode, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state, - struct btrfs_dev_replace_item, replace_state, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started, - struct btrfs_dev_replace_item, 
time_started, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped, - struct btrfs_dev_replace_item, time_stopped, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors, - struct btrfs_dev_replace_item, num_write_errors, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors, - struct btrfs_dev_replace_item, - num_uncorrectable_read_errors, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left, - struct btrfs_dev_replace_item, cursor_left, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right, - struct btrfs_dev_replace_item, cursor_right, 64); - -/* btrfs_verity_descriptor_item */ -BTRFS_SETGET_FUNCS(verity_descriptor_encryption, struct btrfs_verity_descriptor_item, - encryption, 8); -BTRFS_SETGET_FUNCS(verity_descriptor_size, struct btrfs_verity_descriptor_item, - size, 64); -BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_encryption, - struct btrfs_verity_descriptor_item, encryption, 8); -BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, - struct btrfs_verity_descriptor_item, size, 64); - -/* helper function to cast into the data area of the leaf. */ -#define btrfs_item_ptr(leaf, slot, type) \ - ((type *)(BTRFS_LEAF_DATA_OFFSET + \ - btrfs_item_offset(leaf, slot))) - -#define btrfs_item_ptr_offset(leaf, slot) \ - ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + \ - btrfs_item_offset(leaf, slot))) - static inline u32 btrfs_crc32c(u32 crc, const void *address, unsigned length) { return crc32c(crc, address, length); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 2f68570fbb53..012c96de4701 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -17,6 +17,7 @@ #include "locking.h" #include "inode-item.h" #include "space-info.h" +#include "accessors.h" #define BTRFS_DELAYED_WRITEBACK 512 #define BTRFS_DELAYED_BACKGROUND 128 diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 348aef453e69..94f8975034ce 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -24,6 +24,7 @@ #include "zoned.h" #include "block-group.h" #include "fs.h" +#include "accessors.h" /* * Device replace overview diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index be5c1c2a8da5..9fa37f245c43 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -7,6 +7,7 @@ #include "ctree.h" #include "disk-io.h" #include "transaction.h" +#include "accessors.h" /* * insert a name into a directory, doing overflow properly if there is a hash diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2c38b4eebd8f..75475a213972 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -44,6 +44,7 @@ #include "zoned.h" #include "subpage.h" #include "fs.h" +#include "accessors.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index a51a5dfa737c..b6bc9684648f 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -7,6 +7,7 @@ #include "btrfs_inode.h" #include "print-tree.h" #include "export.h" +#include "accessors.h" #define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ parent_objectid) / 4) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bc010dbcb6b1..8d28ba360e9a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -37,6 +37,7 @@ #include "zoned.h" #include "dev-replace.h" #include "fs.h" +#include "accessors.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index aa2d60f683bb..4b47bb8c590f 100644 --- a/fs/btrfs/extent_io.c 
+++ b/fs/btrfs/extent_io.c @@ -31,6 +31,7 @@ #include "block-group.h" #include "compression.h" #include "fs.h" +#include "accessors.h" static struct kmem_cache *extent_buffer_cache; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 675987e2d652..bce6c8d31bc0 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -18,6 +18,7 @@ #include "print-tree.h" #include "compression.h" #include "fs.h" +#include "accessors.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a352c7cacc99..62360a554420 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -31,6 +31,7 @@ #include "reflink.h" #include "subpage.h" #include "fs.h" +#include "accessors.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 703902156f97..26ff1a5100b9 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -26,6 +26,7 @@ #include "discard.h" #include "subpage.h" #include "inode-item.h" +#include "accessors.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 026214d74a02..fc79d21e7b4f 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -13,6 +13,7 @@ #include "transaction.h" #include "block-group.h" #include "fs.h" +#include "accessors.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index a59504b59435..5553e1f8afe8 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -3,6 +3,7 @@ #include "messages.h" #include "ctree.h" #include "fs.h" +#include "accessors.h" void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name) diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 25e9f1d65067..b8dbabfa8b31 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -11,6 +11,7 @@ #include "transaction.h" #include "print-tree.h" #include "space-info.h" +#include "accessors.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, const char *name, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 564a4ae9c285..e33be4032fff 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -51,6 +51,7 @@ #include "block-group.h" #include "subpage.h" #include "fs.h" +#include "accessors.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 0eab3cb274a1..870528d87526 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -12,6 +12,7 @@ #include "ctree.h" #include "extent_io.h" #include "locking.h" +#include "accessors.h" /* * Lockdep class keys for extent_buffer->lock's in this root. 
For a given diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 708facaede2c..aab7d30eed55 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -7,6 +7,7 @@ #include "ctree.h" #include "disk-io.h" #include "print-tree.h" +#include "accessors.h" struct root_name_map { u64 id; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 6d88bf0a9b17..9ad15d69718c 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -13,6 +13,7 @@ #include "compression.h" #include "space-info.h" #include "fs.h" +#include "accessors.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index e87a2f066f4d..39f67ca4b4ca 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -25,6 +25,7 @@ #include "sysfs.h" #include "tree-mod-log.h" #include "fs.h" +#include "accessors.h" /* * Helpers to access qgroup reservation diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 9b09dc50ba14..95d28497de7c 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -12,6 +12,7 @@ #include "delayed-ref.h" #include "ref-verify.h" #include "fs.h" +#include "accessors.h" /* * Used to keep track the roots and number of refs each root has for a given diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index daf65bfad30e..f0243eb33df7 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -11,6 +11,7 @@ #include "reflink.h" #include "transaction.h" #include "subpage.h" +#include "accessors.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 977afbb4cd45..3d0a63465a74 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -29,6 +29,7 @@ #include "inode-item.h" #include "space-info.h" #include "fs.h" +#include "accessors.h" /* * Relocation overview diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 112b4bf3c3b8..fc00dfb281d5 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -13,6 +13,7 @@ #include "print-tree.h" #include "qgroup.h" #include "space-info.h" +#include "accessors.h" /* * Read a root item from the tree. In case we detect a root item smaller then diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6871c1c54f8c..5c6c4aa2fb09 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -22,6 +22,7 @@ #include "block-group.h" #include "zoned.h" #include "fs.h" +#include "accessors.h" /* * This is only the first step towards a full-features scrub. 
It reads all diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 80104b5af32f..31fb3292e816 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -27,6 +27,7 @@ #include "compression.h" #include "xattr.h" #include "print-tree.h" +#include "accessors.h" /* * Maximum number of references an extent can have in order for us to attempt to diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index af1538c2685d..94d73485454a 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -11,6 +11,7 @@ #include "block-group.h" #include "zoned.h" #include "fs.h" +#include "accessors.h" /* * HOW DOES SPACE RESERVATION WORK diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0839e4a1cfac..c02fcdb9c7c4 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -51,6 +51,7 @@ #include "qgroup.h" #include "raid56.h" #include "fs.h" +#include "accessors.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 47b221372dcf..a14812f1254a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -23,6 +23,7 @@ #include "qgroup.h" #include "misc.h" #include "fs.h" +#include "accessors.h" /* * Structure name Path diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index b7d181a08eab..5ef0b90e25c3 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -8,6 +8,7 @@ #include "../ctree.h" #include "../extent_io.h" #include "../disk-io.h" +#include "../accessors.h" static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) { diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c index 13734ed43bfc..53a17b1d1744 100644 --- a/fs/btrfs/tests/free-space-tree-tests.c +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -10,6 +10,7 @@ #include "../free-space-tree.h" #include "../transaction.h" #include "../block-group.h" +#include "../accessors.h" struct free_space_extent { u64 start; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 625f7d398368..0a34a54ea9fd 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -11,6 +11,7 @@ #include "../extent_io.h" #include "../volumes.h" #include "../compression.h" +#include "../accessors.h" static void insert_extent(struct btrfs_root *root, u64 start, u64 len, u64 ram_bytes, u64 offset, u64 disk_bytenr, diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index fd0e0ade8154..65b65d55d1f6 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -11,6 +11,7 @@ #include "../qgroup.h" #include "../backref.h" #include "../fs.h" +#include "../accessors.h" static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 37d0baaa41d8..25e6b504edb4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -24,6 +24,7 @@ #include "space-info.h" #include "zoned.h" #include "fs.h" +#include "accessors.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index fa9536110d69..11cafc520b47 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -27,6 +27,7 @@ #include "misc.h" #include "btrfs_inode.h" #include "fs.h" +#include "accessors.h" /* * Error message should follow the following format: diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index b6cf39f4e7e4..5df604846de6 100644 --- a/fs/btrfs/tree-defrag.c +++ 
b/fs/btrfs/tree-defrag.c @@ -9,6 +9,7 @@ #include "print-tree.h" #include "transaction.h" #include "locking.h" +#include "accessors.h" /* * Defrag all the leaves in a given btree. diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 69083889a9b8..3bea19698a10 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -4,6 +4,7 @@ #include "tree-mod-log.h" #include "disk-io.h" #include "fs.h" +#include "accessors.h" struct tree_mod_root { u64 logical; diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 190f752a2e10..70304b89f31f 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -11,7 +11,7 @@ #include "disk-io.h" #include "print-tree.h" #include "fs.h" - +#include "accessors.h" static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) { diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 35445855df4d..d02fa354fc2b 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -17,6 +17,7 @@ #include "disk-io.h" #include "locking.h" #include "fs.h" +#include "accessors.h" /* * Implementation of the interface defined in struct fsverity_operations. diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d65d5d7835fe..767c56be6b96 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -34,6 +34,7 @@ #include "discard.h" #include "zoned.h" #include "fs.h" +#include "accessors.h" static struct bio_set btrfs_bioset; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index b26c869f0226..fcf2d5f7e198 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -21,6 +21,7 @@ #include "disk-io.h" #include "props.h" #include "locking.h" +#include "accessors.h" int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 2a7d856c232c..fe66cb929af9 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -16,6 +16,7 @@ #include "dev-replace.h" #include "space-info.h" #include "fs.h" +#include "accessors.h" /* Maximum number of zones to report per blkdev_report_zones() call */ #define BTRFS_REPORT_NR_ZONES 4096 From e9c83077d2be815cb04f0b054e9494512b1657ee Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 19 Oct 2022 10:51:01 -0400 Subject: [PATCH 076/249] btrfs: remove temporary btrfs_map_token declaration in ctree.h This was added while I was moving this code to its new home, it can be removed now. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 20034c23abb2..3dcfb62af68a 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -52,8 +52,6 @@ struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; -struct btrfs_map_token; - #define BTRFS_OLDEST_GENERATION 0ULL #define BTRFS_EMPTY_DIR_SIZE 0 From d68194b238228ce470de1fc2453851fc06294739 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 14 Oct 2022 15:45:37 +0200 Subject: [PATCH 077/249] btrfs: sink gfp_t parameter to btrfs_backref_iter_alloc There's only one caller that passes GFP_NOFS, we can drop the parameter and use the flags directly.
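
As a sketch of what this means at the sole call site (fs_info below stands in for the caller's rc->extent_root->fs_info; the actual diff follows):

	/* Before: the caller had to spell out the allocation flags. */
	iter = btrfs_backref_iter_alloc(fs_info, GFP_NOFS);

	/* After: GFP_NOFS is applied inside the helper itself. */
	iter = btrfs_backref_iter_alloc(fs_info);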
Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/backref.c | 5 ++--- fs/btrfs/backref.h | 3 +-- fs/btrfs/relocation.c | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 41caa6fc6301..f556566b9c79 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2650,12 +2650,11 @@ void free_ipath(struct inode_fs_paths *ipath) kfree(ipath); } -struct btrfs_backref_iter *btrfs_backref_iter_alloc( - struct btrfs_fs_info *fs_info, gfp_t gfp_flag) +struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info) { struct btrfs_backref_iter *ret; - ret = kzalloc(sizeof(*ret), gfp_flag); + ret = kzalloc(sizeof(*ret), GFP_NOFS); if (!ret) return NULL; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index fa3c9cbf9ae7..2dd68f87dca5 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -156,8 +156,7 @@ struct btrfs_backref_iter { u32 end_ptr; }; -struct btrfs_backref_iter *btrfs_backref_iter_alloc( - struct btrfs_fs_info *fs_info, gfp_t gfp_flag); +struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info); static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter) { diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 3d0a63465a74..77d03dce2521 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -473,7 +473,7 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree( int ret; int err = 0; - iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS); + iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info); if (!iter) return ERR_PTR(-ENOMEM); path = btrfs_alloc_path(); From e2896e791001b778e3bf8466dd8c6104245c08a2 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 14 Oct 2022 15:55:09 +0200 Subject: [PATCH 078/249] btrfs: sink gfp_t parameter to btrfs_qgroup_trace_extent All callers pass GFP_NOFS, we can drop the parameter and use it directly. Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 17 +++++++---------- fs/btrfs/qgroup.h | 2 +- fs/btrfs/tree-log.c | 3 +-- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 39f67ca4b4ca..4786b59035a8 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1842,7 +1842,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, } int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, - u64 num_bytes, gfp_t gfp_flag) + u64 num_bytes) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_qgroup_extent_record *record; @@ -1852,7 +1852,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) || bytenr == 0 || num_bytes == 0) return 0; - record = kzalloc(sizeof(*record), gfp_flag); + record = kzalloc(sizeof(*record), GFP_NOFS); if (!record) return -ENOMEM; @@ -1904,8 +1904,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes, - GFP_NOFS); + ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes); if (ret) return ret; } @@ -2104,12 +2103,11 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans, * blocks for qgroup accounting. 
*/ ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start, - nodesize, GFP_NOFS); + nodesize); if (ret < 0) goto out; - ret = btrfs_qgroup_trace_extent(trans, - dst_path->nodes[dst_level]->start, - nodesize, GFP_NOFS); + ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start, + nodesize); if (ret < 0) goto out; @@ -2393,8 +2391,7 @@ walk_down: path->locks[level] = BTRFS_READ_LOCK; ret = btrfs_qgroup_trace_extent(trans, child_bytenr, - fs_info->nodesize, - GFP_NOFS); + fs_info->nodesize); if (ret) goto out; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 3fb5459c9309..7bffa10589d6 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -321,7 +321,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, * (NULL trans) */ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr, - u64 num_bytes, gfp_t gfp_flag); + u64 num_bytes); /* * Inform qgroup to trace all leaf items of data diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index dc49f0aae1fb..8f377aca3409 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -749,8 +749,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, */ ret = btrfs_qgroup_trace_extent(trans, btrfs_file_extent_disk_bytenr(eb, item), - btrfs_file_extent_disk_num_bytes(eb, item), - GFP_NOFS); + btrfs_file_extent_disk_num_bytes(eb, item)); if (ret < 0) goto out; From fe10158c759cc978d0def69b5cb5efabecb42653 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 14 Oct 2022 15:59:35 +0200 Subject: [PATCH 079/249] btrfs: switch GFP_NOFS to GFP_KERNEL in scrub_setup_recheck_block There's only one caller that calls scrub_setup_recheck_block in the memalloc_nofs_save/_restore protection so it's effectively already GFP_NOFS and it's safe to use GFP_KERNEL. Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 5c6c4aa2fb09..6e70cc348ae2 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1493,7 +1493,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, return -EIO; } - recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); + recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL); if (!recover) { btrfs_put_bioc(bioc); btrfs_bio_counter_dec(fs_info); @@ -1516,7 +1516,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, sblock = sblocks_for_recheck[mirror_index]; sblock->sctx = sctx; - sector = alloc_scrub_sector(sblock, logical, GFP_NOFS); + sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; From 02bc392798f9bf8105d50b2fec44b5b2e3c80d47 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 14 Oct 2022 16:00:55 +0200 Subject: [PATCH 080/249] btrfs: sink gfp_t parameter to alloc_scrub_sector All callers pass GFP_KERNEL as parameter so we can use it directly in alloc_scrub_sector. Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6e70cc348ae2..3aed760e1a80 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -297,7 +297,7 @@ static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx, * Will also allocate new pages for @sblock if needed. 
*/ static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, - u64 logical, gfp_t gfp) + u64 logical) { const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT; struct scrub_sector *ssector; @@ -305,7 +305,7 @@ static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, /* We must never have scrub_block exceed U32_MAX in size. */ ASSERT(logical - sblock->logical < U32_MAX); - ssector = kzalloc(sizeof(*ssector), gfp); + ssector = kzalloc(sizeof(*ssector), GFP_KERNEL); if (!ssector) return NULL; @@ -313,7 +313,7 @@ static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock, if (!sblock->pages[page_index]) { int ret; - sblock->pages[page_index] = alloc_page(gfp); + sblock->pages[page_index] = alloc_page(GFP_KERNEL); if (!sblock->pages[page_index]) { kfree(ssector); return NULL; @@ -1516,7 +1516,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, sblock = sblocks_for_recheck[mirror_index]; sblock->sctx = sctx; - sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); + sector = alloc_scrub_sector(sblock, logical); if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2438,7 +2438,7 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, */ u32 l = min(sectorsize, len); - sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); + sector = alloc_scrub_sector(sblock, logical); if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2769,7 +2769,7 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity, for (index = 0; len > 0; index++) { struct scrub_sector *sector; - sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL); + sector = alloc_scrub_sector(sblock, logical); if (!sector) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; From 82c0efd3cd5d4ecce028da75d29e3345535b3389 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 19 Oct 2022 21:51:42 +0530 Subject: [PATCH 081/249] btrfs: merge module cleanup sequence to one helper The module exit function exit_btrfs_fs() is duplicating a section of code in init_btrfs_fs(). Add a helper to remove the duplicated code. Due to the init/exit section requirements the function must be inline and not a plain static as it could cause section mismatch. Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/super.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c02fcdb9c7c4..75b1c096e4f1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2837,7 +2837,7 @@ static const struct init_sequence mod_init_seq[] = { static bool mod_init_result[ARRAY_SIZE(mod_init_seq)]; -static void __exit exit_btrfs_fs(void) +static __always_inline void btrfs_exit_btrfs_fs(void) { int i; @@ -2850,6 +2850,11 @@ static void __exit exit_btrfs_fs(void) } } +static void __exit exit_btrfs_fs(void) +{ + btrfs_exit_btrfs_fs(); +} + static int __init init_btrfs_fs(void) { int ret; @@ -2858,26 +2863,13 @@ static int __init init_btrfs_fs(void) for (i = 0; i < ARRAY_SIZE(mod_init_seq); i++) { ASSERT(!mod_init_result[i]); ret = mod_init_seq[i].init_func(); - if (ret < 0) - goto error; + if (ret < 0) { + btrfs_exit_btrfs_fs(); + return ret; + } mod_init_result[i] = true; } return 0; - -error: - /* - * If we call exit_btrfs_fs() it would cause section mismatch. - * As init_btrfs_fs() belongs to .init.text, while exit_btrfs_fs() - * belongs to .exit.text. 
- */ - for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) { - if (!mod_init_result[i]) - continue; - if (mod_init_seq[i].exit_func) - mod_init_seq[i].exit_func(); - mod_init_result[i] = false; - } - return ret; } late_initcall(init_btrfs_fs); From e43eec81c5167b655b72c781b0e75e62a05e415e Mon Sep 17 00:00:00 2001 From: Sweet Tea Dorminy Date: Thu, 20 Oct 2022 12:58:25 -0400 Subject: [PATCH 082/249] btrfs: use struct qstr instead of name and namelen pairs Many functions throughout btrfs take name buffer and name length arguments. Most of these functions at the highest level are usually called with these arguments extracted from a supplied dentry's name. But the entire name can be passed instead, making each function a little more elegant. Each function whose arguments are currently the name and length extracted from a dentry is herein converted to instead take a pointer to the name in the dentry. The couple of calls to these functions without a struct dentry are converted to create an appropriate qstr to pass in. Additionally, every function which is only called with a name/len extracted directly from a qstr is also converted. This change has a positive effect on stack consumption, as the frame of many functions is reduced, but it will mainly be used in the future for fscrypt related structures. Signed-off-by: Sweet Tea Dorminy Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 26 ++-- fs/btrfs/dir-item.c | 50 ++++---- fs/btrfs/inode-item.c | 73 ++++++----- fs/btrfs/inode-item.h | 20 ++- fs/btrfs/inode.c | 130 +++++++++----------- fs/btrfs/ioctl.c | 7 +- fs/btrfs/root-tree.c | 19 ++- fs/btrfs/send.c | 12 +- fs/btrfs/super.c | 3 +- fs/btrfs/transaction.c | 11 +- fs/btrfs/tree-log.c | 267 +++++++++++++++++++---------------------- fs/btrfs/tree-log.h | 4 +- 12 files changed, 287 insertions(+), 335 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3dcfb62af68a..9e8c9f9bc4fb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1516,11 +1516,11 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, /* root-item.c */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 sequence, const char *name, - int name_len); + u64 ref_id, u64 dirid, u64 sequence, + const struct qstr *name); int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 *sequence, const char *name, - int name_len); + u64 ref_id, u64 dirid, u64 *sequence, + const struct qstr *name); int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key); int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1549,25 +1549,23 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); /* dir-item.c */ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const char *name, int name_len); -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, - int name_len, struct btrfs_inode *dir, + const struct qstr *name); +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + const struct qstr *name, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index); struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - const char *name, int name_len, - int mod); + const struct qstr *name, int mod); struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 
index, const char *name, int name_len, - int mod); + u64 index, const struct qstr *name, int mod); struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, - const char *name, int name_len); + const struct qstr *name); int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -1648,10 +1646,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len); + const struct qstr *name); int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const char *name, int name_len, int add_backref, u64 index); + const struct qstr *name, int add_backref, u64 index); int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 9fa37f245c43..48a15af4ec57 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -105,8 +105,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, * to use for the second index (if one is created). * Will return 0 or -ENOMEM */ -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, - int name_len, struct btrfs_inode *dir, +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + const struct qstr *name, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index) { int ret = 0; @@ -122,7 +122,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, key.objectid = btrfs_ino(dir); key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); + key.offset = btrfs_name_hash(name->name, name->len); path = btrfs_alloc_path(); if (!path) @@ -130,9 +130,9 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, btrfs_cpu_key_to_disk(&disk_key, location); - data_size = sizeof(*dir_item) + name_len; + data_size = sizeof(*dir_item) + name->len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, - name, name_len); + name->name, name->len); if (IS_ERR(dir_item)) { ret = PTR_ERR(dir_item); if (ret == -EEXIST) @@ -144,11 +144,11 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name, btrfs_set_dir_item_key(leaf, dir_item, &disk_key); btrfs_set_dir_type(leaf, dir_item, type); btrfs_set_dir_data_len(leaf, dir_item, 0); - btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_name_len(leaf, dir_item, name->len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); name_ptr = (unsigned long)(dir_item + 1); - write_extent_buffer(leaf, name, name_ptr, name_len); + write_extent_buffer(leaf, name->name, name_ptr, name->len); btrfs_mark_buffer_dirty(leaf); second_insert: @@ -159,7 +159,7 @@ second_insert: } btrfs_release_path(path); - ret2 = btrfs_insert_delayed_dir_index(trans, name, name_len, dir, + ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir, &disk_key, type, index); out_free: btrfs_free_path(path); @@ -208,7 +208,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - const char *name, int name_len, + 
const struct qstr *name, int mod) { struct btrfs_key key; @@ -216,9 +216,10 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); + key.offset = btrfs_name_hash(name->name, name->len); - di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, + name->len, mod); if (IS_ERR(di) && PTR_ERR(di) == -ENOENT) return NULL; @@ -226,7 +227,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, } int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const char *name, int name_len) + const struct qstr *name) { int ret; struct btrfs_key key; @@ -242,9 +243,10 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, key.objectid = dir; key.type = BTRFS_DIR_ITEM_KEY; - key.offset = btrfs_name_hash(name, name_len); + key.offset = btrfs_name_hash(name->name, name->len); - di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0); + di = btrfs_lookup_match_dir(NULL, root, path, &key, name->name, + name->len, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); /* Nothing found, we're safe */ @@ -264,11 +266,8 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, goto out; } - /* - * see if there is room in the item to insert this - * name - */ - data_size = sizeof(*di) + name_len; + /* See if there is room in the item to insert this name. */ + data_size = sizeof(*di) + name->len; leaf = path->nodes[0]; slot = path->slots[0]; if (data_size + btrfs_item_size(leaf, slot) + @@ -305,8 +304,7 @@ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 index, const char *name, int name_len, - int mod) + u64 index, const struct qstr *name, int mod) { struct btrfs_dir_item *di; struct btrfs_key key; @@ -315,7 +313,8 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, key.type = BTRFS_DIR_INDEX_KEY; key.offset = index; - di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod); + di = btrfs_lookup_match_dir(trans, root, path, &key, name->name, + name->len, mod); if (di == ERR_PTR(-ENOENT)) return NULL; @@ -323,9 +322,8 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, } struct btrfs_dir_item * -btrfs_search_dir_index_item(struct btrfs_root *root, - struct btrfs_path *path, u64 dirid, - const char *name, int name_len) +btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, + u64 dirid, const struct qstr *name) { struct btrfs_dir_item *di; struct btrfs_key key; @@ -340,7 +338,7 @@ btrfs_search_dir_index_item(struct btrfs_root *root, break; di = btrfs_match_dir_item_name(root->fs_info, path, - name, name_len); + name->name, name->len); if (di) return di; } diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index b8dbabfa8b31..577e39cfc411 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -14,8 +14,8 @@ #include "accessors.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, - int slot, const char *name, - int name_len) + int slot, + const struct qstr *name) { struct btrfs_inode_ref *ref; unsigned long ptr; @@ -31,9 +31,10 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, len = btrfs_inode_ref_name_len(leaf, ref); name_ptr = (unsigned long)(ref + 1); cur_offset += len + sizeof(*ref); - if 
(len != name_len) + if (len != name->len) continue; - if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + if (memcmp_extent_buffer(leaf, name->name, name_ptr, + name->len) == 0) return ref; } return NULL; @@ -41,7 +42,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, - const char *name, int name_len) + const struct qstr *name) { struct btrfs_inode_extref *extref; unsigned long ptr; @@ -64,9 +65,10 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( name_ptr = (unsigned long)(&extref->name); ref_name_len = btrfs_inode_extref_name_len(leaf, extref); - if (ref_name_len == name_len && + if (ref_name_len == name->len && btrfs_inode_extref_parent(leaf, extref) == ref_objectid && - (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) + (memcmp_extent_buffer(leaf, name->name, name_ptr, + name->len) == 0)) return extref; cur_offset += ref_name_len + sizeof(*extref); @@ -79,7 +81,7 @@ struct btrfs_inode_extref * btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const char *name, int name_len, + const struct qstr *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow) { @@ -88,7 +90,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); if (ret < 0) @@ -96,13 +98,13 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, if (ret > 0) return NULL; return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, name, name_len); + ref_objectid, name); } static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct qstr *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { @@ -111,14 +113,14 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_inode_extref *extref; struct extent_buffer *leaf; int ret; - int del_len = name_len + sizeof(*extref); + int del_len = name->len + sizeof(*extref); unsigned long ptr; unsigned long item_start; u32 item_size; key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); path = btrfs_alloc_path(); if (!path) @@ -136,7 +138,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, * readonly. 
*/ extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, name, name_len); + ref_objectid, name); if (!extref) { btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL); ret = -EROFS; @@ -172,8 +174,7 @@ out: } int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, + struct btrfs_root *root, const struct qstr *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { struct btrfs_path *path; @@ -186,7 +187,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, u32 sub_item_len; int ret; int search_ext_refs = 0; - int del_len = name_len + sizeof(*ref); + int del_len = name->len + sizeof(*ref); key.objectid = inode_objectid; key.offset = ref_objectid; @@ -205,8 +206,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, goto out; } - ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name, - name_len); + ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name); if (!ref) { ret = -ENOENT; search_ext_refs = 1; @@ -223,7 +223,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, goto out; } ptr = (unsigned long)ref; - sub_item_len = name_len + sizeof(*ref); + sub_item_len = name->len + sizeof(*ref); item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, item_size - (ptr + sub_item_len - item_start)); @@ -237,7 +237,7 @@ out: * name in our ref array. Find and remove the extended * inode ref then. */ - return btrfs_del_inode_extref(trans, root, name, name_len, + return btrfs_del_inode_extref(trans, root, name, inode_objectid, ref_objectid, index); } @@ -251,12 +251,13 @@ out: */ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 index) + const struct qstr *name, + u64 inode_objectid, u64 ref_objectid, + u64 index) { struct btrfs_inode_extref *extref; int ret; - int ins_len = name_len + sizeof(*extref); + int ins_len = name->len + sizeof(*extref); unsigned long ptr; struct btrfs_path *path; struct btrfs_key key; @@ -264,7 +265,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, key.objectid = inode_objectid; key.type = BTRFS_INODE_EXTREF_KEY; - key.offset = btrfs_extref_hash(ref_objectid, name, name_len); + key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len); path = btrfs_alloc_path(); if (!path) @@ -276,7 +277,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, if (btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], ref_objectid, - name, name_len)) + name)) goto out; btrfs_extend_item(path, ins_len); @@ -290,12 +291,12 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len; extref = (struct btrfs_inode_extref *)ptr; - btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len); + btrfs_set_inode_extref_name_len(path->nodes[0], extref, name->len); btrfs_set_inode_extref_index(path->nodes[0], extref, index); btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid); ptr = (unsigned long)&extref->name; - write_extent_buffer(path->nodes[0], name, ptr, name_len); + write_extent_buffer(path->nodes[0], name->name, ptr, name->len); btrfs_mark_buffer_dirty(path->nodes[0]); out: @@ -305,8 +306,7 @@ out: /* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ int btrfs_insert_inode_ref(struct 
btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, + struct btrfs_root *root, const struct qstr *name, u64 inode_objectid, u64 ref_objectid, u64 index) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -315,7 +315,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_inode_ref *ref; unsigned long ptr; int ret; - int ins_len = name_len + sizeof(*ref); + int ins_len = name->len + sizeof(*ref); key.objectid = inode_objectid; key.offset = ref_objectid; @@ -331,7 +331,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ret == -EEXIST) { u32 old_size; ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len); + name); if (ref) goto out; @@ -340,7 +340,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, index); ptr = (unsigned long)(ref + 1); ret = 0; @@ -348,7 +348,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, if (ret == -EOVERFLOW) { if (btrfs_find_name_in_backref(path->nodes[0], path->slots[0], - name, name_len)) + name)) ret = -EEXIST; else ret = -EMLINK; @@ -357,11 +357,11 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, } else { ref = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, index); ptr = (unsigned long)(ref + 1); } - write_extent_buffer(path->nodes[0], name, ptr, name_len); + write_extent_buffer(path->nodes[0], name->name, ptr, name->len); btrfs_mark_buffer_dirty(path->nodes[0]); out: @@ -374,7 +374,6 @@ out: if (btrfs_super_incompat_flags(disk_super) & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) ret = btrfs_insert_inode_extref(trans, root, name, - name_len, inode_objectid, ref_objectid, index); } diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index a8fc16d0147f..3c657c670cfd 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -64,33 +64,31 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, + struct btrfs_root *root, const struct qstr *name, u64 inode_objectid, u64 ref_objectid, u64 index); int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - const char *name, int name_len, - u64 inode_objectid, u64 ref_objectid, u64 *index); + struct btrfs_root *root, const struct qstr *name, + u64 inode_objectid, u64 ref_objectid, u64 *index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); -int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root - *root, struct btrfs_path *path, +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *location, int mod); struct btrfs_inode_extref *btrfs_lookup_inode_extref( struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const char *name, int 
name_len, + const struct qstr *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, - int slot, const char *name, - int name_len); + int slot, + const struct qstr *name); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, - const char *name, int name_len); + const struct qstr *name); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cb2a6d7f6252..3162d8f0aaf8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3641,7 +3641,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->delayed_iput_lock); } -/** +/* * Wait for flushing all delayed iputs * * @fs_info: the filesystem @@ -4286,7 +4286,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len, + const struct qstr *name, struct btrfs_rename_ctx *rename_ctx) { struct btrfs_root *root = dir->root; @@ -4304,8 +4304,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, goto out; } - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - name, name_len, -1); + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto err; @@ -4333,12 +4332,11 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, } } - ret = btrfs_del_inode_ref(trans, root, name, name_len, ino, - dir_ino, &index); + ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index); if (ret) { btrfs_info(fs_info, "failed to delete reference to %.*s, inode %llu parent %llu", - name_len, name, ino, dir_ino); + name->len, name->name, ino, dir_ino); btrfs_abort_transaction(trans, ret); goto err; } @@ -4359,10 +4357,8 @@ skip_backref: * operations on the log tree, increasing latency for applications. 
*/ if (!rename_ctx) { - btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, - dir_ino); - btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, - index); + btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino); + btrfs_del_dir_entries_in_log(trans, root, name, dir, index); } /* @@ -4380,7 +4376,7 @@ err: if (ret) goto out; - btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); + btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); @@ -4393,10 +4389,11 @@ out: int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, int name_len) + const struct qstr *name) { int ret; - ret = __btrfs_unlink_inode(trans, dir, inode, name, name_len, NULL); + + ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL); if (!ret) { drop_nlink(&inode->vfs_inode); ret = btrfs_update_inode(trans, inode->root, inode); @@ -4440,9 +4437,8 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 0); - ret = btrfs_unlink_inode(trans, BTRFS_I(dir), - BTRFS_I(d_inode(dentry)), dentry->d_name.name, - dentry->d_name.len); + ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + &dentry->d_name); if (ret) goto out; @@ -4467,8 +4463,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; - const char *name = dentry->d_name.name; - int name_len = dentry->d_name.len; + const struct qstr *name = &dentry->d_name; u64 index; int ret; u64 objectid; @@ -4487,8 +4482,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, - name, name_len, -1); + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; @@ -4514,8 +4508,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, * call btrfs_del_root_ref, and it _shouldn't_ fail. 
*/ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { - di = btrfs_search_dir_index_item(root, path, dir_ino, - name, name_len); + di = btrfs_search_dir_index_item(root, path, dir_ino, name); if (IS_ERR_OR_NULL(di)) { if (!di) ret = -ENOENT; @@ -4532,7 +4525,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } else { ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, dir_ino, - &index, name, name_len); + &index, name); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -4545,7 +4538,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name->len * 2); inode_inc_iversion(dir); dir->i_mtime = current_time(dir); dir->i_ctime = dir->i_mtime; @@ -4567,6 +4560,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) struct btrfs_path *path; struct btrfs_dir_item *di; struct btrfs_key key; + struct qstr name = QSTR_INIT("default", 7); u64 dir_id; int ret; @@ -4577,7 +4571,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) /* Make sure this root isn't set as the default subvol */ dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, - dir_id, "default", 7, 0); + dir_id, &name, 0); if (di && !IS_ERR(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); if (key.objectid == root->root_key.objectid) { @@ -4844,9 +4838,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) last_unlink_trans = BTRFS_I(inode)->last_unlink_trans; /* now the directory is empty */ - err = btrfs_unlink_inode(trans, BTRFS_I(dir), - BTRFS_I(d_inode(dentry)), dentry->d_name.name, - dentry->d_name.len); + err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), + &dentry->d_name); if (!err) { btrfs_i_size_write(BTRFS_I(inode), 0); /* @@ -5538,8 +5531,7 @@ no_delete: static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + const struct qstr *name = &dentry->d_name; struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -5550,7 +5542,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, return -ENOMEM; di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), - name, namelen, 0); + name, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; @@ -5562,7 +5554,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", - __func__, name, btrfs_ino(BTRFS_I(dir)), + __func__, name->name, btrfs_ino(BTRFS_I(dir)), location->objectid, location->type, location->offset); } if (!ret) @@ -6321,8 +6313,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, { struct inode *dir = args->dir; struct inode *inode = args->inode; - const char *name = args->orphan ? NULL : args->dentry->d_name.name; - int name_len = args->orphan ? 0 : args->dentry->d_name.len; + const struct qstr *name = args->orphan ? 
NULL : &args->dentry->d_name; struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_root *root; struct btrfs_inode_item *inode_item; @@ -6423,7 +6414,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, sizes[1] = 2 + sizeof(*ref); } else { key[1].offset = btrfs_ino(BTRFS_I(dir)); - sizes[1] = name_len + sizeof(*ref); + sizes[1] = name->len + sizeof(*ref); } } @@ -6462,10 +6453,12 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_ref_index(path->nodes[0], ref, 0); write_extent_buffer(path->nodes[0], "..", ptr, 2); } else { - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, + name->len); btrfs_set_inode_ref_index(path->nodes[0], ref, BTRFS_I(inode)->dir_index); - write_extent_buffer(path->nodes[0], name, ptr, name_len); + write_extent_buffer(path->nodes[0], name->name, ptr, + name->len); } } @@ -6526,7 +6519,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, ret = btrfs_orphan_add(trans, BTRFS_I(inode)); } else { ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - name_len, 0, BTRFS_I(inode)->dir_index); + 0, BTRFS_I(inode)->dir_index); } if (ret) { btrfs_abort_transaction(trans, ret); @@ -6555,7 +6548,7 @@ out: */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const char *name, int name_len, int add_backref, u64 index) + const struct qstr *name, int add_backref, u64 index) { int ret = 0; struct btrfs_key key; @@ -6574,17 +6567,17 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) { ret = btrfs_add_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, - index, name, name_len); + index, name); } else if (add_backref) { - ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino, - parent_ino, index); + ret = btrfs_insert_inode_ref(trans, root, name, + ino, parent_ino, index); } /* Nothing to clean up yet */ if (ret) return ret; - ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key, + ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, btrfs_inode_type(&inode->vfs_inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; @@ -6594,7 +6587,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, } btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + - name_len * 2); + name->len * 2); inode_inc_iversion(&parent_inode->vfs_inode); /* * If we are replaying a log tree, we do not want to update the mtime @@ -6619,15 +6612,15 @@ fail_dir_item: int err; err = btrfs_del_root_ref(trans, key.objectid, root->root_key.objectid, parent_ino, - &local_index, name, name_len); + &local_index, name); if (err) btrfs_abort_transaction(trans, err); } else if (add_backref) { u64 local_index; int err; - err = btrfs_del_inode_ref(trans, root, name, name_len, - ino, parent_ino, &local_index); + err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, + &local_index); if (err) btrfs_abort_transaction(trans, err); } @@ -6747,7 +6740,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - dentry->d_name.name, dentry->d_name.len, 1, index); + &dentry->d_name, 1, index); if (err) { drop_inode = 1; @@ -9088,9 +9081,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. 
*/ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, - new_dentry->d_name.name, - new_dentry->d_name.len, + ret = btrfs_insert_inode_ref(trans, dest, &new_dentry->d_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), old_idx); @@ -9104,9 +9095,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, root, - old_dentry->d_name.name, - old_dentry->d_name.len, + ret = btrfs_insert_inode_ref(trans, root, &old_dentry->d_name, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); @@ -9142,8 +9131,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), - old_dentry->d_name.name, - old_dentry->d_name.len, + &old_dentry->d_name, &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); @@ -9159,8 +9147,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), - new_dentry->d_name.name, - new_dentry->d_name.len, + &new_dentry->d_name, &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); @@ -9171,16 +9158,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - new_dentry->d_name.name, - new_dentry->d_name.len, 0, old_idx); + &new_dentry->d_name, 0, old_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), - old_dentry->d_name.name, - old_dentry->d_name.len, 0, new_idx); + &old_dentry->d_name, 0, new_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -9281,8 +9266,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, /* check for collisions, even if the name isn't there */ ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, - new_dentry->d_name.name, - new_dentry->d_name.len); + &new_dentry->d_name); if (ret) { if (ret == -EEXIST) { @@ -9376,9 +9360,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, /* force full log commit if subvolume involved. 
*/ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, - new_dentry->d_name.name, - new_dentry->d_name.len, + ret = btrfs_insert_inode_ref(trans, dest, &new_dentry->d_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), index); if (ret) @@ -9402,10 +9384,8 @@ static int btrfs_rename(struct user_namespace *mnt_userns, ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), - BTRFS_I(d_inode(old_dentry)), - old_dentry->d_name.name, - old_dentry->d_name.len, - &rename_ctx); + BTRFS_I(d_inode(old_dentry)), + &old_dentry->d_name, &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9424,8 +9404,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), - new_dentry->d_name.name, - new_dentry->d_name.len); + &new_dentry->d_name); } if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, @@ -9437,8 +9416,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - new_dentry->d_name.name, - new_dentry->d_name.len, 0, index); + &new_dentry->d_name, 0, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e33be4032fff..fde55d0f61b4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -951,6 +951,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, struct inode *dir = d_inode(parent->dentry); struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct dentry *dentry; + struct qstr name_str = QSTR_INIT(name, namelen); int error; error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); @@ -971,8 +972,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, * check for them now when we can safely fail */ error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root, - dir->i_ino, name, - namelen); + dir->i_ino, &name_str); if (error) goto out_dput; @@ -3779,6 +3779,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_trans_handle *trans; struct btrfs_path *path = NULL; struct btrfs_disk_key disk_key; + struct qstr name = QSTR_INIT("default", 7); u64 objectid = 0; u64 dir_id; int ret; @@ -3822,7 +3823,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path, - dir_id, "default", 7, 1); + dir_id, &name, 1); if (IS_ERR_OR_NULL(di)) { btrfs_release_path(path); btrfs_end_transaction(trans); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index fc00dfb281d5..848a720747af 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -330,9 +330,8 @@ out: } int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 *sequence, const char *name, - int name_len) - + u64 ref_id, u64 dirid, u64 *sequence, + const struct qstr *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_path *path; @@ -359,8 +358,8 @@ again: struct btrfs_root_ref); ptr = (unsigned long)(ref + 1); if ((btrfs_root_ref_dirid(leaf, ref) != dirid) || - (btrfs_root_ref_name_len(leaf, ref) != name_len) || - memcmp_extent_buffer(leaf, name, ptr, name_len)) { + (btrfs_root_ref_name_len(leaf, ref) != name->len) || + memcmp_extent_buffer(leaf, name->name, ptr, name->len)) { ret = -ENOENT; goto out; } @@ 
-403,8 +402,8 @@ out: * Will return 0, -ENOMEM, or anything from the CoW path */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 sequence, const char *name, - int name_len) + u64 ref_id, u64 dirid, u64 sequence, + const struct qstr *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_key key; @@ -423,7 +422,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, key.offset = ref_id; again: ret = btrfs_insert_empty_item(trans, tree_root, path, &key, - sizeof(*ref) + name_len); + sizeof(*ref) + name->len); if (ret) { btrfs_abort_transaction(trans, ret); btrfs_free_path(path); @@ -434,9 +433,9 @@ again: ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); btrfs_set_root_ref_dirid(leaf, ref, dirid); btrfs_set_root_ref_sequence(leaf, ref, sequence); - btrfs_set_root_ref_name_len(leaf, ref, name_len); + btrfs_set_root_ref_name_len(leaf, ref, name->len); ptr = (unsigned long)(ref + 1); - write_extent_buffer(leaf, name, ptr, name_len); + write_extent_buffer(leaf, name->name, ptr, name->len); btrfs_mark_buffer_dirty(leaf); if (key.type == BTRFS_ROOT_BACKREF_KEY) { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 31fb3292e816..d0feeea2ba1c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1597,13 +1597,17 @@ static int gen_unique_name(struct send_ctx *sctx, return -ENOMEM; while (1) { + struct qstr tmp_name; + len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); ASSERT(len < sizeof(tmp)); + tmp_name.name = tmp; + tmp_name.len = strlen(tmp); di = btrfs_lookup_dir_item(NULL, sctx->send_root, path, BTRFS_FIRST_FREE_OBJECTID, - tmp, strlen(tmp), 0); + &tmp_name, 0); btrfs_release_path(path); if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -1623,7 +1627,7 @@ static int gen_unique_name(struct send_ctx *sctx, di = btrfs_lookup_dir_item(NULL, sctx->parent_root, path, BTRFS_FIRST_FREE_OBJECTID, - tmp, strlen(tmp), 0); + &tmp_name, 0); btrfs_release_path(path); if (IS_ERR(di)) { ret = PTR_ERR(di); @@ -1753,13 +1757,13 @@ static int lookup_dir_item_inode(struct btrfs_root *root, struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_path *path; + struct qstr name_str = QSTR_INIT(name, name_len); path = alloc_path_for_send(); if (!path) return -ENOMEM; - di = btrfs_lookup_dir_item(NULL, root, path, - dir, name, name_len, 0); + di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 75b1c096e4f1..13d33e5707de 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1423,6 +1423,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_key location; + struct qstr name = QSTR_INIT("default", 7); u64 dir_id; path = btrfs_alloc_path(); @@ -1435,7 +1436,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec * to mount. 
*/ dir_id = btrfs_super_root_dir(fs_info->super_copy); - di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); + di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0); if (IS_ERR(di)) { btrfs_free_path(path); return PTR_ERR(di); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 25e6b504edb4..7db9612f4d0e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1678,8 +1678,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, btrfs_ino(BTRFS_I(parent_inode)), - dentry->d_name.name, - dentry->d_name.len, 0); + &dentry->d_name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; goto dir_item_existed; @@ -1774,7 +1773,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_add_root_ref(trans, objectid, parent_root->root_key.objectid, btrfs_ino(BTRFS_I(parent_inode)), index, - dentry->d_name.name, dentry->d_name.len); + &dentry->d_name); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1806,9 +1805,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret < 0) goto fail; - ret = btrfs_insert_dir_item(trans, dentry->d_name.name, - dentry->d_name.len, BTRFS_I(parent_inode), - &key, BTRFS_FT_DIR, index); + ret = btrfs_insert_dir_item(trans, &dentry->d_name, + BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, + index); /* We have check then name at the beginning, so it is impossible. */ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8f377aca3409..b14ac7017f41 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -597,6 +597,21 @@ static int overwrite_item(struct btrfs_trans_handle *trans, return do_overwrite_item(trans, root, path, eb, slot, key); } +static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, + struct qstr *name) +{ + char *buf; + + buf = kmalloc(len, GFP_NOFS); + if (!buf) + return -ENOMEM; + + read_extent_buffer(eb, buf, (unsigned long)start, len); + name->name = buf; + name->len = len; + return 0; +} + /* * simple helper to read an inode off the disk from a given root * This can only be called for subvolume roots and not for the log @@ -902,12 +917,11 @@ out: static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const char *name, - int name_len) + const struct qstr *name) { int ret; - ret = btrfs_unlink_inode(trans, dir, inode, name, name_len); + ret = btrfs_unlink_inode(trans, dir, inode, name); if (ret) return ret; /* @@ -934,8 +948,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, { struct btrfs_root *root = dir->root; struct inode *inode; - char *name; - int name_len; + struct qstr name; struct extent_buffer *leaf; struct btrfs_key location; int ret; @@ -943,12 +956,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_dir_item_key_to_cpu(leaf, di, &location); - name_len = btrfs_dir_name_len(leaf, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) + ret = read_alloc_one_name(leaf, di + 1, btrfs_dir_name_len(leaf, di), &name); + if (ret) return -ENOMEM; - read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); btrfs_release_path(path); inode = read_one_inode(root, location.objectid); @@ -961,10 +972,9 @@ static 
noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), name, - name_len); + ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name); out: - kfree(name); + kfree(name.name); iput(inode); return ret; } @@ -979,14 +989,14 @@ out: static noinline int inode_in_dir(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, u64 objectid, u64 index, - const char *name, int name_len) + struct qstr *name) { struct btrfs_dir_item *di; struct btrfs_key location; int ret = 0; di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, - index, name, name_len, 0); + index, name, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); goto out; @@ -999,7 +1009,7 @@ static noinline int inode_in_dir(struct btrfs_root *root, } btrfs_release_path(path); - di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); + di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, 0); if (IS_ERR(di)) { ret = PTR_ERR(di); goto out; @@ -1026,7 +1036,7 @@ out: static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, u64 ref_objectid, - const char *name, int namelen) + const struct qstr *name) { struct btrfs_path *path; int ret; @@ -1046,12 +1056,10 @@ static noinline int backref_in_log(struct btrfs_root *log, if (key->type == BTRFS_INODE_EXTREF_KEY) ret = !!btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0], - ref_objectid, - name, namelen); + ref_objectid, name); else ret = !!btrfs_find_name_in_backref(path->nodes[0], - path->slots[0], - name, namelen); + path->slots[0], name); out: btrfs_free_path(path); return ret; @@ -1064,11 +1072,9 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, u64 inode_objectid, u64 parent_objectid, - u64 ref_index, char *name, int namelen) + u64 ref_index, struct qstr *name) { int ret; - char *victim_name; - int victim_name_len; struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key search_key; @@ -1100,43 +1106,40 @@ again: ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); while (ptr < ptr_end) { - victim_ref = (struct btrfs_inode_ref *)ptr; - victim_name_len = btrfs_inode_ref_name_len(leaf, - victim_ref); - victim_name = kmalloc(victim_name_len, GFP_NOFS); - if (!victim_name) - return -ENOMEM; + struct qstr victim_name; - read_extent_buffer(leaf, victim_name, - (unsigned long)(victim_ref + 1), - victim_name_len); + victim_ref = (struct btrfs_inode_ref *)ptr; + ret = read_alloc_one_name(leaf, (victim_ref + 1), + btrfs_inode_ref_name_len(leaf, victim_ref), + &victim_name); + if (ret) + return ret; ret = backref_in_log(log_root, &search_key, - parent_objectid, victim_name, - victim_name_len); + parent_objectid, &victim_name); if (ret < 0) { - kfree(victim_name); + kfree(victim_name.name); return ret; } else if (!ret) { inc_nlink(&inode->vfs_inode); btrfs_release_path(path); ret = unlink_inode_for_log_replay(trans, dir, inode, - victim_name, victim_name_len); - kfree(victim_name); + &victim_name); + kfree(victim_name.name); if (ret) return ret; goto again; } - kfree(victim_name); + kfree(victim_name.name); - ptr = (unsigned long)(victim_ref + 1) + victim_name_len; + ptr = (unsigned long)(victim_ref + 1) + victim_name.len; } } btrfs_release_path(path); /* Same search but for extended refs */ - extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen, + extref = btrfs_lookup_inode_extref(NULL, root, path, 
name, inode_objectid, parent_objectid, 0, 0); if (IS_ERR(extref)) { @@ -1153,29 +1156,28 @@ again: base = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { - extref = (struct btrfs_inode_extref *)(base + cur_offset); + struct qstr victim_name; - victim_name_len = btrfs_inode_extref_name_len(leaf, extref); + extref = (struct btrfs_inode_extref *)(base + cur_offset); if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid) goto next; - victim_name = kmalloc(victim_name_len, GFP_NOFS); - if (!victim_name) - return -ENOMEM; - read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name, - victim_name_len); + ret = read_alloc_one_name(leaf, &extref->name, + btrfs_inode_extref_name_len(leaf, extref), + &victim_name); + if (ret) + return ret; search_key.objectid = inode_objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; search_key.offset = btrfs_extref_hash(parent_objectid, - victim_name, - victim_name_len); + victim_name.name, + victim_name.len); ret = backref_in_log(log_root, &search_key, - parent_objectid, victim_name, - victim_name_len); + parent_objectid, &victim_name); if (ret < 0) { - kfree(victim_name); + kfree(victim_name.name); return ret; } else if (!ret) { ret = -ENOENT; @@ -1187,26 +1189,24 @@ again: ret = unlink_inode_for_log_replay(trans, BTRFS_I(victim_parent), - inode, - victim_name, - victim_name_len); + inode, &victim_name); } iput(victim_parent); - kfree(victim_name); + kfree(victim_name.name); if (ret) return ret; goto again; } - kfree(victim_name); + kfree(victim_name.name); next: - cur_offset += victim_name_len + sizeof(*extref); + cur_offset += victim_name.len + sizeof(*extref); } } btrfs_release_path(path); /* look for a conflicting sequence number */ di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir), - ref_index, name, namelen, 0); + ref_index, name, 0); if (IS_ERR(di)) { return PTR_ERR(di); } else if (di) { @@ -1217,8 +1217,7 @@ next: btrfs_release_path(path); /* look for a conflicting name */ - di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), - name, namelen, 0); + di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir), name, 0); if (IS_ERR(di)) { return PTR_ERR(di); } else if (di) { @@ -1232,20 +1231,18 @@ next: } static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index, + struct qstr *name, u64 *index, u64 *parent_objectid) { struct btrfs_inode_extref *extref; + int ret; extref = (struct btrfs_inode_extref *)ref_ptr; - *namelen = btrfs_inode_extref_name_len(eb, extref); - *name = kmalloc(*namelen, GFP_NOFS); - if (*name == NULL) - return -ENOMEM; - - read_extent_buffer(eb, *name, (unsigned long)&extref->name, - *namelen); + ret = read_alloc_one_name(eb, &extref->name, + btrfs_inode_extref_name_len(eb, extref), name); + if (ret) + return ret; if (index) *index = btrfs_inode_extref_index(eb, extref); @@ -1256,18 +1253,17 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, } static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - u32 *namelen, char **name, u64 *index) + struct qstr *name, u64 *index) { struct btrfs_inode_ref *ref; + int ret; ref = (struct btrfs_inode_ref *)ref_ptr; - *namelen = btrfs_inode_ref_name_len(eb, ref); - *name = kmalloc(*namelen, GFP_NOFS); - if (*name == NULL) - return -ENOMEM; - - read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen); + ret = read_alloc_one_name(eb, ref + 1, btrfs_inode_ref_name_len(eb, ref), + name); + if (ret) + return ret; 
if (index) *index = btrfs_inode_ref_index(eb, ref); @@ -1309,28 +1305,24 @@ again: ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); while (ref_ptr < ref_end) { - char *name = NULL; - int namelen; + struct qstr name; u64 parent_id; if (key->type == BTRFS_INODE_EXTREF_KEY) { - ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + ret = extref_get_fields(eb, ref_ptr, &name, NULL, &parent_id); } else { parent_id = key->offset; - ret = ref_get_fields(eb, ref_ptr, &namelen, &name, - NULL); + ret = ref_get_fields(eb, ref_ptr, &name, NULL); } if (ret) goto out; if (key->type == BTRFS_INODE_EXTREF_KEY) ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot, - parent_id, name, - namelen); + parent_id, &name); else - ret = !!btrfs_find_name_in_backref(log_eb, log_slot, - name, namelen); + ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); if (!ret) { struct inode *dir; @@ -1339,20 +1331,20 @@ again: dir = read_one_inode(root, parent_id); if (!dir) { ret = -ENOENT; - kfree(name); + kfree(name.name); goto out; } ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), - inode, name, namelen); - kfree(name); + inode, &name); + kfree(name.name); iput(dir); if (ret) goto out; goto again; } - kfree(name); - ref_ptr += namelen; + kfree(name.name); + ref_ptr += name.len; if (key->type == BTRFS_INODE_EXTREF_KEY) ref_ptr += sizeof(struct btrfs_inode_extref); else @@ -1381,8 +1373,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; - char *name = NULL; - int namelen; + struct qstr name; int ret; int log_ref_ver = 0; u64 parent_objectid; @@ -1426,7 +1417,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, while (ref_ptr < ref_end) { if (log_ref_ver) { - ret = extref_get_fields(eb, ref_ptr, &namelen, &name, + ret = extref_get_fields(eb, ref_ptr, &name, &ref_index, &parent_objectid); /* * parent object can change from one array @@ -1439,15 +1430,13 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, goto out; } } else { - ret = ref_get_fields(eb, ref_ptr, &namelen, &name, - &ref_index); + ret = ref_get_fields(eb, ref_ptr, &name, &ref_index); } if (ret) goto out; ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, - name, namelen); + btrfs_ino(BTRFS_I(inode)), ref_index, &name); if (ret < 0) { goto out; } else if (ret == 0) { @@ -1461,7 +1450,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = __add_inode_ref(trans, root, path, log, BTRFS_I(dir), BTRFS_I(inode), inode_objectid, parent_objectid, - ref_index, name, namelen); + ref_index, &name); if (ret) { if (ret == 1) ret = 0; @@ -1470,7 +1459,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, /* insert our name */ ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - name, namelen, 0, ref_index); + &name, 0, ref_index); if (ret) goto out; @@ -1480,9 +1469,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* Else, ret == 1, we already have a perfect match, we're done. 
*/ - ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; - kfree(name); - name = NULL; + ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + name.len; + kfree(name.name); + name.name = NULL; if (log_ref_ver) { iput(dir); dir = NULL; @@ -1506,7 +1495,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, ret = overwrite_item(trans, root, path, eb, slot, key); out: btrfs_release_path(path); - kfree(name); + kfree(name.name); iput(dir); iput(inode); return ret; @@ -1778,7 +1767,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 dirid, u64 index, - char *name, int name_len, + const struct qstr *name, struct btrfs_key *location) { struct inode *inode; @@ -1796,7 +1785,7 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, } ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - name_len, 1, index); + 1, index); /* FIXME, put inode into FIXUP list */ @@ -1856,8 +1845,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di, struct btrfs_key *key) { - char *name; - int name_len; + struct qstr name; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; bool dir_dst_matches = false; @@ -1875,17 +1863,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, if (!dir) return -EIO; - name_len = btrfs_dir_name_len(eb, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) { - ret = -ENOMEM; + ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); + if (ret) goto out; - } log_type = btrfs_dir_type(eb, di); - read_extent_buffer(eb, name, (unsigned long)(di + 1), - name_len); - btrfs_dir_item_key_to_cpu(eb, di, &log_key); ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); btrfs_release_path(path); @@ -1895,7 +1877,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = 0; dir_dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, - name, name_len, 1); + &name, 1); if (IS_ERR(dir_dst_di)) { ret = PTR_ERR(dir_dst_di); goto out; @@ -1912,7 +1894,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, index_dst_di = btrfs_lookup_dir_index_item(trans, root, path, key->objectid, key->offset, - name, name_len, 1); + &name, 1); if (IS_ERR(index_dst_di)) { ret = PTR_ERR(index_dst_di); goto out; @@ -1940,7 +1922,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_REF_KEY; search_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &search_key, 0, name, name_len); + ret = backref_in_log(root->log_root, &search_key, 0, &name); if (ret < 0) { goto out; } else if (ret) { @@ -1953,8 +1935,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, search_key.objectid = log_key.objectid; search_key.type = BTRFS_INODE_EXTREF_KEY; search_key.offset = key->objectid; - ret = backref_in_log(root->log_root, &search_key, key->objectid, name, - name_len); + ret = backref_in_log(root->log_root, &search_key, key->objectid, &name); if (ret < 0) { goto out; } else if (ret) { @@ -1965,7 +1946,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, } btrfs_release_path(path); ret = insert_one_name(trans, root, key->objectid, key->offset, - name, name_len, &log_key); + &name, &log_key); if (ret && ret != -ENOENT && ret != -EEXIST) goto out; if (!ret) @@ 
-1975,10 +1956,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: if (!ret && update_size) { - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2); ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); } - kfree(name); + kfree(name.name); iput(dir); if (!ret && name_added) ret = 1; @@ -2144,8 +2125,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; - int name_len; - char *name; + struct qstr name; struct inode *inode = NULL; struct btrfs_key location; @@ -2160,22 +2140,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, eb = path->nodes[0]; slot = path->slots[0]; di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); - name_len = btrfs_dir_name_len(eb, di); - name = kmalloc(name_len, GFP_NOFS); - if (!name) { - ret = -ENOMEM; + ret = read_alloc_one_name(eb, di + 1, btrfs_dir_name_len(eb, di), &name); + if (ret) goto out; - } - - read_extent_buffer(eb, name, (unsigned long)(di + 1), name_len); if (log) { struct btrfs_dir_item *log_di; log_di = btrfs_lookup_dir_index_item(trans, log, log_path, dir_key->objectid, - dir_key->offset, - name, name_len, 0); + dir_key->offset, &name, 0); if (IS_ERR(log_di)) { ret = PTR_ERR(log_di); goto out; @@ -2201,7 +2175,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, inc_nlink(inode); ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), - name, name_len); + &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -2210,7 +2184,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); btrfs_release_path(log_path); - kfree(name); + kfree(name.name); iput(inode); return ret; } @@ -3449,7 +3423,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, u64 dir_ino, - const char *name, int name_len, + const struct qstr *name, u64 index) { struct btrfs_dir_item *di; @@ -3459,7 +3433,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, * for dir item keys. 
*/ di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino, - index, name, name_len, -1); + index, name, -1); if (IS_ERR(di)) return PTR_ERR(di); else if (!di) @@ -3496,7 +3470,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, */ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct qstr *name, struct btrfs_inode *dir, u64 index) { struct btrfs_path *path; @@ -3523,7 +3497,7 @@ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, } ret = del_logged_dentry(trans, root->log_root, path, btrfs_ino(dir), - name, name_len, index); + name, index); btrfs_free_path(path); out_unlock: mutex_unlock(&dir->log_mutex); @@ -3535,7 +3509,7 @@ out_unlock: /* see comments for btrfs_del_dir_entries_in_log */ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct qstr *name, struct btrfs_inode *inode, u64 dirid) { struct btrfs_root *log; @@ -3556,7 +3530,7 @@ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, log = root->log_root; mutex_lock(&inode->log_mutex); - ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode), + ret = btrfs_del_inode_ref(trans, log, name, btrfs_ino(inode), dirid, &index); mutex_unlock(&inode->log_mutex); if (ret < 0 && ret != -ENOENT) @@ -5270,6 +5244,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, u32 this_len; unsigned long name_ptr; struct btrfs_dir_item *di; + struct qstr name_str; if (key->type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; @@ -5303,8 +5278,11 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, } read_extent_buffer(eb, name, name_ptr, this_name_len); + + name_str.name = name; + name_str.len = this_name_len; di = btrfs_lookup_dir_item(NULL, inode->root, search_path, - parent, name, this_name_len, 0); + parent, &name_str, 0); if (di && !IS_ERR(di)) { struct btrfs_key di_key; @@ -7505,8 +7483,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, */ mutex_lock(&old_dir->log_mutex); ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), - old_dentry->d_name.name, - old_dentry->d_name.len, old_dir_index); + &old_dentry->d_name, old_dir_index); if (ret > 0) { /* * The dentry does not exist in the log, so record its diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index f5770829d075..8c9a387151b0 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -87,11 +87,11 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx); void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct qstr *name, struct btrfs_inode *dir, u64 index); void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const char *name, int name_len, + const struct qstr *name, struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); From ab3c5c18e8fa3f8ea116016095d25adab466cd39 Mon Sep 17 00:00:00 2001 From: Sweet Tea Dorminy Date: Thu, 20 Oct 2022 12:58:26 -0400 Subject: [PATCH 083/249] btrfs: setup qstr from dentrys using fscrypt helper Most places where we get a struct qstr, we are doing so from a dentry. With fscrypt, the dentry's name may be encrypted on-disk, so fscrypt provides a helper to convert a dentry name to the appropriate disk name if necessary. 
Convert each of the dentry name accesses to use fscrypt_setup_filename(), then convert the resulting fscrypt_name back to an unencrypted qstr. This does not work for nokey names, but the specific locations that could spawn nokey names are noted. At present, since there are no encrypted directories, nothing goes down the filename encryption paths. Signed-off-by: Sweet Tea Dorminy Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 + fs/btrfs/inode.c | 192 +++++++++++++++++++++++++++++++---------- fs/btrfs/transaction.c | 40 ++++++--- fs/btrfs/tree-log.c | 11 ++- 4 files changed, 189 insertions(+), 57 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9e8c9f9bc4fb..b2016c14488b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -28,6 +28,7 @@ #include #include #include +#include #include "extent-io-tree.h" #include "extent_io.h" #include "extent_map.h" @@ -1674,6 +1675,8 @@ struct btrfs_new_inode_args { */ struct posix_acl *default_acl; struct posix_acl *acl; + struct fscrypt_name fname; + struct qstr name; }; int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, unsigned int *trans_num_items); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3162d8f0aaf8..fa4b101bc1eb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4429,28 +4429,41 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; struct inode *inode = d_inode(dentry); int ret; + struct fscrypt_name fname; + struct qstr name; + + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + return ret; + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + + /* This needs to handle no-key deletions later on */ trans = __unlink_start_trans(dir); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto fscrypt_free; + } btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), 0); ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - &dentry->d_name); + &name); if (ret) - goto out; + goto end_trans; if (inode->i_nlink == 0) { ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) - goto out; + goto end_trans; } -out: +end_trans: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info); +fscrypt_free: + fscrypt_free_filename(&fname); return ret; } @@ -4463,11 +4476,19 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; - const struct qstr *name = &dentry->d_name; + struct qstr name; u64 index; int ret; u64 objectid; u64 dir_ino = btrfs_ino(BTRFS_I(dir)); + struct fscrypt_name fname; + + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + return ret; + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + + /* This needs to handle no-key deletions later on */ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) { objectid = inode->root->root_key.objectid; @@ -4475,14 +4496,17 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, objectid = inode->location.objectid; } else { WARN_ON(1); + fscrypt_free_filename(&fname); return -EINVAL; } path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; + if (!path) { + ret = -ENOMEM; + goto out; + } - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1); + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, &name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? 
PTR_ERR(di) : -ENOENT; goto out; @@ -4508,7 +4532,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, * call btrfs_del_root_ref, and it _shouldn't_ fail. */ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { - di = btrfs_search_dir_index_item(root, path, dir_ino, name); + di = btrfs_search_dir_index_item(root, path, dir_ino, &name); if (IS_ERR_OR_NULL(di)) { if (!di) ret = -ENOENT; @@ -4525,7 +4549,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } else { ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, dir_ino, - &index, name); + &index, &name); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -4538,7 +4562,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name->len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name.len * 2); inode_inc_iversion(dir); dir->i_mtime = current_time(dir); dir->i_ctime = dir->i_mtime; @@ -4547,6 +4571,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); out: btrfs_free_path(path); + fscrypt_free_filename(&fname); return ret; } @@ -4810,6 +4835,8 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) int err = 0; struct btrfs_trans_handle *trans; u64 last_unlink_trans; + struct fscrypt_name fname; + struct qstr name; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; @@ -4822,9 +4849,18 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) return btrfs_delete_subvolume(dir, dentry); } + err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (err) + return err; + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + + /* This needs to handle no-key deletions later on */ + trans = __unlink_start_trans(dir); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_notrans; + } if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { err = btrfs_unlink_subvol(trans, dir, dentry); @@ -4839,7 +4875,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) /* now the directory is empty */ err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - &dentry->d_name); + &name); if (!err) { btrfs_i_size_write(BTRFS_I(inode), 0); /* @@ -4858,7 +4894,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) } out: btrfs_end_transaction(trans); +out_notrans: btrfs_btree_balance_dirty(fs_info); + fscrypt_free_filename(&fname); return err; } @@ -5531,18 +5569,27 @@ no_delete: static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { - const struct qstr *name = &dentry->d_name; + struct qstr name; struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; int ret = 0; + struct fscrypt_name fname; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + if (ret) + goto out; + + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + + /* This needs to handle no-key deletions later on */ + di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), - name, 0); + &name, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? 
PTR_ERR(di) : -ENOENT; goto out; @@ -5554,12 +5601,13 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", - __func__, name->name, btrfs_ino(BTRFS_I(dir)), + __func__, name.name, btrfs_ino(BTRFS_I(dir)), location->objectid, location->type, location->offset); } if (!ret) *type = btrfs_dir_type(path->nodes[0], di); out: + fscrypt_free_filename(&fname); btrfs_free_path(path); return ret; } @@ -5582,6 +5630,14 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, struct btrfs_key key; int ret; int err = 0; + struct fscrypt_name fname; + struct qstr name; + + ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + if (ret) + return ret; + + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); path = btrfs_alloc_path(); if (!path) { @@ -5604,12 +5660,11 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || - btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) + btrfs_root_ref_name_len(leaf, ref) != name.len) goto out; - ret = memcmp_extent_buffer(leaf, dentry->d_name.name, - (unsigned long)(ref + 1), - dentry->d_name.len); + ret = memcmp_extent_buffer(leaf, name.name, (unsigned long)(ref + 1), + name.len); if (ret) goto out; @@ -5628,6 +5683,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, err = 0; out: btrfs_free_path(path); + fscrypt_free_filename(&fname); return err; } @@ -6236,9 +6292,19 @@ int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, struct inode *inode = args->inode; int ret; + if (!args->orphan) { + ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0, + &args->fname); + if (ret) + return ret; + args->name = (struct qstr)FSTR_TO_QSTR(&args->fname.disk_name); + } + ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); - if (ret) + if (ret) { + fscrypt_free_filename(&args->fname); return ret; + } /* 1 to add inode item */ *trans_num_items = 1; @@ -6278,6 +6344,7 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) { posix_acl_release(args->acl); posix_acl_release(args->default_acl); + fscrypt_free_filename(&args->fname); } /* @@ -6703,6 +6770,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct inode *inode = d_inode(old_dentry); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct fscrypt_name fname; + struct qstr name; u64 index; int err; int drop_inode = 0; @@ -6714,6 +6783,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink >= BTRFS_LINK_MAX) return -EMLINK; + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + if (err) + goto fail; + + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + err = btrfs_set_inode_index(BTRFS_I(dir), &index); if (err) goto fail; @@ -6740,7 +6815,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - &dentry->d_name, 1, index); + &name, 1, index); if (err) { drop_inode = 1; @@ -6764,6 +6839,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, } fail: + fscrypt_free_filename(&fname); if (trans) 
btrfs_end_transaction(trans); if (drop_inode) { @@ -9003,6 +9079,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret; int ret2; bool need_abort = false; + struct fscrypt_name old_fname, new_fname; + struct qstr old_name, new_name; /* * For non-subvolumes allow exchange only within one subvolume, in the @@ -9014,6 +9092,19 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_ino != BTRFS_FIRST_FREE_OBJECTID)) return -EXDEV; + ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname); + if (ret) + return ret; + + ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); + if (ret) { + fscrypt_free_filename(&old_fname); + return ret; + } + + old_name = (struct qstr)FSTR_TO_QSTR(&old_fname.disk_name); + new_name = (struct qstr)FSTR_TO_QSTR(&new_fname.disk_name); + /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || new_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -9081,8 +9172,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, &new_dentry->d_name, - old_ino, + ret = btrfs_insert_inode_ref(trans, dest, &new_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), old_idx); if (ret) @@ -9095,8 +9185,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, root, &old_dentry->d_name, - new_ino, + ret = btrfs_insert_inode_ref(trans, root, &old_name, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); if (ret) { @@ -9131,8 +9220,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), - &old_dentry->d_name, - &old_rename_ctx); + &old_name, &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9147,8 +9235,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), - &new_dentry->d_name, - &new_rename_ctx); + &new_name, &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); } @@ -9158,14 +9245,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - &new_dentry->d_name, 0, old_idx); + &new_name, 0, old_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), - &old_dentry->d_name, 0, new_idx); + &old_name, 0, new_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -9208,6 +9295,8 @@ out_notrans: old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); + fscrypt_free_filename(&new_fname); + fscrypt_free_filename(&old_fname); return ret; } @@ -9247,6 +9336,8 @@ static int btrfs_rename(struct user_namespace *mnt_userns, int ret; int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); + struct fscrypt_name old_fname, new_fname; + struct qstr old_name, new_name; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9263,21 +9354,32 @@ static int btrfs_rename(struct user_namespace *mnt_userns, new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; + ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, 
&old_fname); + if (ret) + return ret; + + ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname); + if (ret) { + fscrypt_free_filename(&old_fname); + return ret; + } + + old_name = (struct qstr)FSTR_TO_QSTR(&old_fname.disk_name); + new_name = (struct qstr)FSTR_TO_QSTR(&new_fname.disk_name); /* check for collisions, even if the name isn't there */ - ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, - &new_dentry->d_name); + ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_name); if (ret) { if (ret == -EEXIST) { /* we shouldn't get * eexist without a new_inode */ if (WARN_ON(!new_inode)) { - return ret; + goto out_fscrypt_names; } } else { /* maybe -EOVERFLOW */ - return ret; + goto out_fscrypt_names; } } ret = 0; @@ -9360,8 +9462,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, &new_dentry->d_name, - old_ino, + ret = btrfs_insert_inode_ref(trans, dest, &new_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), index); if (ret) goto out_fail; @@ -9385,7 +9486,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), - &old_dentry->d_name, &rename_ctx); + &old_name, &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9404,7 +9505,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), - &new_dentry->d_name); + &new_name); } if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, @@ -9416,7 +9517,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - &new_dentry->d_name, 0, index); + &new_name, 0, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -9451,6 +9552,9 @@ out_notrans: out_whiteout_inode: if (flags & RENAME_WHITEOUT) iput(whiteout_args.inode); +out_fscrypt_names: + fscrypt_free_filename(&old_fname); + fscrypt_free_filename(&new_fname); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 7db9612f4d0e..3751b155e361 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -1611,10 +1612,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; - struct inode *parent_inode; + struct inode *parent_inode = pending->dir; struct btrfs_path *path; struct btrfs_dir_item *dir_item; - struct dentry *dentry; struct extent_buffer *tmp; struct extent_buffer *old; struct timespec64 cur_time; @@ -1623,6 +1623,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 index = 0; u64 objectid; u64 root_flags; + unsigned int nofs_flags; + struct fscrypt_name fname; + struct qstr name; ASSERT(pending->path); path = pending->path; @@ -1630,9 +1633,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ASSERT(pending->root_item); new_root_item = pending->root_item; + /* + * We're inside a transaction and must make sure that any potential + * allocations with GFP_KERNEL in fscrypt won't recurse back to + * filesystem. 
+ */ + nofs_flags = memalloc_nofs_save(); + pending->error = fscrypt_setup_filename(parent_inode, + &pending->dentry->d_name, 0, + &fname); + memalloc_nofs_restore(nofs_flags); + if (pending->error) + goto free_pending; + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); + pending->error = btrfs_get_free_objectid(tree_root, &objectid); if (pending->error) - goto no_free_objectid; + goto free_fname; /* * Make qgroup to skip current new snapshot's qgroupid, as it is @@ -1661,8 +1678,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 1); - dentry = pending->dentry; - parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; ret = record_root_in_trans(trans, parent_root, 0); if (ret) @@ -1678,7 +1693,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, btrfs_ino(BTRFS_I(parent_inode)), - &dentry->d_name, 0); + &name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; goto dir_item_existed; @@ -1773,7 +1788,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_add_root_ref(trans, objectid, parent_root->root_key.objectid, btrfs_ino(BTRFS_I(parent_inode)), index, - &dentry->d_name); + &name); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1805,9 +1820,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret < 0) goto fail; - ret = btrfs_insert_dir_item(trans, &dentry->d_name, - BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, - index); + ret = btrfs_insert_dir_item(trans, &name, BTRFS_I(parent_inode), &key, + BTRFS_FT_DIR, index); /* We have check then name at the beginning, so it is impossible. 
*/ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { @@ -1816,7 +1830,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + - dentry->d_name.len * 2); + name.len * 2); parent_inode->i_mtime = current_time(parent_inode); parent_inode->i_ctime = parent_inode->i_mtime; ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); @@ -1848,7 +1862,9 @@ dir_item_existed: trans->bytes_reserved = 0; clear_skip_qgroup: btrfs_clear_skip_qgroup(trans); -no_free_objectid: +free_fname: + fscrypt_free_filename(&fname); +free_pending: kfree(new_root_item); pending->root_item = NULL; btrfs_free_path(path); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b14ac7017f41..3f3d10a2dd0e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7446,9 +7446,16 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, if (old_dir && old_dir->logged_trans == trans->transid) { struct btrfs_root *log = old_dir->root->log_root; struct btrfs_path *path; + struct fscrypt_name fname; + struct qstr name; ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); + ret = fscrypt_setup_filename(&old_dir->vfs_inode, + &old_dentry->d_name, 0, &fname); + if (ret) + goto out; + name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); /* * We have two inodes to update in the log, the old directory and * the inode that got renamed, so we must pin the log to prevent @@ -7468,6 +7475,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + fscrypt_free_filename(&fname); goto out; } @@ -7483,7 +7491,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, */ mutex_lock(&old_dir->log_mutex); ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), - &old_dentry->d_name, old_dir_index); + &name, old_dir_index); if (ret > 0) { /* * The dentry does not exist in the log, so record its @@ -7497,6 +7505,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, mutex_unlock(&old_dir->log_mutex); btrfs_free_path(path); + fscrypt_free_filename(&fname); if (ret < 0) goto out; } From 6db75318823a169e836a478ca57d6a7c0a156b77 Mon Sep 17 00:00:00 2001 From: Sweet Tea Dorminy Date: Thu, 20 Oct 2022 12:58:27 -0400 Subject: [PATCH 084/249] btrfs: use struct fscrypt_str instead of struct qstr While struct qstr is more natural without fscrypt, since it's provided by dentries, struct fscrypt_str is provided by the fscrypt handlers processing dentries, and is thus more natural in the fscrypt world. Replace all of the struct qstr uses with struct fscrypt_str. 
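Taken together with the previous patch, the calling convention settles into: set up the name once, hand fname.disk_name (a struct fscrypt_str) to the tree operations, and free it when done. A minimal sketch of that pattern, modeled on the btrfs_unlink_subvol() hunks below; the helper name is invented for illustration and error handling is trimmed to the essentials:

	/* Illustrative only, not part of the diffs in this patch. */
	static int demo_lookup_by_dentry(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path,
					 struct inode *dir,
					 struct dentry *dentry)
	{
		struct fscrypt_name fname;
		struct btrfs_dir_item *di;
		int ret;

		/*
		 * Resolve the dentry name to the on-disk name. Without
		 * encryption, fname.disk_name simply points at the plaintext
		 * dentry name; with encryption it would hold the ciphertext.
		 */
		ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
		if (ret)
			return ret;

		/* Tree operations consume the fscrypt_str directly. */
		di = btrfs_lookup_dir_item(trans, root, path,
					   btrfs_ino(BTRFS_I(dir)),
					   &fname.disk_name, 0);
		if (IS_ERR_OR_NULL(di))
			ret = di ? PTR_ERR(di) : -ENOENT;

		/* The on-disk name is freed once the operation completes. */
		fscrypt_free_filename(&fname);
		return ret;
	}

One caller needs extra care: create_pending_snapshot() runs inside a transaction, so its fscrypt_setup_filename() call is wrapped in memalloc_nofs_save()/memalloc_nofs_restore() to keep fscrypt's GFP_KERNEL allocations from recursing back into the filesystem.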
Signed-off-by: Sweet Tea Dorminy Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 19 +++++---- fs/btrfs/dir-item.c | 10 ++--- fs/btrfs/inode-item.c | 14 +++---- fs/btrfs/inode-item.h | 10 ++--- fs/btrfs/inode.c | 87 +++++++++++++++++------------------------- fs/btrfs/ioctl.c | 4 +- fs/btrfs/root-tree.c | 4 +- fs/btrfs/send.c | 4 +- fs/btrfs/super.c | 2 +- fs/btrfs/transaction.c | 13 +++---- fs/btrfs/tree-log.c | 42 ++++++++++---------- fs/btrfs/tree-log.h | 4 +- 12 files changed, 95 insertions(+), 118 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b2016c14488b..7c710893f4c6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1518,10 +1518,10 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, /* root-item.c */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, - const struct qstr *name); + const struct fscrypt_str *name); int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, - const struct qstr *name); + const struct fscrypt_str *name); int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key); int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -1550,23 +1550,23 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); /* dir-item.c */ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const struct qstr *name); + const struct fscrypt_str *name); int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, - const struct qstr *name, struct btrfs_inode *dir, + const struct fscrypt_str *name, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index); struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - const struct qstr *name, int mod); + const struct fscrypt_str *name, int mod); struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 index, const struct qstr *name, int mod); + u64 index, const struct fscrypt_str *name, int mod); struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, - const struct qstr *name); + const struct fscrypt_str *name); int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, @@ -1647,10 +1647,10 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const struct qstr *name); + const struct fscrypt_str *name); int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const struct qstr *name, int add_backref, u64 index); + const struct fscrypt_str *name, int add_backref, u64 index); int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front); @@ -1676,7 +1676,6 @@ struct btrfs_new_inode_args { struct posix_acl *default_acl; struct posix_acl *acl; struct fscrypt_name fname; - struct qstr name; }; int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, unsigned int *trans_num_items); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 
48a15af4ec57..23777021b331 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -106,7 +106,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, * Will return 0 or -ENOMEM */ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, - const struct qstr *name, struct btrfs_inode *dir, + const struct fscrypt_str *name, struct btrfs_inode *dir, struct btrfs_key *location, u8 type, u64 index) { int ret = 0; @@ -208,7 +208,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir( struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - const struct qstr *name, + const struct fscrypt_str *name, int mod) { struct btrfs_key key; @@ -227,7 +227,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, } int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const struct qstr *name) + const struct fscrypt_str *name) { int ret; struct btrfs_key key; @@ -304,7 +304,7 @@ struct btrfs_dir_item * btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, - u64 index, const struct qstr *name, int mod) + u64 index, const struct fscrypt_str *name, int mod) { struct btrfs_dir_item *di; struct btrfs_key key; @@ -323,7 +323,7 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, struct btrfs_dir_item * btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, - u64 dirid, const struct qstr *name) + u64 dirid, const struct fscrypt_str *name) { struct btrfs_dir_item *di; struct btrfs_key key; diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 577e39cfc411..d66bdbae5585 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -15,7 +15,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, - const struct qstr *name) + const struct fscrypt_str *name) { struct btrfs_inode_ref *ref; unsigned long ptr; @@ -42,7 +42,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, - const struct qstr *name) + const struct fscrypt_str *name) { struct btrfs_inode_extref *extref; unsigned long ptr; @@ -81,7 +81,7 @@ struct btrfs_inode_extref * btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const struct qstr *name, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow) { @@ -104,7 +104,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans, static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct qstr *name, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { @@ -174,7 +174,7 @@ out: } int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct qstr *name, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index) { struct btrfs_path *path; @@ -251,7 +251,7 @@ out: */ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct qstr *name, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index) { @@ -306,7 +306,7 @@ out: /* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ int btrfs_insert_inode_ref(struct 
btrfs_trans_handle *trans, - struct btrfs_root *root, const struct qstr *name, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index) { struct btrfs_fs_info *fs_info = root->fs_info; diff --git a/fs/btrfs/inode-item.h b/fs/btrfs/inode-item.h index 3c657c670cfd..b80aeb715701 100644 --- a/fs/btrfs/inode-item.h +++ b/fs/btrfs/inode-item.h @@ -64,10 +64,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_truncate_control *control); int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct qstr *name, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 index); int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const struct qstr *name, + struct btrfs_root *root, const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, u64 *index); int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -80,15 +80,15 @@ struct btrfs_inode_extref *btrfs_lookup_inode_extref( struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - const struct qstr *name, + const struct fscrypt_str *name, u64 inode_objectid, u64 ref_objectid, int ins_len, int cow); struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, - const struct qstr *name); + const struct fscrypt_str *name); struct btrfs_inode_extref *btrfs_find_name_in_ext_backref( struct extent_buffer *leaf, int slot, u64 ref_objectid, - const struct qstr *name); + const struct fscrypt_str *name); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fa4b101bc1eb..7e76d5e91786 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4286,7 +4286,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_rename_ctx *rename_ctx) { struct btrfs_root *root = dir->root; @@ -4389,7 +4389,7 @@ out: int btrfs_unlink_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const struct qstr *name) + const struct fscrypt_str *name) { int ret; @@ -4430,12 +4430,10 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) struct inode *inode = d_inode(dentry); int ret; struct fscrypt_name fname; - struct qstr name; ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); if (ret) return ret; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); /* This needs to handle no-key deletions later on */ @@ -4449,7 +4447,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 0); ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - &name); + &fname.disk_name); if (ret) goto end_trans; @@ -4476,7 +4474,6 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; - struct qstr name; u64 index; int ret; u64 objectid; @@ -4486,7 +4483,6 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); if (ret) return ret; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); /* This needs to handle no-key deletions later on */ @@ -4506,7 +4502,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, 
goto out; } - di = btrfs_lookup_dir_item(trans, root, path, dir_ino, &name, -1); + di = btrfs_lookup_dir_item(trans, root, path, dir_ino, + &fname.disk_name, -1); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; goto out; @@ -4532,7 +4529,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, * call btrfs_del_root_ref, and it _shouldn't_ fail. */ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) { - di = btrfs_search_dir_index_item(root, path, dir_ino, &name); + di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name); if (IS_ERR_OR_NULL(di)) { if (!di) ret = -ENOENT; @@ -4549,7 +4546,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } else { ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid, dir_ino, - &index, &name); + &index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -4562,7 +4559,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name.len * 2); + btrfs_i_size_write(BTRFS_I(dir), dir->i_size - fname.disk_name.len * 2); inode_inc_iversion(dir); dir->i_mtime = current_time(dir); dir->i_ctime = dir->i_mtime; @@ -4585,7 +4582,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) struct btrfs_path *path; struct btrfs_dir_item *di; struct btrfs_key key; - struct qstr name = QSTR_INIT("default", 7); + struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; int ret; @@ -4836,7 +4833,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) struct btrfs_trans_handle *trans; u64 last_unlink_trans; struct fscrypt_name fname; - struct qstr name; if (inode->i_size > BTRFS_EMPTY_DIR_SIZE) return -ENOTEMPTY; @@ -4852,7 +4848,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); if (err) return err; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); /* This needs to handle no-key deletions later on */ @@ -4875,7 +4870,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) /* now the directory is empty */ err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)), - &name); + &fname.disk_name); if (!err) { btrfs_i_size_write(BTRFS_I(inode), 0); /* @@ -5569,7 +5564,6 @@ no_delete: static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { - struct qstr name; struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_root *root = BTRFS_I(dir)->root; @@ -5584,12 +5578,10 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, if (ret) goto out; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); - /* This needs to handle no-key deletions later on */ di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), - &name, 0); + &fname.disk_name, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? 
PTR_ERR(di) : -ENOENT; goto out; @@ -5601,7 +5593,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", - __func__, name.name, btrfs_ino(BTRFS_I(dir)), + __func__, fname.disk_name.name, btrfs_ino(BTRFS_I(dir)), location->objectid, location->type, location->offset); } if (!ret) @@ -5631,14 +5623,11 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, int ret; int err = 0; struct fscrypt_name fname; - struct qstr name; ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); if (ret) return ret; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); - path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; @@ -5660,11 +5649,11 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || - btrfs_root_ref_name_len(leaf, ref) != name.len) + btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len) goto out; - ret = memcmp_extent_buffer(leaf, name.name, (unsigned long)(ref + 1), - name.len); + ret = memcmp_extent_buffer(leaf, fname.disk_name.name, + (unsigned long)(ref + 1), fname.disk_name.len); if (ret) goto out; @@ -6297,7 +6286,6 @@ int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, &args->fname); if (ret) return ret; - args->name = (struct qstr)FSTR_TO_QSTR(&args->fname.disk_name); } ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); @@ -6380,7 +6368,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, { struct inode *dir = args->dir; struct inode *inode = args->inode; - const struct qstr *name = args->orphan ? NULL : &args->dentry->d_name; + const struct fscrypt_str *name = args->orphan ? 
NULL : &args->fname.disk_name; struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_root *root; struct btrfs_inode_item *inode_item; @@ -6615,7 +6603,7 @@ out: */ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const struct qstr *name, int add_backref, u64 index) + const struct fscrypt_str *name, int add_backref, u64 index) { int ret = 0; struct btrfs_key key; @@ -6771,7 +6759,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct inode *inode = d_inode(old_dentry); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct fscrypt_name fname; - struct qstr name; u64 index; int err; int drop_inode = 0; @@ -6787,8 +6774,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (err) goto fail; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); - err = btrfs_set_inode_index(BTRFS_I(dir), &index); if (err) goto fail; @@ -6815,7 +6800,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - &name, 1, index); + &fname.disk_name, 1, index); if (err) { drop_inode = 1; @@ -9080,7 +9065,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret2; bool need_abort = false; struct fscrypt_name old_fname, new_fname; - struct qstr old_name, new_name; + struct fscrypt_str *old_name, *new_name; /* * For non-subvolumes allow exchange only within one subvolume, in the @@ -9102,8 +9087,8 @@ static int btrfs_rename_exchange(struct inode *old_dir, return ret; } - old_name = (struct qstr)FSTR_TO_QSTR(&old_fname.disk_name); - new_name = (struct qstr)FSTR_TO_QSTR(&new_fname.disk_name); + old_name = &old_fname.disk_name; + new_name = &new_fname.disk_name; /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || @@ -9172,7 +9157,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. */ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, &new_name, old_ino, + ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino, btrfs_ino(BTRFS_I(new_dir)), old_idx); if (ret) @@ -9185,7 +9170,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* force full log commit if subvolume involved. 
*/ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, root, &old_name, new_ino, + ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); if (ret) { @@ -9220,7 +9205,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), - &old_name, &old_rename_ctx); + old_name, &old_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9235,7 +9220,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), - &new_name, &new_rename_ctx); + new_name, &new_rename_ctx); if (!ret) ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode)); } @@ -9245,14 +9230,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - &new_name, 0, old_idx); + new_name, 0, old_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), - &old_name, 0, new_idx); + old_name, 0, new_idx); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -9337,7 +9322,6 @@ static int btrfs_rename(struct user_namespace *mnt_userns, int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); struct fscrypt_name old_fname, new_fname; - struct qstr old_name, new_name; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9364,12 +9348,8 @@ static int btrfs_rename(struct user_namespace *mnt_userns, return ret; } - old_name = (struct qstr)FSTR_TO_QSTR(&old_fname.disk_name); - new_name = (struct qstr)FSTR_TO_QSTR(&new_fname.disk_name); - /* check for collisions, even if the name isn't there */ - ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_name); - + ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name); if (ret) { if (ret == -EEXIST) { /* we shouldn't get @@ -9462,8 +9442,9 @@ static int btrfs_rename(struct user_namespace *mnt_userns, /* force full log commit if subvolume involved. 
*/ btrfs_set_log_full_commit(trans); } else { - ret = btrfs_insert_inode_ref(trans, dest, &new_name, old_ino, - btrfs_ino(BTRFS_I(new_dir)), index); + ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name, + old_ino, btrfs_ino(BTRFS_I(new_dir)), + index); if (ret) goto out_fail; } @@ -9486,7 +9467,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), - &old_name, &rename_ctx); + &old_fname.disk_name, &rename_ctx); if (!ret) ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode)); } @@ -9505,7 +9486,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(d_inode(new_dentry)), - &new_name); + &new_fname.disk_name); } if (!ret && new_inode->i_nlink == 0) ret = btrfs_orphan_add(trans, @@ -9517,7 +9498,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - &new_name, 0, index); + &new_fname.disk_name, 0, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fde55d0f61b4..2132e3e15986 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -951,7 +951,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, struct inode *dir = d_inode(parent->dentry); struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct dentry *dentry; - struct qstr name_str = QSTR_INIT(name, namelen); + struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen); int error; error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); @@ -3779,7 +3779,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) struct btrfs_trans_handle *trans; struct btrfs_path *path = NULL; struct btrfs_disk_key disk_key; - struct qstr name = QSTR_INIT("default", 7); + struct fscrypt_str name = FSTR_INIT("default", 7); u64 objectid = 0; u64 dir_id; int ret; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 848a720747af..6aab98114253 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -331,7 +331,7 @@ out: int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 *sequence, - const struct qstr *name) + const struct fscrypt_str *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_path *path; @@ -403,7 +403,7 @@ out: */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, - const struct qstr *name) + const struct fscrypt_str *name) { struct btrfs_root *tree_root = trans->fs_info->tree_root; struct btrfs_key key; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d0feeea2ba1c..0ebca9dba5ab 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1597,7 +1597,7 @@ static int gen_unique_name(struct send_ctx *sctx, return -ENOMEM; while (1) { - struct qstr tmp_name; + struct fscrypt_str tmp_name; len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu", ino, gen, idx); @@ -1757,7 +1757,7 @@ static int lookup_dir_item_inode(struct btrfs_root *root, struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_path *path; - struct qstr name_str = QSTR_INIT(name, name_len); + struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len); path = alloc_path_for_send(); if (!path) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 13d33e5707de..33bf211699f8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1423,7 +1423,7 @@ 
static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec struct btrfs_dir_item *di; struct btrfs_path *path; struct btrfs_key location; - struct qstr name = QSTR_INIT("default", 7); + struct fscrypt_str name = FSTR_INIT("default", 7); u64 dir_id; path = btrfs_alloc_path(); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 3751b155e361..0d44d50dc3be 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1625,7 +1625,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 root_flags; unsigned int nofs_flags; struct fscrypt_name fname; - struct qstr name; ASSERT(pending->path); path = pending->path; @@ -1645,7 +1644,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, memalloc_nofs_restore(nofs_flags); if (pending->error) goto free_pending; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); pending->error = btrfs_get_free_objectid(tree_root, &objectid); if (pending->error) @@ -1693,7 +1691,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, btrfs_ino(BTRFS_I(parent_inode)), - &name, 0); + &fname.disk_name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; goto dir_item_existed; @@ -1788,7 +1786,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, ret = btrfs_add_root_ref(trans, objectid, parent_root->root_key.objectid, btrfs_ino(BTRFS_I(parent_inode)), index, - &name); + &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1820,8 +1818,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (ret < 0) goto fail; - ret = btrfs_insert_dir_item(trans, &name, BTRFS_I(parent_inode), &key, - BTRFS_FT_DIR, index); + ret = btrfs_insert_dir_item(trans, &fname.disk_name, + BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, + index); /* We have check then name at the beginning, so it is impossible. 
*/ BUG_ON(ret == -EEXIST || ret == -EOVERFLOW); if (ret) { @@ -1830,7 +1829,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + - name.len * 2); + fname.disk_name.len * 2); parent_inode->i_mtime = current_time(parent_inode); parent_inode->i_ctime = parent_inode->i_mtime; ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 3f3d10a2dd0e..7002cc3315da 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -598,7 +598,7 @@ static int overwrite_item(struct btrfs_trans_handle *trans, } static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, - struct qstr *name) + struct fscrypt_str *name) { char *buf; @@ -917,7 +917,7 @@ out: static int unlink_inode_for_log_replay(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, - const struct qstr *name) + const struct fscrypt_str *name) { int ret; @@ -948,7 +948,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, { struct btrfs_root *root = dir->root; struct inode *inode; - struct qstr name; + struct fscrypt_str name; struct extent_buffer *leaf; struct btrfs_key location; int ret; @@ -989,7 +989,7 @@ out: static noinline int inode_in_dir(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, u64 objectid, u64 index, - struct qstr *name) + struct fscrypt_str *name) { struct btrfs_dir_item *di; struct btrfs_key location; @@ -1036,7 +1036,7 @@ out: static noinline int backref_in_log(struct btrfs_root *log, struct btrfs_key *key, u64 ref_objectid, - const struct qstr *name) + const struct fscrypt_str *name) { struct btrfs_path *path; int ret; @@ -1072,7 +1072,7 @@ static inline int __add_inode_ref(struct btrfs_trans_handle *trans, struct btrfs_inode *dir, struct btrfs_inode *inode, u64 inode_objectid, u64 parent_objectid, - u64 ref_index, struct qstr *name) + u64 ref_index, struct fscrypt_str *name) { int ret; struct extent_buffer *leaf; @@ -1106,7 +1106,7 @@ again: ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); ptr_end = ptr + btrfs_item_size(leaf, path->slots[0]); while (ptr < ptr_end) { - struct qstr victim_name; + struct fscrypt_str victim_name; victim_ref = (struct btrfs_inode_ref *)ptr; ret = read_alloc_one_name(leaf, (victim_ref + 1), @@ -1156,7 +1156,7 @@ again: base = btrfs_item_ptr_offset(leaf, path->slots[0]); while (cur_offset < item_size) { - struct qstr victim_name; + struct fscrypt_str victim_name; extref = (struct btrfs_inode_extref *)(base + cur_offset); @@ -1231,7 +1231,7 @@ next: } static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - struct qstr *name, u64 *index, + struct fscrypt_str *name, u64 *index, u64 *parent_objectid) { struct btrfs_inode_extref *extref; @@ -1253,7 +1253,7 @@ static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, } static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr, - struct qstr *name, u64 *index) + struct fscrypt_str *name, u64 *index) { struct btrfs_inode_ref *ref; int ret; @@ -1305,7 +1305,7 @@ again: ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]); ref_end = ref_ptr + btrfs_item_size(eb, path->slots[0]); while (ref_ptr < ref_end) { - struct qstr name; + struct fscrypt_str name; u64 parent_id; if (key->type == BTRFS_INODE_EXTREF_KEY) { @@ -1373,7 +1373,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct inode *inode = NULL; unsigned 
long ref_ptr; unsigned long ref_end; - struct qstr name; + struct fscrypt_str name; int ret; int log_ref_ver = 0; u64 parent_objectid; @@ -1767,7 +1767,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, static noinline int insert_one_name(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 dirid, u64 index, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_key *location) { struct inode *inode; @@ -1845,7 +1845,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di, struct btrfs_key *key) { - struct qstr name; + struct fscrypt_str name; struct btrfs_dir_item *dir_dst_di; struct btrfs_dir_item *index_dst_di; bool dir_dst_matches = false; @@ -2125,7 +2125,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; - struct qstr name; + struct fscrypt_str name; struct inode *inode = NULL; struct btrfs_key location; @@ -3423,7 +3423,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, u64 dir_ino, - const struct qstr *name, + const struct fscrypt_str *name, u64 index) { struct btrfs_dir_item *di; @@ -3470,7 +3470,7 @@ static int del_logged_dentry(struct btrfs_trans_handle *trans, */ void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index) { struct btrfs_path *path; @@ -3509,7 +3509,7 @@ out_unlock: /* see comments for btrfs_del_dir_entries_in_log */ void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_inode *inode, u64 dirid) { struct btrfs_root *log; @@ -5244,7 +5244,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, u32 this_len; unsigned long name_ptr; struct btrfs_dir_item *di; - struct qstr name_str; + struct fscrypt_str name_str; if (key->type == BTRFS_INODE_REF_KEY) { struct btrfs_inode_ref *iref; @@ -7447,7 +7447,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_root *log = old_dir->root->log_root; struct btrfs_path *path; struct fscrypt_name fname; - struct qstr name; ASSERT(old_dir_index >= BTRFS_DIR_START_INDEX); @@ -7455,7 +7454,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, &old_dentry->d_name, 0, &fname); if (ret) goto out; - name = (struct qstr)FSTR_TO_QSTR(&fname.disk_name); /* * We have two inodes to update in the log, the old directory and * the inode that got renamed, so we must pin the log to prevent @@ -7491,7 +7489,7 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, */ mutex_lock(&old_dir->log_mutex); ret = del_logged_dentry(trans, log, path, btrfs_ino(old_dir), - &name, old_dir_index); + &fname.disk_name, old_dir_index); if (ret > 0) { /* * The dentry does not exist in the log, so record its diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 8c9a387151b0..85b43075ac58 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -87,11 +87,11 @@ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx); void btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_inode *dir, u64 index); void btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, struct btrfs_root 
*root, - const struct qstr *name, + const struct fscrypt_str *name, struct btrfs_inode *inode, u64 dirid); void btrfs_end_log_trans(struct btrfs_root *root); void btrfs_pin_log_trans(struct btrfs_root *root); From 94a48aef49f235cc1efc74dc18e7708ca3b8d698 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 20 Oct 2022 12:58:28 -0400 Subject: [PATCH 085/249] btrfs: extend btrfs_dir_item type to store encryption status For directories with encrypted files/filenames, we need to store a flag indicating this fact. There's no room in other fields, so we'll need to borrow a bit from dir_type. Since it's now a combination of type and flags, we rename it to dir_flags to reflect its new usage. The new flag, FT_ENCRYPTED, indicates a directory containing encrypted data, which is orthogonal to file type; therefore, add the new flag, and make conversion from directory type to file type strip the flag. As the file types almost never change we can afford to use the bits. Actual usage will be guarded behind an incompat bit, this patch only adds the support for later use by fscrypt. Signed-off-by: Omar Sandoval Signed-off-by: Sweet Tea Dorminy Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 15 +++++++++++++-- fs/btrfs/delayed-inode.c | 6 +++--- fs/btrfs/delayed-inode.h | 2 +- fs/btrfs/dir-item.c | 7 +++++-- fs/btrfs/inode.c | 13 +++++++------ fs/btrfs/print-tree.c | 4 ++-- fs/btrfs/send.c | 2 +- fs/btrfs/tree-checker.c | 2 +- fs/btrfs/tree-log.c | 20 ++++++++++---------- include/uapi/linux/btrfs_tree.h | 7 +++++++ 10 files changed, 50 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index e8bbd4d80161..cb59b69d2af1 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -484,14 +484,25 @@ BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); /* struct btrfs_dir_item */ BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); -BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_flags, struct btrfs_dir_item, type, 8); BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); -BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_STACK_FUNCS(stack_dir_flags, struct btrfs_dir_item, type, 8); BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item, data_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item, name_len, 16); BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item, transid, 64); +static inline u8 btrfs_dir_ftype(const struct extent_buffer *eb, + const struct btrfs_dir_item *item) +{ + return btrfs_dir_flags_to_ftype(btrfs_dir_flags(eb, item)); +} + +static inline u8 btrfs_stack_dir_ftype(const struct btrfs_dir_item *item) +{ + return btrfs_dir_flags_to_ftype(btrfs_stack_dir_flags(item)); +} + static inline void btrfs_dir_item_key(const struct extent_buffer *eb, const struct btrfs_dir_item *item, struct btrfs_disk_key *key) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 012c96de4701..f93d2695e423 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1416,7 +1416,7 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, - struct btrfs_disk_key *disk_key, u8 type, + struct btrfs_disk_key *disk_key, u8 flags, u64 
index) { struct btrfs_fs_info *fs_info = trans->fs_info; @@ -1447,7 +1447,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, btrfs_set_stack_dir_transid(dir_item, trans->transid); btrfs_set_stack_dir_data_len(dir_item, 0); btrfs_set_stack_dir_name_len(dir_item, name_len); - btrfs_set_stack_dir_type(dir_item, type); + btrfs_set_stack_dir_flags(dir_item, flags); memcpy((char *)(dir_item + 1), name, name_len); data_len = delayed_item->data_len + sizeof(struct btrfs_item); @@ -1757,7 +1757,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, name = (char *)(di + 1); name_len = btrfs_stack_dir_name_len(di); - d_type = fs_ftype_to_dtype(di->type); + d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type)); btrfs_disk_key_to_cpu(&location, &di->location); over = !dir_emit(ctx, name, name_len, diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 0163ca637a96..4f21daa3dbc7 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -113,7 +113,7 @@ static inline void btrfs_init_delayed_root( int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, const char *name, int name_len, struct btrfs_inode *dir, - struct btrfs_disk_key *disk_key, u8 type, + struct btrfs_disk_key *disk_key, u8 flags, u64 index); int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 23777021b331..ca69fb35a2cc 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -83,7 +83,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, leaf = path->nodes[0]; btrfs_cpu_key_to_disk(&disk_key, &location); btrfs_set_dir_item_key(leaf, dir_item, &disk_key); - btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); + btrfs_set_dir_flags(leaf, dir_item, BTRFS_FT_XATTR); btrfs_set_dir_name_len(leaf, dir_item, name_len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); btrfs_set_dir_data_len(leaf, dir_item, data_len); @@ -140,9 +140,12 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, goto out_free; } + if (IS_ENCRYPTED(&dir->vfs_inode)) + type |= BTRFS_FT_ENCRYPTED; + leaf = path->nodes[0]; btrfs_set_dir_item_key(leaf, dir_item, &disk_key); - btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_flags(leaf, dir_item, type); btrfs_set_dir_data_len(leaf, dir_item, 0); btrfs_set_dir_name_len(leaf, dir_item, name->len); btrfs_set_dir_transid(leaf, dir_item, trans->transid); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7e76d5e91786..78867a084428 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5597,7 +5597,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, location->objectid, location->type, location->offset); } if (!ret) - *type = btrfs_dir_type(path->nodes[0], di); + *type = btrfs_dir_ftype(path->nodes[0], di); out: fscrypt_free_filename(&fname); btrfs_free_path(path); @@ -6046,6 +6046,7 @@ again: btrfs_for_each_slot(root, &key, &found_key, path, ret) { struct dir_entry *entry; struct extent_buffer *leaf = path->nodes[0]; + u8 ftype; if (found_key.objectid != key.objectid) break; @@ -6069,13 +6070,13 @@ again: goto again; } + ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di)); entry = addr; - put_unaligned(name_len, &entry->name_len); name_ptr = (char *)(entry + 1); - read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1), - name_len); - put_unaligned(fs_ftype_to_dtype(btrfs_dir_type(leaf, di)), - &entry->type); + read_extent_buffer(leaf, name_ptr, + (unsigned long)(di + 1), name_len); + 
put_unaligned(name_len, &entry->name_len); + put_unaligned(fs_ftype_to_dtype(ftype), &entry->type); btrfs_dir_item_key_to_cpu(leaf, di, &location); put_unaligned(location.objectid, &entry->ino); put_unaligned(found_key.offset, &entry->offset); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index aab7d30eed55..1a2350fd68be 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -242,9 +242,9 @@ void btrfs_print_leaf(struct extent_buffer *l) case BTRFS_DIR_ITEM_KEY: di = btrfs_item_ptr(l, i, struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(l, di, &found_key); - pr_info("\t\tdir oid %llu type %u\n", + pr_info("\t\tdir oid %llu flags %u\n", found_key.objectid, - btrfs_dir_type(l, di)); + btrfs_dir_flags(l, di)); break; case BTRFS_ROOT_ITEM_KEY: ri = btrfs_item_ptr(l, i, struct btrfs_root_item); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 0ebca9dba5ab..2ece3a030a66 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1094,7 +1094,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, data_len = btrfs_dir_data_len(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &di_key); - if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) { + if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) { if (name_len > XATTR_NAME_MAX) { ret = -ENAMETOOLONG; goto out; diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 11cafc520b47..1c2d418dda6a 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -531,7 +531,7 @@ static int check_dir_item(struct extent_buffer *leaf, } /* dir type check */ - dir_type = btrfs_dir_type(leaf, di); + dir_type = btrfs_dir_ftype(leaf, di); if (unlikely(dir_type >= BTRFS_FT_MAX)) { dir_item_err(leaf, slot, "invalid dir item type, have %u expect [0, %u)", diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7002cc3315da..a5e56a678af2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1799,7 +1799,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_dir_item *dst_di, const struct btrfs_key *log_key, - u8 log_type, + u8 log_flags, bool exists) { struct btrfs_key found_key; @@ -1809,7 +1809,7 @@ static int delete_conflicting_dir_entry(struct btrfs_trans_handle *trans, if (found_key.objectid == log_key->objectid && found_key.type == log_key->type && found_key.offset == log_key->offset && - btrfs_dir_type(path->nodes[0], dst_di) == log_type) + btrfs_dir_flags(path->nodes[0], dst_di) == log_flags) return 1; /* @@ -1853,7 +1853,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, struct btrfs_key log_key; struct btrfs_key search_key; struct inode *dir; - u8 log_type; + u8 log_flags; bool exists; int ret; bool update_size = true; @@ -1867,7 +1867,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, if (ret) goto out; - log_type = btrfs_dir_type(eb, di); + log_flags = btrfs_dir_flags(eb, di); btrfs_dir_item_key_to_cpu(eb, di, &log_key); ret = btrfs_lookup_inode(trans, root, path, &log_key, 0); btrfs_release_path(path); @@ -1883,8 +1883,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, goto out; } else if (dir_dst_di) { ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - dir_dst_di, &log_key, log_type, - exists); + dir_dst_di, &log_key, + log_flags, exists); if (ret < 0) goto out; dir_dst_matches = (ret == 1); @@ -1901,7 +1901,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, } else if (index_dst_di) { ret = 
delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, index_dst_di, &log_key, - log_type, exists); + log_flags, exists); if (ret < 0) goto out; index_dst_matches = (ret == 1); @@ -2010,7 +2010,7 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, * to ever delete the parent directory has it would result in stale * dentries that can never be deleted. */ - if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) { + if (ret == 1 && btrfs_dir_ftype(eb, di) != BTRFS_FT_DIR) { struct btrfs_path *fixup_path; struct btrfs_key di_key; @@ -5452,7 +5452,7 @@ again: } di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item); - type = btrfs_dir_type(leaf, di); + type = btrfs_dir_ftype(leaf, di); if (btrfs_dir_transid(leaf, di) < trans->transid) continue; btrfs_dir_item_key_to_cpu(leaf, di, &di_key); @@ -6292,7 +6292,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, continue; } - if (btrfs_stack_dir_type(dir_item) == BTRFS_FT_DIR) + if (btrfs_stack_dir_ftype(dir_item) == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; ctx->log_new_dentries = false; diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 4809272f5063..29895ffa470d 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -376,6 +376,13 @@ enum btrfs_csum_type { #define BTRFS_FT_SYMLINK 7 #define BTRFS_FT_XATTR 8 #define BTRFS_FT_MAX 9 +/* Directory contains encrypted data */ +#define BTRFS_FT_ENCRYPTED 0x80 + +static inline __u8 btrfs_dir_flags_to_ftype(__u8 flags) +{ + return flags & ~BTRFS_FT_ENCRYPTED; +} /* * Inode flags From a56159d4080b793ee8dc3d3d315579801ad9096c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:52 -0400 Subject: [PATCH 086/249] btrfs: move btrfs_fs_info declarations into fs.h Now that we have a lot of the fs_info related helpers and stuff isolated, copy these over to fs.h out of ctree.h. Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ reformat comments ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 659 +--------------------------------------------- fs/btrfs/fs.h | 660 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 661 insertions(+), 658 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 7c710893f4c6..ace6f41612e5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -36,6 +36,7 @@ #include "block-rsv.h" #include "locking.h" #include "misc.h" +#include "fs.h" struct btrfs_trans_handle; struct btrfs_transaction; @@ -53,14 +54,6 @@ struct btrfs_balance_control; struct btrfs_delayed_root; struct reloc_control; -#define BTRFS_OLDEST_GENERATION 0ULL - -#define BTRFS_EMPTY_DIR_SIZE 0 - -#define BTRFS_DIRTY_METADATA_THRESH SZ_32M - -#define BTRFS_MAX_EXTENT_SIZE SZ_128M - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -68,17 +61,6 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) sizeof(struct btrfs_stripe) * (num_stripes - 1); } -#define BTRFS_SUPER_INFO_OFFSET SZ_64K -#define BTRFS_SUPER_INFO_SIZE 4096 -static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); - -/* - * The reserved space at the beginning of each device. - * It covers the primary super block and leaves space for potential use by other - * tools like bootloaders or to lower potential damage of accidental overwrite. 
- */ -#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) - /* Read ahead values for struct btrfs_path.reada */ enum { READA_NONE, @@ -137,645 +119,6 @@ struct btrfs_path { unsigned int nowait:1; }; -struct btrfs_dev_replace { - u64 replace_state; /* see #define above */ - time64_t time_started; /* seconds since 1-Jan-1970 */ - time64_t time_stopped; /* seconds since 1-Jan-1970 */ - atomic64_t num_write_errors; - atomic64_t num_uncorrectable_read_errors; - - u64 cursor_left; - u64 committed_cursor_left; - u64 cursor_left_last_write_of_item; - u64 cursor_right; - - u64 cont_reading_from_srcdev_mode; /* see #define above */ - - int is_valid; - int item_needs_writeback; - struct btrfs_device *srcdev; - struct btrfs_device *tgtdev; - - struct mutex lock_finishing_cancel_unmount; - struct rw_semaphore rwsem; - - struct btrfs_scrub_progress scrub_progress; - - struct percpu_counter bio_counter; - wait_queue_head_t replace_wait; -}; - -/* - * free clusters are used to claim free space in relatively large chunks, - * allowing us to do less seeky writes. They are used for all metadata - * allocations. In ssd_spread mode they are also used for data allocations. - */ -struct btrfs_free_cluster { - spinlock_t lock; - spinlock_t refill_lock; - struct rb_root root; - - /* largest extent in this cluster */ - u64 max_size; - - /* first extent starting offset */ - u64 window_start; - - /* We did a full search and couldn't create a cluster */ - bool fragmented; - - struct btrfs_block_group *block_group; - /* - * when a cluster is allocated from a block group, we put the - * cluster onto a list in the block group so that it can - * be freed before the block group is freed. - */ - struct list_head block_group_list; -}; - -/* Discard control. */ -/* - * Async discard uses multiple lists to differentiate the discard filter - * parameters. Index 0 is for completely free block groups where we need to - * ensure the entire block group is trimmed without being lossy. Indices - * afterwards represent monotonically decreasing discard filter sizes to - * prioritize what should be discarded next. - */ -#define BTRFS_NR_DISCARD_LISTS 3 -#define BTRFS_DISCARD_INDEX_UNUSED 0 -#define BTRFS_DISCARD_INDEX_START 1 - -struct btrfs_discard_ctl { - struct workqueue_struct *discard_workers; - struct delayed_work work; - spinlock_t lock; - struct btrfs_block_group *block_group; - struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; - u64 prev_discard; - u64 prev_discard_time; - atomic_t discardable_extents; - atomic64_t discardable_bytes; - u64 max_discard_size; - u64 delay_ms; - u32 iops_limit; - u32 kbps_limit; - u64 discard_extent_bytes; - u64 discard_bitmap_bytes; - atomic64_t discard_bytes_saved; -}; - -/* - * Exclusive operations (device replace, resize, device add/remove, balance) - */ -enum btrfs_exclusive_operation { - BTRFS_EXCLOP_NONE, - BTRFS_EXCLOP_BALANCE_PAUSED, - BTRFS_EXCLOP_BALANCE, - BTRFS_EXCLOP_DEV_ADD, - BTRFS_EXCLOP_DEV_REMOVE, - BTRFS_EXCLOP_DEV_REPLACE, - BTRFS_EXCLOP_RESIZE, - BTRFS_EXCLOP_SWAP_ACTIVATE, -}; - -/* Store data about transaction commits, exported via sysfs. 
*/ -struct btrfs_commit_stats { - /* Total number of commits */ - u64 commit_count; - /* The maximum commit duration so far in ns */ - u64 max_commit_dur; - /* The last commit duration in ns */ - u64 last_commit_dur; - /* The total commit duration in ns */ - u64 total_commit_dur; -}; - -struct btrfs_fs_info { - u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; - unsigned long flags; - struct btrfs_root *tree_root; - struct btrfs_root *chunk_root; - struct btrfs_root *dev_root; - struct btrfs_root *fs_root; - struct btrfs_root *quota_root; - struct btrfs_root *uuid_root; - struct btrfs_root *data_reloc_root; - struct btrfs_root *block_group_root; - - /* the log root tree is a directory of all the other log roots */ - struct btrfs_root *log_root_tree; - - /* The tree that holds the global roots (csum, extent, etc) */ - rwlock_t global_root_lock; - struct rb_root global_root_tree; - - spinlock_t fs_roots_radix_lock; - struct radix_tree_root fs_roots_radix; - - /* block group cache stuff */ - rwlock_t block_group_cache_lock; - struct rb_root_cached block_group_cache_tree; - - /* keep track of unallocated space */ - atomic64_t free_chunk_space; - - /* Track ranges which are used by log trees blocks/logged data extents */ - struct extent_io_tree excluded_extents; - - /* logical->physical extent mapping */ - struct extent_map_tree mapping_tree; - - /* - * block reservation for extent, checksum, root tree and - * delayed dir index item - */ - struct btrfs_block_rsv global_block_rsv; - /* block reservation for metadata operations */ - struct btrfs_block_rsv trans_block_rsv; - /* block reservation for chunk tree */ - struct btrfs_block_rsv chunk_block_rsv; - /* block reservation for delayed operations */ - struct btrfs_block_rsv delayed_block_rsv; - /* block reservation for delayed refs */ - struct btrfs_block_rsv delayed_refs_rsv; - - struct btrfs_block_rsv empty_block_rsv; - - u64 generation; - u64 last_trans_committed; - /* - * Generation of the last transaction used for block group relocation - * since the filesystem was last mounted (or 0 if none happened yet). - * Must be written and read while holding btrfs_fs_info::commit_root_sem. - */ - u64 last_reloc_trans; - u64 avg_delayed_ref_runtime; - - /* - * this is updated to the current trans every time a full commit - * is required instead of the faster short fsync log commits - */ - u64 last_trans_log_full_commit; - unsigned long mount_opt; - - unsigned long compress_type:4; - unsigned int compress_level; - u32 commit_interval; - /* - * It is a suggestive number, the read side is safe even it gets a - * wrong number because we will write out the data into a regular - * extent. The write side(mount/remount) is under ->s_umount lock, - * so it is also safe. - */ - u64 max_inline; - - struct btrfs_transaction *running_transaction; - wait_queue_head_t transaction_throttle; - wait_queue_head_t transaction_wait; - wait_queue_head_t transaction_blocked_wait; - wait_queue_head_t async_submit_wait; - - /* - * Used to protect the incompat_flags, compat_flags, compat_ro_flags - * when they are updated. - * - * Because we do not clear the flags for ever, so we needn't use - * the lock on the read side. - * - * We also needn't use the lock when we mount the fs, because - * there is no other task which will update the flag. 
- */ - spinlock_t super_lock; - struct btrfs_super_block *super_copy; - struct btrfs_super_block *super_for_commit; - struct super_block *sb; - struct inode *btree_inode; - struct mutex tree_log_mutex; - struct mutex transaction_kthread_mutex; - struct mutex cleaner_mutex; - struct mutex chunk_mutex; - - /* - * this is taken to make sure we don't set block groups ro after - * the free space cache has been allocated on them - */ - struct mutex ro_block_group_mutex; - - /* this is used during read/modify/write to make sure - * no two ios are trying to mod the same stripe at the same - * time - */ - struct btrfs_stripe_hash_table *stripe_hash_table; - - /* - * this protects the ordered operations list only while we are - * processing all of the entries on it. This way we make - * sure the commit code doesn't find the list temporarily empty - * because another function happens to be doing non-waiting preflush - * before jumping into the main commit. - */ - struct mutex ordered_operations_mutex; - - struct rw_semaphore commit_root_sem; - - struct rw_semaphore cleanup_work_sem; - - struct rw_semaphore subvol_sem; - - spinlock_t trans_lock; - /* - * the reloc mutex goes with the trans lock, it is taken - * during commit to protect us from the relocation code - */ - struct mutex reloc_mutex; - - struct list_head trans_list; - struct list_head dead_roots; - struct list_head caching_block_groups; - - spinlock_t delayed_iput_lock; - struct list_head delayed_iputs; - atomic_t nr_delayed_iputs; - wait_queue_head_t delayed_iputs_wait; - - atomic64_t tree_mod_seq; - - /* this protects tree_mod_log and tree_mod_seq_list */ - rwlock_t tree_mod_log_lock; - struct rb_root tree_mod_log; - struct list_head tree_mod_seq_list; - - atomic_t async_delalloc_pages; - - /* - * this is used to protect the following list -- ordered_roots. - */ - spinlock_t ordered_root_lock; - - /* - * all fs/file tree roots in which there are data=ordered extents - * pending writeback are added into this list. - * - * these can span multiple transactions and basically include - * every dirty data page that isn't from nodatacow - */ - struct list_head ordered_roots; - - struct mutex delalloc_root_mutex; - spinlock_t delalloc_root_lock; - /* all fs/file tree roots that have delalloc inodes. */ - struct list_head delalloc_roots; - - /* - * there is a pool of worker threads for checksumming during writes - * and a pool for checksumming after reads. This is because readers - * can run with FS locks held, and the writers may be waiting for - * those locks. We don't want ordering in the pending list to cause - * deadlocks, and so the two are serviced separately. - * - * A third pool does submit_bio to avoid deadlocking with the other - * two - */ - struct btrfs_workqueue *workers; - struct btrfs_workqueue *hipri_workers; - struct btrfs_workqueue *delalloc_workers; - struct btrfs_workqueue *flush_workers; - struct workqueue_struct *endio_workers; - struct workqueue_struct *endio_meta_workers; - struct workqueue_struct *endio_raid56_workers; - struct workqueue_struct *rmw_workers; - struct workqueue_struct *compressed_write_workers; - struct btrfs_workqueue *endio_write_workers; - struct btrfs_workqueue *endio_freespace_worker; - struct btrfs_workqueue *caching_workers; - - /* - * fixup workers take dirty pages that didn't properly go through - * the cow mechanism and make them safe to write. 
It happens - * for the sys_munmap function call path - */ - struct btrfs_workqueue *fixup_workers; - struct btrfs_workqueue *delayed_workers; - - struct task_struct *transaction_kthread; - struct task_struct *cleaner_kthread; - u32 thread_pool_size; - - struct kobject *space_info_kobj; - struct kobject *qgroups_kobj; - struct kobject *discard_kobj; - - /* used to keep from writing metadata until there is a nice batch */ - struct percpu_counter dirty_metadata_bytes; - struct percpu_counter delalloc_bytes; - struct percpu_counter ordered_bytes; - s32 dirty_metadata_batch; - s32 delalloc_batch; - - struct list_head dirty_cowonly_roots; - - struct btrfs_fs_devices *fs_devices; - - /* - * The space_info list is effectively read only after initial - * setup. It is populated at mount time and cleaned up after - * all block groups are removed. RCU is used to protect it. - */ - struct list_head space_info; - - struct btrfs_space_info *data_sinfo; - - struct reloc_control *reloc_ctl; - - /* data_alloc_cluster is only used in ssd_spread mode */ - struct btrfs_free_cluster data_alloc_cluster; - - /* all metadata allocations go through this cluster */ - struct btrfs_free_cluster meta_alloc_cluster; - - /* auto defrag inodes go here */ - spinlock_t defrag_inodes_lock; - struct rb_root defrag_inodes; - atomic_t defrag_running; - - /* Used to protect avail_{data, metadata, system}_alloc_bits */ - seqlock_t profiles_lock; - /* - * these three are in extended format (availability of single - * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other - * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits) - */ - u64 avail_data_alloc_bits; - u64 avail_metadata_alloc_bits; - u64 avail_system_alloc_bits; - - /* restriper state */ - spinlock_t balance_lock; - struct mutex balance_mutex; - atomic_t balance_pause_req; - atomic_t balance_cancel_req; - struct btrfs_balance_control *balance_ctl; - wait_queue_head_t balance_wait_q; - - /* Cancellation requests for chunk relocation */ - atomic_t reloc_cancel_req; - - u32 data_chunk_allocations; - u32 metadata_ratio; - - void *bdev_holder; - - /* private scrub information */ - struct mutex scrub_lock; - atomic_t scrubs_running; - atomic_t scrub_pause_req; - atomic_t scrubs_paused; - atomic_t scrub_cancel_req; - wait_queue_head_t scrub_pause_wait; - /* - * The worker pointers are NULL iff the refcount is 0, ie. scrub is not - * running. - */ - refcount_t scrub_workers_refcnt; - struct workqueue_struct *scrub_workers; - struct workqueue_struct *scrub_wr_completion_workers; - struct workqueue_struct *scrub_parity_workers; - struct btrfs_subpage_info *subpage_info; - - struct btrfs_discard_ctl discard_ctl; - -#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY - u32 check_integrity_print_mask; -#endif - /* is qgroup tracking in a consistent state? */ - u64 qgroup_flags; - - /* holds configuration and tracking. Protected by qgroup_lock */ - struct rb_root qgroup_tree; - spinlock_t qgroup_lock; - - /* - * used to avoid frequently calling ulist_alloc()/ulist_free() - * when doing qgroup accounting, it must be protected by qgroup_lock. - */ - struct ulist *qgroup_ulist; - - /* - * Protect user change for quota operations. If a transaction is needed, - * it must be started before locking this lock. 
- */ - struct mutex qgroup_ioctl_lock; - - /* list of dirty qgroups to be written at next commit */ - struct list_head dirty_qgroups; - - /* used by qgroup for an efficient tree traversal */ - u64 qgroup_seq; - - /* qgroup rescan items */ - struct mutex qgroup_rescan_lock; /* protects the progress item */ - struct btrfs_key qgroup_rescan_progress; - struct btrfs_workqueue *qgroup_rescan_workers; - struct completion qgroup_rescan_completion; - struct btrfs_work qgroup_rescan_work; - bool qgroup_rescan_running; /* protected by qgroup_rescan_lock */ - u8 qgroup_drop_subtree_thres; - - /* filesystem state */ - unsigned long fs_state; - - struct btrfs_delayed_root *delayed_root; - - /* Extent buffer radix tree */ - spinlock_t buffer_lock; - /* Entries are eb->start / sectorsize */ - struct radix_tree_root buffer_radix; - - /* next backup root to be overwritten */ - int backup_root_index; - - /* device replace state */ - struct btrfs_dev_replace dev_replace; - - struct semaphore uuid_tree_rescan_sem; - - /* Used to reclaim the metadata space in the background. */ - struct work_struct async_reclaim_work; - struct work_struct async_data_reclaim_work; - struct work_struct preempt_reclaim_work; - - /* Reclaim partially filled block groups in the background */ - struct work_struct reclaim_bgs_work; - struct list_head reclaim_bgs; - int bg_reclaim_threshold; - - spinlock_t unused_bgs_lock; - struct list_head unused_bgs; - struct mutex unused_bg_unpin_mutex; - /* Protect block groups that are going to be deleted */ - struct mutex reclaim_bgs_lock; - - /* Cached block sizes */ - u32 nodesize; - u32 sectorsize; - /* ilog2 of sectorsize, use to avoid 64bit division */ - u32 sectorsize_bits; - u32 csum_size; - u32 csums_per_leaf; - u32 stripesize; - - /* - * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular - * filesystem, on zoned it depends on the device constraints. - */ - u64 max_extent_size; - - /* Block groups and devices containing active swapfiles. */ - spinlock_t swapfile_pins_lock; - struct rb_root swapfile_pins; - - struct crypto_shash *csum_shash; - - /* Type of exclusive operation running, protected by super_lock */ - enum btrfs_exclusive_operation exclusive_operation; - - /* - * Zone size > 0 when in ZONED mode, otherwise it's used for a check - * if the mode is enabled - */ - u64 zone_size; - - /* Max size to emit ZONE_APPEND write command */ - u64 max_zone_append_size; - struct mutex zoned_meta_io_lock; - spinlock_t treelog_bg_lock; - u64 treelog_bg; - - /* - * Start of the dedicated data relocation block group, protected by - * relocation_bg_lock. - */ - spinlock_t relocation_bg_lock; - u64 data_reloc_bg; - struct mutex zoned_data_reloc_io_lock; - - u64 nr_global_roots; - - spinlock_t zone_active_bgs_lock; - struct list_head zone_active_bgs; - - /* Updates are not protected by any lock */ - struct btrfs_commit_stats commit_stats; - - /* - * Last generation where we dropped a non-relocation root. - * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen() - * to change it and to read it, respectively. - */ - u64 last_root_drop_gen; - - /* - * Annotations for transaction events (structures are empty when - * compiled without lockdep). 
- */ - struct lockdep_map btrfs_trans_num_writers_map; - struct lockdep_map btrfs_trans_num_extwriters_map; - struct lockdep_map btrfs_state_change_map[4]; - struct lockdep_map btrfs_trans_pending_ordered_map; - struct lockdep_map btrfs_ordered_extent_map; - -#ifdef CONFIG_BTRFS_FS_REF_VERIFY - spinlock_t ref_verify_lock; - struct rb_root block_tree; -#endif - -#ifdef CONFIG_BTRFS_DEBUG - struct kobject *debug_kobj; - struct list_head allocated_roots; - - spinlock_t eb_leak_lock; - struct list_head allocated_ebs; -#endif -}; - -static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, - u64 gen) -{ - WRITE_ONCE(fs_info->last_root_drop_gen, gen); -} - -static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info) -{ - return READ_ONCE(fs_info->last_root_drop_gen); -} - -static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) -{ - return sb->s_fs_info; -} - -/* - * Take the number of bytes to be checksummed and figure out how many leaves - * it would require to store the csums for that many bytes. - */ -static inline u64 btrfs_csum_bytes_to_leaves( - const struct btrfs_fs_info *fs_info, u64 csum_bytes) -{ - const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; - - return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); -} - -/* - * Use this if we would be adding new items, as we could split nodes as we cow - * down the tree. - */ -static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) -{ - return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; -} - -/* - * Doing a truncate or a modification won't result in new nodes or leaves, just - * what we need for COW. - */ -static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, - unsigned num_items) -{ - return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; -} - -#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ - sizeof(struct btrfs_item)) - -static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) -{ - return fs_info->zone_size > 0; -} - -/* - * Count how many fs_info->max_extent_size cover the @size - */ -static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) -{ -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (!fs_info) - return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); -#endif - - return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); -} - -bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type); -bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation type); -void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); -void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); -void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, - enum btrfs_exclusive_operation op); - /* * The state of btrfs root */ diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index af356feec0c7..d8c931014c2a 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,24 @@ #ifndef BTRFS_FS_H #define BTRFS_FS_H +#define BTRFS_MAX_EXTENT_SIZE SZ_128M + +#define BTRFS_OLDEST_GENERATION 0ULL + +#define BTRFS_EMPTY_DIR_SIZE 0 + +#define BTRFS_DIRTY_METADATA_THRESH SZ_32M + +#define BTRFS_SUPER_INFO_OFFSET SZ_64K +#define BTRFS_SUPER_INFO_SIZE 4096 +static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); + +/* + * The reserved space at the beginning of each device. 
It covers the primary + * super block and leaves space for potential use by other tools like + * bootloaders or to lower potential damage of accidental overwrite. + */ +#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) /* * Runtime (in-memory) states of filesystem */ @@ -200,6 +218,648 @@ enum { #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) +struct btrfs_dev_replace { + /* See #define above */ + u64 replace_state; + /* Seconds since 1-Jan-1970 */ + time64_t time_started; + /* Seconds since 1-Jan-1970 */ + time64_t time_stopped; + atomic64_t num_write_errors; + atomic64_t num_uncorrectable_read_errors; + + u64 cursor_left; + u64 committed_cursor_left; + u64 cursor_left_last_write_of_item; + u64 cursor_right; + + /* See #define above */ + u64 cont_reading_from_srcdev_mode; + + int is_valid; + int item_needs_writeback; + struct btrfs_device *srcdev; + struct btrfs_device *tgtdev; + + struct mutex lock_finishing_cancel_unmount; + struct rw_semaphore rwsem; + + struct btrfs_scrub_progress scrub_progress; + + struct percpu_counter bio_counter; + wait_queue_head_t replace_wait; +}; + +/* + * Free clusters are used to claim free space in relatively large chunks, + * allowing us to do less seeky writes. They are used for all metadata + * allocations. In ssd_spread mode they are also used for data allocations. + */ +struct btrfs_free_cluster { + spinlock_t lock; + spinlock_t refill_lock; + struct rb_root root; + + /* Largest extent in this cluster */ + u64 max_size; + + /* First extent starting offset */ + u64 window_start; + + /* We did a full search and couldn't create a cluster */ + bool fragmented; + + struct btrfs_block_group *block_group; + /* + * When a cluster is allocated from a block group, we put the cluster + * onto a list in the block group so that it can be freed before the + * block group is freed. + */ + struct list_head block_group_list; +}; + +/* Discard control. */ +/* + * Async discard uses multiple lists to differentiate the discard filter + * parameters. Index 0 is for completely free block groups where we need to + * ensure the entire block group is trimmed without being lossy. Indices + * afterwards represent monotonically decreasing discard filter sizes to + * prioritize what should be discarded next. + */ +#define BTRFS_NR_DISCARD_LISTS 3 +#define BTRFS_DISCARD_INDEX_UNUSED 0 +#define BTRFS_DISCARD_INDEX_START 1 + +struct btrfs_discard_ctl { + struct workqueue_struct *discard_workers; + struct delayed_work work; + spinlock_t lock; + struct btrfs_block_group *block_group; + struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; + u64 prev_discard; + u64 prev_discard_time; + atomic_t discardable_extents; + atomic64_t discardable_bytes; + u64 max_discard_size; + u64 delay_ms; + u32 iops_limit; + u32 kbps_limit; + u64 discard_extent_bytes; + u64 discard_bitmap_bytes; + atomic64_t discard_bytes_saved; +}; + +/* + * Exclusive operations (device replace, resize, device add/remove, balance) + */ +enum btrfs_exclusive_operation { + BTRFS_EXCLOP_NONE, + BTRFS_EXCLOP_BALANCE_PAUSED, + BTRFS_EXCLOP_BALANCE, + BTRFS_EXCLOP_DEV_ADD, + BTRFS_EXCLOP_DEV_REMOVE, + BTRFS_EXCLOP_DEV_REPLACE, + BTRFS_EXCLOP_RESIZE, + BTRFS_EXCLOP_SWAP_ACTIVATE, +}; + +/* Store data about transaction commits, exported via sysfs. 
*/ +struct btrfs_commit_stats { + /* Total number of commits */ + u64 commit_count; + /* The maximum commit duration so far in ns */ + u64 max_commit_dur; + /* The last commit duration in ns */ + u64 last_commit_dur; + /* The total commit duration in ns */ + u64 total_commit_dur; +}; + +struct btrfs_fs_info { + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + unsigned long flags; + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; + struct btrfs_root *fs_root; + struct btrfs_root *quota_root; + struct btrfs_root *uuid_root; + struct btrfs_root *data_reloc_root; + struct btrfs_root *block_group_root; + + /* The log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; + + /* The tree that holds the global roots (csum, extent, etc) */ + rwlock_t global_root_lock; + struct rb_root global_root_tree; + + spinlock_t fs_roots_radix_lock; + struct radix_tree_root fs_roots_radix; + + /* Block group cache stuff */ + rwlock_t block_group_cache_lock; + struct rb_root_cached block_group_cache_tree; + + /* Keep track of unallocated space */ + atomic64_t free_chunk_space; + + /* Track ranges which are used by log trees blocks/logged data extents */ + struct extent_io_tree excluded_extents; + + /* logical->physical extent mapping */ + struct extent_map_tree mapping_tree; + + /* + * Block reservation for extent, checksum, root tree and delayed dir + * index item. + */ + struct btrfs_block_rsv global_block_rsv; + /* Block reservation for metadata operations */ + struct btrfs_block_rsv trans_block_rsv; + /* Block reservation for chunk tree */ + struct btrfs_block_rsv chunk_block_rsv; + /* Block reservation for delayed operations */ + struct btrfs_block_rsv delayed_block_rsv; + /* Block reservation for delayed refs */ + struct btrfs_block_rsv delayed_refs_rsv; + + struct btrfs_block_rsv empty_block_rsv; + + u64 generation; + u64 last_trans_committed; + /* + * Generation of the last transaction used for block group relocation + * since the filesystem was last mounted (or 0 if none happened yet). + * Must be written and read while holding btrfs_fs_info::commit_root_sem. + */ + u64 last_reloc_trans; + u64 avg_delayed_ref_runtime; + + /* + * This is updated to the current trans every time a full commit is + * required instead of the faster short fsync log commits + */ + u64 last_trans_log_full_commit; + unsigned long mount_opt; + + unsigned long compress_type:4; + unsigned int compress_level; + u32 commit_interval; + /* + * It is a suggestive number, the read side is safe even it gets a + * wrong number because we will write out the data into a regular + * extent. The write side(mount/remount) is under ->s_umount lock, + * so it is also safe. + */ + u64 max_inline; + + struct btrfs_transaction *running_transaction; + wait_queue_head_t transaction_throttle; + wait_queue_head_t transaction_wait; + wait_queue_head_t transaction_blocked_wait; + wait_queue_head_t async_submit_wait; + + /* + * Used to protect the incompat_flags, compat_flags, compat_ro_flags + * when they are updated. + * + * Because we do not clear the flags for ever, so we needn't use + * the lock on the read side. + * + * We also needn't use the lock when we mount the fs, because + * there is no other task which will update the flag. 
+ */ + spinlock_t super_lock; + struct btrfs_super_block *super_copy; + struct btrfs_super_block *super_for_commit; + struct super_block *sb; + struct inode *btree_inode; + struct mutex tree_log_mutex; + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex chunk_mutex; + + /* + * This is taken to make sure we don't set block groups ro after the + * free space cache has been allocated on them. + */ + struct mutex ro_block_group_mutex; + + /* + * This is used during read/modify/write to make sure no two ios are + * trying to mod the same stripe at the same time. + */ + struct btrfs_stripe_hash_table *stripe_hash_table; + + /* + * This protects the ordered operations list only while we are + * processing all of the entries on it. This way we make sure the + * commit code doesn't find the list temporarily empty because another + * function happens to be doing non-waiting preflush before jumping + * into the main commit. + */ + struct mutex ordered_operations_mutex; + + struct rw_semaphore commit_root_sem; + + struct rw_semaphore cleanup_work_sem; + + struct rw_semaphore subvol_sem; + + spinlock_t trans_lock; + /* + * The reloc mutex goes with the trans lock, it is taken during commit + * to protect us from the relocation code. + */ + struct mutex reloc_mutex; + + struct list_head trans_list; + struct list_head dead_roots; + struct list_head caching_block_groups; + + spinlock_t delayed_iput_lock; + struct list_head delayed_iputs; + atomic_t nr_delayed_iputs; + wait_queue_head_t delayed_iputs_wait; + + atomic64_t tree_mod_seq; + + /* This protects tree_mod_log and tree_mod_seq_list */ + rwlock_t tree_mod_log_lock; + struct rb_root tree_mod_log; + struct list_head tree_mod_seq_list; + + atomic_t async_delalloc_pages; + + /* This is used to protect the following list -- ordered_roots. */ + spinlock_t ordered_root_lock; + + /* + * All fs/file tree roots in which there are data=ordered extents + * pending writeback are added into this list. + * + * These can span multiple transactions and basically include every + * dirty data page that isn't from nodatacow. + */ + struct list_head ordered_roots; + + struct mutex delalloc_root_mutex; + spinlock_t delalloc_root_lock; + /* All fs/file tree roots that have delalloc inodes. */ + struct list_head delalloc_roots; + + /* + * There is a pool of worker threads for checksumming during writes and + * a pool for checksumming after reads. This is because readers can + * run with FS locks held, and the writers may be waiting for those + * locks. We don't want ordering in the pending list to cause + * deadlocks, and so the two are serviced separately. + * + * A third pool does submit_bio to avoid deadlocking with the other two. + */ + struct btrfs_workqueue *workers; + struct btrfs_workqueue *hipri_workers; + struct btrfs_workqueue *delalloc_workers; + struct btrfs_workqueue *flush_workers; + struct workqueue_struct *endio_workers; + struct workqueue_struct *endio_meta_workers; + struct workqueue_struct *endio_raid56_workers; + struct workqueue_struct *rmw_workers; + struct workqueue_struct *compressed_write_workers; + struct btrfs_workqueue *endio_write_workers; + struct btrfs_workqueue *endio_freespace_worker; + struct btrfs_workqueue *caching_workers; + + /* + * Fixup workers take dirty pages that didn't properly go through the + * cow mechanism and make them safe to write. It happens for the + * sys_munmap function call path. 
+ */ + struct btrfs_workqueue *fixup_workers; + struct btrfs_workqueue *delayed_workers; + + struct task_struct *transaction_kthread; + struct task_struct *cleaner_kthread; + u32 thread_pool_size; + + struct kobject *space_info_kobj; + struct kobject *qgroups_kobj; + struct kobject *discard_kobj; + + /* Used to keep from writing metadata until there is a nice batch */ + struct percpu_counter dirty_metadata_bytes; + struct percpu_counter delalloc_bytes; + struct percpu_counter ordered_bytes; + s32 dirty_metadata_batch; + s32 delalloc_batch; + + struct list_head dirty_cowonly_roots; + + struct btrfs_fs_devices *fs_devices; + + /* + * The space_info list is effectively read only after initial setup. + * It is populated at mount time and cleaned up after all block groups + * are removed. RCU is used to protect it. + */ + struct list_head space_info; + + struct btrfs_space_info *data_sinfo; + + struct reloc_control *reloc_ctl; + + /* data_alloc_cluster is only used in ssd_spread mode */ + struct btrfs_free_cluster data_alloc_cluster; + + /* All metadata allocations go through this cluster. */ + struct btrfs_free_cluster meta_alloc_cluster; + + /* Auto defrag inodes go here. */ + spinlock_t defrag_inodes_lock; + struct rb_root defrag_inodes; + atomic_t defrag_running; + + /* Used to protect avail_{data, metadata, system}_alloc_bits */ + seqlock_t profiles_lock; + /* + * These three are in extended format (availability of single chunks is + * denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other types are denoted + * by corresponding BTRFS_BLOCK_GROUP_* bits) + */ + u64 avail_data_alloc_bits; + u64 avail_metadata_alloc_bits; + u64 avail_system_alloc_bits; + + /* Balance state */ + spinlock_t balance_lock; + struct mutex balance_mutex; + atomic_t balance_pause_req; + atomic_t balance_cancel_req; + struct btrfs_balance_control *balance_ctl; + wait_queue_head_t balance_wait_q; + + /* Cancellation requests for chunk relocation */ + atomic_t reloc_cancel_req; + + u32 data_chunk_allocations; + u32 metadata_ratio; + + void *bdev_holder; + + /* Private scrub information */ + struct mutex scrub_lock; + atomic_t scrubs_running; + atomic_t scrub_pause_req; + atomic_t scrubs_paused; + atomic_t scrub_cancel_req; + wait_queue_head_t scrub_pause_wait; + /* + * The worker pointers are NULL iff the refcount is 0, ie. scrub is not + * running. + */ + refcount_t scrub_workers_refcnt; + struct workqueue_struct *scrub_workers; + struct workqueue_struct *scrub_wr_completion_workers; + struct workqueue_struct *scrub_parity_workers; + struct btrfs_subpage_info *subpage_info; + + struct btrfs_discard_ctl discard_ctl; + +#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY + u32 check_integrity_print_mask; +#endif + /* Is qgroup tracking in a consistent state? */ + u64 qgroup_flags; + + /* Holds configuration and tracking. Protected by qgroup_lock. */ + struct rb_root qgroup_tree; + spinlock_t qgroup_lock; + + /* + * Used to avoid frequently calling ulist_alloc()/ulist_free() + * when doing qgroup accounting, it must be protected by qgroup_lock. + */ + struct ulist *qgroup_ulist; + + /* + * Protect user change for quota operations. If a transaction is needed, + * it must be started before locking this lock. + */ + struct mutex qgroup_ioctl_lock; + + /* List of dirty qgroups to be written at next commit. */ + struct list_head dirty_qgroups; + + /* Used by qgroup for an efficient tree traversal. */ + u64 qgroup_seq; + + /* Qgroup rescan items. 
*/ + /* Protects the progress item */ + struct mutex qgroup_rescan_lock; + struct btrfs_key qgroup_rescan_progress; + struct btrfs_workqueue *qgroup_rescan_workers; + struct completion qgroup_rescan_completion; + struct btrfs_work qgroup_rescan_work; + /* Protected by qgroup_rescan_lock */ + bool qgroup_rescan_running; + u8 qgroup_drop_subtree_thres; + + /* Filesystem state */ + unsigned long fs_state; + + struct btrfs_delayed_root *delayed_root; + + /* Extent buffer radix tree */ + spinlock_t buffer_lock; + /* Entries are eb->start / sectorsize */ + struct radix_tree_root buffer_radix; + + /* Next backup root to be overwritten */ + int backup_root_index; + + /* Device replace state */ + struct btrfs_dev_replace dev_replace; + + struct semaphore uuid_tree_rescan_sem; + + /* Used to reclaim the metadata space in the background. */ + struct work_struct async_reclaim_work; + struct work_struct async_data_reclaim_work; + struct work_struct preempt_reclaim_work; + + /* Reclaim partially filled block groups in the background */ + struct work_struct reclaim_bgs_work; + struct list_head reclaim_bgs; + int bg_reclaim_threshold; + + spinlock_t unused_bgs_lock; + struct list_head unused_bgs; + struct mutex unused_bg_unpin_mutex; + /* Protect block groups that are going to be deleted */ + struct mutex reclaim_bgs_lock; + + /* Cached block sizes */ + u32 nodesize; + u32 sectorsize; + /* ilog2 of sectorsize, use to avoid 64bit division */ + u32 sectorsize_bits; + u32 csum_size; + u32 csums_per_leaf; + u32 stripesize; + + /* + * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular + * filesystem, on zoned it depends on the device constraints. + */ + u64 max_extent_size; + + /* Block groups and devices containing active swapfiles. */ + spinlock_t swapfile_pins_lock; + struct rb_root swapfile_pins; + + struct crypto_shash *csum_shash; + + /* Type of exclusive operation running, protected by super_lock */ + enum btrfs_exclusive_operation exclusive_operation; + + /* + * Zone size > 0 when in ZONED mode, otherwise it's used for a check + * if the mode is enabled + */ + u64 zone_size; + + /* Max size to emit ZONE_APPEND write command */ + u64 max_zone_append_size; + struct mutex zoned_meta_io_lock; + spinlock_t treelog_bg_lock; + u64 treelog_bg; + + /* + * Start of the dedicated data relocation block group, protected by + * relocation_bg_lock. + */ + spinlock_t relocation_bg_lock; + u64 data_reloc_bg; + struct mutex zoned_data_reloc_io_lock; + + u64 nr_global_roots; + + spinlock_t zone_active_bgs_lock; + struct list_head zone_active_bgs; + + /* Updates are not protected by any lock */ + struct btrfs_commit_stats commit_stats; + + /* + * Last generation where we dropped a non-relocation root. + * Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen() + * to change it and to read it, respectively. + */ + u64 last_root_drop_gen; + + /* + * Annotations for transaction events (structures are empty when + * compiled without lockdep). 
+ */ + struct lockdep_map btrfs_trans_num_writers_map; + struct lockdep_map btrfs_trans_num_extwriters_map; + struct lockdep_map btrfs_state_change_map[4]; + struct lockdep_map btrfs_trans_pending_ordered_map; + struct lockdep_map btrfs_ordered_extent_map; + +#ifdef CONFIG_BTRFS_FS_REF_VERIFY + spinlock_t ref_verify_lock; + struct rb_root block_tree; +#endif + +#ifdef CONFIG_BTRFS_DEBUG + struct kobject *debug_kobj; + struct list_head allocated_roots; + + spinlock_t eb_leak_lock; + struct list_head allocated_ebs; +#endif +}; + +static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info, + u64 gen) +{ + WRITE_ONCE(fs_info->last_root_drop_gen, gen); +} + +static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info) +{ + return READ_ONCE(fs_info->last_root_drop_gen); +} + +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +/* + * Take the number of bytes to be checksummed and figure out how many leaves + * it would require to store the csums for that many bytes. + */ +static inline u64 btrfs_csum_bytes_to_leaves( + const struct btrfs_fs_info *fs_info, u64 csum_bytes) +{ + const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits; + + return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf); +} + +/* + * Use this if we would be adding new items, as we could split nodes as we cow + * down the tree. + */ +static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) +{ + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items; +} + +/* + * Doing a truncate or a modification won't result in new nodes or leaves, just + * what we need for COW. + */ +static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info, + unsigned num_items) +{ + return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items; +} + +#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ + sizeof(struct btrfs_item)) + +static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) +{ + return fs_info->zone_size > 0; +} + +/* + * Count how many fs_info->max_extent_size cover the @size + */ +static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) +{ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (!fs_info) + return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); +#endif + + return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); +} + +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info); +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); +void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation op); + /* Compatibility and incompatibility defines */ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, const char *name); From eb33a4d65b2adae52530d6c3c7a9f58bf734a750 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:53 -0400 Subject: [PATCH 087/249] btrfs: move the lockdep helpers into locking.h These more naturally fit in with the locking related code, and they're all defines so they can easily go anywhere, move them out of ctree.h into locking.h Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 76 ---------------------------------------------- 
fs/btrfs/locking.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 76 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ace6f41612e5..970a22a04c94 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -181,82 +181,6 @@ enum { BTRFS_ROOT_RESET_LOCKDEP_CLASS, }; -enum btrfs_lockdep_trans_states { - BTRFS_LOCKDEP_TRANS_COMMIT_START, - BTRFS_LOCKDEP_TRANS_UNBLOCKED, - BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, - BTRFS_LOCKDEP_TRANS_COMPLETED, -}; - -/* - * Lockdep annotation for wait events. - * - * @owner: The struct where the lockdep map is defined - * @lock: The lockdep map corresponding to a wait event - * - * This macro is used to annotate a wait event. In this case a thread acquires - * the lockdep map as writer (exclusive lock) because it has to block until all - * the threads that hold the lock as readers signal the condition for the wait - * event and release their locks. - */ -#define btrfs_might_wait_for_event(owner, lock) \ - do { \ - rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \ - rwsem_release(&owner->lock##_map, _THIS_IP_); \ - } while (0) - -/* - * Protection for the resource/condition of a wait event. - * - * @owner: The struct where the lockdep map is defined - * @lock: The lockdep map corresponding to a wait event - * - * Many threads can modify the condition for the wait event at the same time - * and signal the threads that block on the wait event. The threads that modify - * the condition and do the signaling acquire the lock as readers (shared - * lock). - */ -#define btrfs_lockdep_acquire(owner, lock) \ - rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_) - -/* - * Used after signaling the condition for a wait event to release the lockdep - * map held by a reader thread. - */ -#define btrfs_lockdep_release(owner, lock) \ - rwsem_release(&owner->lock##_map, _THIS_IP_) - -/* - * Macros for the transaction states wait events, similar to the generic wait - * event macros. - */ -#define btrfs_might_wait_for_state(owner, i) \ - do { \ - rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \ - rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \ - } while (0) - -#define btrfs_trans_state_lockdep_acquire(owner, i) \ - rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_) - -#define btrfs_trans_state_lockdep_release(owner, i) \ - rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_) - -/* Initialization of the lockdep map */ -#define btrfs_lockdep_init_map(owner, lock) \ - do { \ - static struct lock_class_key lock##_key; \ - lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \ - } while (0) - -/* Initialization of the transaction states lockdep maps. */ -#define btrfs_state_lockdep_init_map(owner, lock, state) \ - do { \ - static struct lock_class_key lock##_key; \ - lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \ - &lock##_key, 0); \ - } while (0) - /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 490c7a79e995..11c2269b4b6f 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -78,6 +78,82 @@ enum btrfs_lock_nesting { BTRFS_NESTING_MAX, }; +enum btrfs_lockdep_trans_states { + BTRFS_LOCKDEP_TRANS_COMMIT_START, + BTRFS_LOCKDEP_TRANS_UNBLOCKED, + BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED, + BTRFS_LOCKDEP_TRANS_COMPLETED, +}; + +/* + * Lockdep annotation for wait events. 
+ * + * @owner: The struct where the lockdep map is defined + * @lock: The lockdep map corresponding to a wait event + * + * This macro is used to annotate a wait event. In this case a thread acquires + * the lockdep map as writer (exclusive lock) because it has to block until all + * the threads that hold the lock as readers signal the condition for the wait + * event and release their locks. + */ +#define btrfs_might_wait_for_event(owner, lock) \ + do { \ + rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \ + rwsem_release(&owner->lock##_map, _THIS_IP_); \ + } while (0) + +/* + * Protection for the resource/condition of a wait event. + * + * @owner: The struct where the lockdep map is defined + * @lock: The lockdep map corresponding to a wait event + * + * Many threads can modify the condition for the wait event at the same time + * and signal the threads that block on the wait event. The threads that modify + * the condition and do the signaling acquire the lock as readers (shared + * lock). + */ +#define btrfs_lockdep_acquire(owner, lock) \ + rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_) + +/* + * Used after signaling the condition for a wait event to release the lockdep + * map held by a reader thread. + */ +#define btrfs_lockdep_release(owner, lock) \ + rwsem_release(&owner->lock##_map, _THIS_IP_) + +/* + * Macros for the transaction states wait events, similar to the generic wait + * event macros. + */ +#define btrfs_might_wait_for_state(owner, i) \ + do { \ + rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \ + rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \ + } while (0) + +#define btrfs_trans_state_lockdep_acquire(owner, i) \ + rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_) + +#define btrfs_trans_state_lockdep_release(owner, i) \ + rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_) + +/* Initialization of the lockdep map */ +#define btrfs_lockdep_init_map(owner, lock) \ + do { \ + static struct lock_class_key lock##_key; \ + lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \ + } while (0) + +/* Initialization of the transaction states lockdep maps. */ +#define btrfs_state_lockdep_init_map(owner, lock, state) \ + do { \ + static struct lock_class_key lock##_key; \ + lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \ + &lock##_key, 0); \ + } while (0) + static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, "too many lock subclasses defined"); From 13d925c1c26916c017b5339625c66bd20df9fd96 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:54 -0400 Subject: [PATCH 088/249] btrfs: minor whitespace in ctree.h We've accumulated some whitespace problems in ctree.h, clean these up. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 970a22a04c94..bf94531f4486 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -435,10 +435,8 @@ struct btrfs_file_private { void *filldir_buf; }; - static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { - return info->nodesize - sizeof(struct btrfs_header); } From 8483d40242c56b0b225238efed1f73ef6985c5af Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:55 -0400 Subject: [PATCH 089/249] btrfs: remove extra space info prototypes in ctree.h These are defined already in space-info.h, remove them from ctree.h. 
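
For illustration, a minimal sketch of such a call site (hypothetical function name, assuming an fs_info in scope; both prototypes resolve from space-info.h alone, which is why the ctree.h copies are safe to drop):

/* Hypothetical user: the declarations come from space-info.h only. */
#include "space-info.h"

static int example_rescan_space(struct btrfs_fs_info *fs_info)
{
	int ret;

	/* Create the initial data/metadata/system space_info entries. */
	ret = btrfs_init_space_info(fs_info);
	if (ret)
		return ret;

	/*
	 * Clear the per-space_info 'full' flags so the allocator retries
	 * chunk allocation, e.g. after a device add or resize.
	 */
	btrfs_clear_space_info_full(fs_info);
	return 0;
}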
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index bf94531f4486..a430577cf8b7 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -567,8 +567,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); -void btrfs_clear_space_info_full(struct btrfs_fs_info *info); - int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); @@ -585,7 +583,6 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, u64 *actual_bytes); int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range); -int btrfs_init_space_info(struct btrfs_fs_info *fs_info); int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_start_write_no_snapshotting(struct btrfs_root *root); From e2f13b343c147bc34691301c30d8a8b35d0a2e5c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:56 -0400 Subject: [PATCH 090/249] btrfs: move btrfs_account_ro_block_groups_free_space into space-info.c This was prototyped in ctree.h and the code existed in extent-tree.c, but it's space-info related so move it into space-info.c. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 - fs/btrfs/extent-tree.c | 34 ---------------------------------- fs/btrfs/space-info.c | 34 ++++++++++++++++++++++++++++++++++ fs/btrfs/space-info.h | 1 + 4 files changed, 35 insertions(+), 35 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a430577cf8b7..07f876961da1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -576,7 +576,6 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, u64 disk_num_bytes, bool noflush); -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 8d28ba360e9a..b97c99a8dd4c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5975,40 +5975,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return ret; } -/* - * helper to account the unused space of all the readonly block group in the - * space_info. takes mirrors into account. 
- */ -u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) -{ - struct btrfs_block_group *block_group; - u64 free_bytes = 0; - int factor; - - /* It's df, we don't care if it's racy */ - if (list_empty(&sinfo->ro_bgs)) - return 0; - - spin_lock(&sinfo->lock); - list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { - spin_lock(&block_group->lock); - - if (!block_group->ro) { - spin_unlock(&block_group->lock); - continue; - } - - factor = btrfs_bg_type_to_factor(block_group->flags); - free_bytes += (block_group->length - - block_group->used) * factor; - - spin_unlock(&block_group->lock); - } - spin_unlock(&sinfo->lock); - - return free_bytes; -} - int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end) { diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 94d73485454a..45404798ee8c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1814,3 +1814,37 @@ __cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info) } dump_global_block_rsv(fs_info); } + +/* + * Account the unused space of all the readonly block group in the space_info. + * takes mirrors into account. + */ +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo) +{ + struct btrfs_block_group *block_group; + u64 free_bytes = 0; + int factor; + + /* It's df, we don't care if it's racy */ + if (list_empty(&sinfo->ro_bgs)) + return 0; + + spin_lock(&sinfo->lock); + list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) { + spin_lock(&block_group->lock); + + if (!block_group->ro) { + spin_unlock(&block_group->lock); + continue; + } + + factor = btrfs_bg_type_to_factor(block_group->flags); + free_bytes += (block_group->length - + block_group->used) * factor; + + spin_unlock(&block_group->lock); + } + spin_unlock(&sinfo->lock); + + return free_bytes; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index f28bd2c051d4..fc99ea2b0c34 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -236,5 +236,6 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush); void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info); void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); +u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); #endif /* BTRFS_SPACE_INFO_H */ From a0231804affe78d27264811559ee31bd341c2bff Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:57 -0400 Subject: [PATCH 091/249] btrfs: move extent-tree helpers into their own header file Move all the extent tree related prototypes to extent-tree.h out of ctree.h, and then go include it everywhere needed so everything compiles. 
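
For illustration, a minimal sketch of the resulting include pattern (hypothetical file and function, not part of the series; btrfs_run_delayed_refs() is one of the moved prototypes):

/*
 * Hypothetical user after the split: the extent tree helpers no longer
 * arrive via ctree.h, so the new header is included explicitly.
 */
#include "ctree.h"
#include "extent-tree.h"

static int example_flush_delayed_refs(struct btrfs_trans_handle *trans)
{
	/* (unsigned long)-1 asks it to process all pending delayed refs. */
	return btrfs_run_delayed_refs(trans, (unsigned long)-1);
}

Keeping the prototypes next to their implementation shrinks ctree.h and makes the dependency explicit at each include site.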
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 1 + fs/btrfs/block-group.c | 1 + fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 72 -------------------------------------- fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 1 + fs/btrfs/extent-tree.h | 72 ++++++++++++++++++++++++++++++++++++++ fs/btrfs/file.c | 1 + fs/btrfs/free-space-tree.c | 1 + fs/btrfs/inode-item.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/space-info.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/tree-log.c | 1 + 17 files changed, 87 insertions(+), 72 deletions(-) create mode 100644 fs/btrfs/extent-tree.h diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f556566b9c79..173df40da984 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -17,6 +17,7 @@ #include "tree-mod-log.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" /* Just arbitrary numbers so we can be sure one of these happened. */ #define BACKREF_FOUND_SHARED 6 diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index d1f8d792b184..5a743c4d4e97 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -19,6 +19,7 @@ #include "zoned.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" #ifdef CONFIG_BTRFS_DEBUG int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 1283ca46c5bc..f0ddb44704ee 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -20,6 +20,7 @@ #include "tree-checker.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" static struct kmem_cache *btrfs_path_cachep; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 07f876961da1..8ca67227dec0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -495,78 +495,6 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) return mapping_gfp_constraint(mapping, ~__GFP_FS); } -/* extent-tree.c */ - -enum btrfs_inline_ref_type { - BTRFS_REF_TYPE_INVALID, - BTRFS_REF_TYPE_BLOCK, - BTRFS_REF_TYPE_DATA, - BTRFS_REF_TYPE_ANY, -}; - -int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, - struct btrfs_extent_inline_ref *iref, - enum btrfs_inline_ref_type is_data); -u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); - - -int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 num_bytes); -void btrfs_free_excluded_extents(struct btrfs_block_group *cache); -int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, - unsigned long count); -void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head); -int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); -int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags); -int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, - int reserved); -int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, - u64 bytenr, u64 num_bytes); -int btrfs_exclude_logged_extents(struct extent_buffer *eb); -int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr, bool strict, - struct btrfs_path *path); -struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 parent, u64 root_objectid, - const 
struct btrfs_disk_key *key, - int level, u64 hint, - u64 empty_size, - enum btrfs_lock_nesting nest); -void btrfs_free_tree_block(struct btrfs_trans_handle *trans, - u64 root_id, - struct extent_buffer *buf, - u64 parent, int last_ref); -int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 owner, - u64 offset, u64 ram_bytes, - struct btrfs_key *ins); -int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, - u64 root_objectid, u64 owner, u64 offset, - struct btrfs_key *ins); -int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, - u64 min_alloc_size, u64 empty_size, u64 hint_byte, - struct btrfs_key *ins, int is_data, int delalloc); -int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); -int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct extent_buffer *buf, int full_backref); -int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, int level); -int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); - -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc); -int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, - u64 len); -int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); -int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, - struct btrfs_ref *generic_ref); - int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 75475a213972..005da95c9c17 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -45,6 +45,7 @@ #include "subpage.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b97c99a8dd4c..d2d5de946242 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -38,6 +38,7 @@ #include "dev-replace.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h new file mode 100644 index 000000000000..b3674b008d58 --- /dev/null +++ b/fs/btrfs/extent-tree.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_EXTENT_TREE_H +#define BTRFS_EXTENT_TREE_H + +enum btrfs_inline_ref_type { + BTRFS_REF_TYPE_INVALID, + BTRFS_REF_TYPE_BLOCK, + BTRFS_REF_TYPE_DATA, + BTRFS_REF_TYPE_ANY, +}; + +int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, + struct btrfs_extent_inline_ref *iref, + enum btrfs_inline_ref_type is_data); +u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); + +int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 num_bytes); +void btrfs_free_excluded_extents(struct btrfs_block_group *cache); +int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count); +void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_delayed_ref_head *head); +int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); +int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 offset, int metadata, u64 *refs, u64 
*flags); +int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, + int reserved); +int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes); +int btrfs_exclude_logged_extents(struct extent_buffer *eb); +int btrfs_cross_ref_exist(struct btrfs_root *root, + u64 objectid, u64 offset, u64 bytenr, bool strict, + struct btrfs_path *path); +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 parent, u64 root_objectid, + const struct btrfs_disk_key *key, + int level, u64 hint, + u64 empty_size, + enum btrfs_lock_nesting nest); +void btrfs_free_tree_block(struct btrfs_trans_handle *trans, + u64 root_id, + struct extent_buffer *buf, + u64 parent, int last_ref); +int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 owner, + u64 offset, u64 ram_bytes, + struct btrfs_key *ins); +int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, + u64 root_objectid, u64 owner, u64 offset, + struct btrfs_key *ins); +int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, + u64 min_alloc_size, u64 empty_size, u64 hint_byte, + struct btrfs_key *ins, int is_data, int delalloc); +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, int full_backref); +int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, + struct extent_buffer *eb, u64 flags, int level); +int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); + +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 len, int delalloc); +int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); + +#endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 62360a554420..bad676d2e23f 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -32,6 +32,7 @@ #include "subpage.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" static struct kmem_cache *btrfs_inode_defrag_cachep; /* diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index fc79d21e7b4f..a652e5dc465b 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -14,6 +14,7 @@ #include "block-group.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index d66bdbae5585..724507ce7a7d 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -12,6 +12,7 @@ #include "print-tree.h" #include "space-info.h" #include "accessors.h" +#include "extent-tree.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 78867a084428..cb2a35b19c75 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -57,6 +57,7 @@ #include "inode-item.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2132e3e15986..1d4bdb22881c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -52,6 +52,7 
@@ #include "subpage.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 4786b59035a8..88b248b22810 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -26,6 +26,7 @@ #include "tree-mod-log.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" /* * Helpers to access qgroup reservation diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 77d03dce2521..946190f13138 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -30,6 +30,7 @@ #include "space-info.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" /* * Relocation overview diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 45404798ee8c..e5f9f43ffa4c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -12,6 +12,7 @@ #include "zoned.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" /* * HOW DOES SPACE RESERVATION WORK diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0d44d50dc3be..a04c2a9708e2 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -26,6 +26,7 @@ #include "zoned.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a5e56a678af2..bd9a5eda9def 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -23,6 +23,7 @@ #include "inode-item.h" #include "fs.h" #include "accessors.h" +#include "extent-tree.h" #define MAX_CONFLICT_INODES 10 From 2839c2c142dd4baae146b7d3e42ead8fe5d36014 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 14:46:58 -0400 Subject: [PATCH 092/249] btrfs: move delalloc space related prototypes to delalloc-space.h These exist in delalloc-space.c, move them from ctree.h into delalloc-space.h. 
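As a rough usage sketch of the pair being moved (hypothetical caller,
error handling trimmed; disk_num_bytes equals num_bytes as in the
uncompressed case):

  #include "delalloc-space.h"	/* now the only header a caller needs for these */

  /* reserve metadata for a delalloc write... */
  ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, num_bytes, false);
  if (ret)
  	return ret;
  /*
   * ...set up the delalloc range or ordered extent, then drop the
   * outstanding extent accounting taken by the reservation above.
   */
  btrfs_delalloc_release_extents(inode, num_bytes);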
Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/ctree.h          | 3 ---
 fs/btrfs/delalloc-space.h | 3 +++
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8ca67227dec0..afb2bb3acc76 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -500,10 +500,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
				     int nitems, bool use_global_rsv);
 void btrfs_subvolume_release_metadata(struct btrfs_root *root,
				      struct btrfs_block_rsv *rsv);
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
-int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
-				    u64 disk_num_bytes, bool noflush);
 
 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
				   u64 start, u64 end);
 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h
index e07d46043455..c5d573f2366e 100644
--- a/fs/btrfs/delalloc-space.h
+++ b/fs/btrfs/delalloc-space.h
@@ -20,5 +20,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
				     bool qgroup_free);
 int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
			struct extent_changeset **reserved, u64 start, u64 len);
+int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
+				    u64 disk_num_bytes, bool noflush);
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
 
 #endif /* BTRFS_DELALLOC_SPACE_H */

From 6d2049a2f36f873b8f793fde7ea1874d0d17a3dc Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Mon, 24 Oct 2022 14:46:59 -0400
Subject: [PATCH 093/249] btrfs: delete unused function prototypes in ctree.h

These prototypes no longer have any code associated with them, so
remove them.

Signed-off-by: Josef Bacik
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/ctree.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index afb2bb3acc76..acae38e378a6 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -507,12 +507,6 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
			 u64 num_bytes, u64 *actual_bytes);
 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range);
 
-int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
-					 struct btrfs_fs_info *fs_info);
-int btrfs_start_write_no_snapshotting(struct btrfs_root *root);
-void btrfs_end_write_no_snapshotting(struct btrfs_root *root);
-void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
-
 /* ctree.c */
 int __init btrfs_ctree_init(void);
 void __cold btrfs_ctree_exit(void);

From 45c40c8f9541f336c7857f09ea843fc8fa980b65 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Mon, 24 Oct 2022 14:47:00 -0400
Subject: [PATCH 094/249] btrfs: move root tree prototypes to their own header

Move all the root-tree.c prototypes to root-tree.h, and then update all
the necessary files to include the new header.
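For illustration only (a hypothetical snippet, not taken from the
series), a typical caller of the moved prototypes now needs just:

  #include "root-tree.h"

  /* persist an updated root item; note btrfs_update_root() is __must_check */
  ret = btrfs_update_root(trans, fs_info->tree_root,
  			&root->root_key, &root->root_item);
  if (ret)
  	btrfs_abort_transaction(trans, ret);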
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 32 -------------------------------- fs/btrfs/disk-io.c | 1 + fs/btrfs/extent-tree.c | 1 + fs/btrfs/free-space-tree.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/qgroup.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/root-tree.c | 1 + fs/btrfs/root-tree.h | 34 ++++++++++++++++++++++++++++++++++ fs/btrfs/transaction.c | 1 + fs/btrfs/tree-log.c | 1 + 12 files changed, 44 insertions(+), 32 deletions(-) create mode 100644 fs/btrfs/root-tree.h diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index acae38e378a6..77f1f345d040 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -495,12 +495,6 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) return mapping_gfp_constraint(mapping, ~__GFP_FS); } -int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv, - int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_root *root, - struct btrfs_block_rsv *rsv); - int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, @@ -695,32 +689,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *node, struct extent_buffer *parent); -/* root-item.c */ -int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 sequence, - const struct fscrypt_str *name); -int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, - u64 ref_id, u64 dirid, u64 *sequence, - const struct fscrypt_str *name); -int btrfs_del_root(struct btrfs_trans_handle *trans, - const struct btrfs_key *key); -int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, - const struct btrfs_key *key, - struct btrfs_root_item *item); -int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_key *key, - struct btrfs_root_item *item); -int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, - struct btrfs_path *path, struct btrfs_root_item *root_item, - struct btrfs_key *root_key); -int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info); -void btrfs_set_root_node(struct btrfs_root_item *item, - struct extent_buffer *node); -void btrfs_check_and_init_root_item(struct btrfs_root_item *item); -void btrfs_update_root_times(struct btrfs_trans_handle *trans, - struct btrfs_root *root); - /* uuid-tree.c */ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 005da95c9c17..ebc9baab0e47 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -46,6 +46,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d2d5de946242..510be00da406 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -39,6 +39,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index a652e5dc465b..869d062d6765 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -15,6 +15,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include 
"root-tree.h" static int __add_block_group_free_space(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cb2a35b19c75..ec9f0f4afd12 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -58,6 +58,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1d4bdb22881c..52a268e05d4a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -53,6 +53,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 88b248b22810..75dd964ca46c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -27,6 +27,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" /* * Helpers to access qgroup reservation diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 946190f13138..e345cc71da15 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -31,6 +31,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" /* * Relocation overview diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index 6aab98114253..42f046e5e25f 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -14,6 +14,7 @@ #include "qgroup.h" #include "space-info.h" #include "accessors.h" +#include "root-tree.h" /* * Read a root item from the tree. In case we detect a root item smaller then diff --git a/fs/btrfs/root-tree.h b/fs/btrfs/root-tree.h new file mode 100644 index 000000000000..cbbaca32126e --- /dev/null +++ b/fs/btrfs/root-tree.h @@ -0,0 +1,34 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ROOT_TREE_H +#define BTRFS_ROOT_TREE_H + +int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv, + int nitems, bool use_global_rsv); +void btrfs_subvolume_release_metadata(struct btrfs_root *root, + struct btrfs_block_rsv *rsv); +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 sequence, + const struct fscrypt_str *name); +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id, + u64 ref_id, u64 dirid, u64 *sequence, + const struct fscrypt_str *name); +int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + const struct btrfs_key *key, + struct btrfs_root_item *item); +int __must_check btrfs_update_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_root_item *item); +int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key, + struct btrfs_path *path, struct btrfs_root_item *root_item, + struct btrfs_key *root_key); +int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info); +void btrfs_set_root_node(struct btrfs_root_item *item, + struct extent_buffer *node); +void btrfs_check_and_init_root_item(struct btrfs_root_item *item); +void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root); + +#endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a04c2a9708e2..82b2e2ec90cf 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -27,6 +27,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include 
"root-tree.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index bd9a5eda9def..f185cd286243 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -24,6 +24,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "root-tree.h" #define MAX_CONFLICT_INODES 10 From 911bd75aca7390ca9e39191450829b09772c5a77 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 24 Oct 2022 16:16:14 -0400 Subject: [PATCH 095/249] btrfs: remove unused function prototypes I wrote the following coccinelle script to find function declarations that didn't have the corresponding code for them @funcproto@ identifier func; type T; position p0; @@ T func@p0(...); @funccode@ identifier funcproto.func; position p1; @@ func@p1(...) { ... } @script:python depends on !funccode@ p0 << funcproto.p0; @@ print("Proto with no function at %s:%s" % (p0[0].file, p0[0].line)) and ran it against btrfs, which identified the 4 function prototypes I've removed in this patch. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 --- fs/btrfs/disk-io.h | 4 ---- 2 files changed, 7 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 77f1f345d040..09005bd9bfef 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -739,7 +739,6 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); -int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); /* file-item.c */ int btrfs_del_csums(struct btrfs_trans_handle *trans, @@ -924,8 +923,6 @@ int __pure btrfs_is_empty_uuid(u8 *uuid); int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); -void btrfs_get_block_group_info(struct list_head *groups_list, - struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index da040ec327ae..6edc66b4b4d3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -116,8 +116,6 @@ int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, u64 dio_file_offset, extent_submit_bio_start_t *submit_bio_start); -blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, - int mirror_num); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, @@ -130,8 +128,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); -int btree_lock_page_hook(struct page *page, void *data, - void (*flush_fn)(void *)); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); int btrfs_init_root_free_objectid(struct btrfs_root *root); From 1751850fbd770c89347e14ae9590219bcd2f0649 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 25 Oct 2022 11:13:06 -0400 Subject: [PATCH 096/249] btrfs: remove unused btrfs_cond_migrate_bytes The last user of this was removed in 7f9fe6144076 ("btrfs: improve global reserve stealing logic"), drop this 
code as it's no longer called by anybody. Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-rsv.c | 25 ------------------------- fs/btrfs/block-rsv.h | 3 --- 2 files changed, 28 deletions(-) diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 069ea9d6a8d4..29705256727f 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -325,31 +325,6 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, spin_unlock(&block_rsv->lock); } -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor) -{ - struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; - u64 min_bytes; - - if (global_rsv->space_info != dest->space_info) - return -ENOSPC; - - spin_lock(&global_rsv->lock); - min_bytes = div_factor(global_rsv->size, min_factor); - if (global_rsv->reserved < min_bytes + num_bytes) { - spin_unlock(&global_rsv->lock); - return -ENOSPC; - } - global_rsv->reserved -= num_bytes; - if (global_rsv->reserved < global_rsv->size) - global_rsv->full = false; - spin_unlock(&global_rsv->lock); - - btrfs_block_rsv_add_bytes(dest, num_bytes, true); - return 0; -} - void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info) { struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index 578c3497a455..7e9016a9e193 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -70,9 +70,6 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, struct btrfs_block_rsv *dst_rsv, u64 num_bytes, bool update_size); int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes); -int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_block_rsv *dest, u64 num_bytes, - int min_factor); void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes, bool update_size); u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info, From 43dd529abed2bcf1467f13cce83da1c8456587ea Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 14:21:42 +0200 Subject: [PATCH 097/249] btrfs: update function comments Update, reformat or reword function comments. This also removes the kdoc marker so we don't get reports when the function name is missing. 
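For background, scripts/kernel-doc treats any comment opened with "/**"
as structured documentation and complains when it cannot find the
"function_name - short description" line it expects there, while a
plain "/*" comment is ignored by the tooling. Using a comment from the
diff below as the example:

  /* Before: the kdoc marker makes scripts/kernel-doc parse the comment */
  /**
   * Check if we can do a NOCOW write for a given extent.
   */

  /* After: a plain comment is invisible to the kernel-doc tooling */
  /*
   * Check if we can do a NOCOW write for a given extent.
   */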
Changes made: - remove kdoc markers - reformat the brief description to be a proper sentence - reword to imperative voice - align parameter list - fix typos Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 20 +++---- fs/btrfs/ctree.c | 11 ++-- fs/btrfs/delalloc-space.c | 59 +++++++++---------- fs/btrfs/delayed-ref.c | 19 +++--- fs/btrfs/discard.c | 111 ++++++++++++++++++++---------------- fs/btrfs/extent_io.c | 59 ++++++++++--------- fs/btrfs/extent_map.c | 65 ++++++++++----------- fs/btrfs/file-item.c | 50 ++++++++-------- fs/btrfs/free-space-cache.c | 9 +-- fs/btrfs/inode.c | 14 ++--- fs/btrfs/ioctl.c | 2 +- fs/btrfs/ordered-data.c | 13 +++-- fs/btrfs/raid56.c | 4 +- fs/btrfs/reflink.c | 18 +++--- fs/btrfs/space-info.c | 16 +++--- fs/btrfs/tree-log.c | 2 +- fs/btrfs/ulist.c | 35 +++++++----- fs/btrfs/volumes.c | 43 +++++++------- fs/btrfs/zoned.c | 6 +- fs/btrfs/zstd.c | 2 +- 20 files changed, 289 insertions(+), 269 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5a743c4d4e97..cfe495d39ebd 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -299,7 +299,7 @@ struct btrfs_block_group *btrfs_next_block_group( return cache; } -/** +/* * Check if we can do a NOCOW write for a given extent. * * @fs_info: The filesystem information object. @@ -340,11 +340,9 @@ struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, return bg; } -/** +/* * Decrement the number of NOCOW writers in a block group. * - * @bg: The block group. - * * This is meant to be called after a previous call to btrfs_inc_nocow_writers(), * and on the block group returned by that call. Typically this is called after * creating an ordered extent for a NOCOW write, to prevent races with scrub and @@ -1813,8 +1811,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) write_sequnlock(&fs_info->profiles_lock); } -/** - * Map a physical disk address to a list of logical addresses +/* + * Map a physical disk address to a list of logical addresses. * * @fs_info: the filesystem * @chunk_start: logical address of block group @@ -3421,8 +3419,9 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, return ret; } -/** - * btrfs_add_reserved_bytes - update the block_group and space info counters +/* + * Update the block_group and space info counters. + * * @cache: The cache we are manipulating * @ram_bytes: The number of bytes of file content, and will be same to * @num_bytes except for the compress path. @@ -3465,8 +3464,9 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, return ret; } -/** - * btrfs_free_reserved_bytes - update the block_group and space info counters +/* + * Update the block_group and space info counters. + * * @cache: The cache we are manipulating * @num_bytes: The number of bytes in question * @delalloc: The blocks are allocated for the delalloc write diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f0ddb44704ee..4b47d380da1e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2363,7 +2363,7 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, return ret; } -/** +/* * Search for a valid slot for the given path. * * @root: The root node of the tree. @@ -3985,14 +3985,15 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) } } -/** - * setup_items_for_insert - Helper called before inserting one or more items - * to a leaf. 
Main purpose is to save stack depth by doing the bulk of the work - * in a function that doesn't call btrfs_search_slot +/* + * Make space in the node before inserting one or more items. * * @root: root we are inserting items to * @path: points to the leaf/slot where we are going to insert new items * @batch: information about the batch of items to insert + * + * Main purpose is to save stack depth by doing the bulk of the work in a + * function that doesn't call btrfs_search_slot */ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_item_batch *batch) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 605d8874a446..7ddb1d104e8e 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -202,8 +202,8 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, btrfs_qgroup_free_data(inode, reserved, start, len); } -/** - * Release any excessive reservation +/* + * Release any excessive reservations for an inode. * * @inode: the inode we need to release from * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup @@ -377,12 +377,12 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, return 0; } -/** - * Release a metadata reservation for an inode +/* + * Release a metadata reservation for an inode. * - * @inode: the inode to release the reservation for. - * @num_bytes: the number of bytes we are releasing. - * @qgroup_free: free qgroup reservation or convert it to per-trans reservation + * @inode: the inode to release the reservation for. + * @num_bytes: the number of bytes we are releasing. + * @qgroup_free: free qgroup reservation or convert it to per-trans reservation * * This will release the metadata reservation for an inode. This can be called * once we complete IO for a given set of bytes to release their metadata @@ -405,10 +405,11 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, btrfs_inode_rsv_release(inode, qgroup_free); } -/** - * btrfs_delalloc_release_extents - release our outstanding_extents - * @inode: the inode to balance the reservation for. - * @num_bytes: the number of bytes we originally reserved with +/* + * Release our outstanding_extents for an inode. + * + * @inode: the inode to balance the reservation for. + * @num_bytes: the number of bytes we originally reserved with * * When we reserve space we increase outstanding_extents for the extents we may * add. Once we've set the range as delalloc or created our ordered extents we @@ -433,30 +434,30 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) btrfs_inode_rsv_release(inode, true); } -/** - * btrfs_delalloc_reserve_space - reserve data and metadata space for - * delalloc - * @inode: inode we're writing to - * @start: start range we are writing to - * @len: how long the range we are writing to - * @reserved: mandatory parameter, record actually reserved qgroup ranges of - * current reservation. +/* + * Reserve data and metadata space for delalloc + * + * @inode: inode we're writing to + * @start: start range we are writing to + * @len: how long the range we are writing to + * @reserved: mandatory parameter, record actually reserved qgroup ranges of + * current reservation. 
* * This will do the following things * - * - reserve space in data space info for num bytes - * and reserve precious corresponding qgroup space + * - reserve space in data space info for num bytes and reserve precious + * corresponding qgroup space * (Done in check_data_free_space) * * - reserve space for metadata space, based on the number of outstanding - * extents and how much csums will be needed - * also reserve metadata space in a per root over-reserve method. + * extents and how much csums will be needed also reserve metadata space in a + * per root over-reserve method. * - add to the inodes->delalloc_bytes * - add it to the fs_info's delalloc inodes list. * (Above 3 all done in delalloc_reserve_metadata) * * Return 0 for success - * Return <0 for error(-ENOSPC or -EQUOT) + * Return <0 for error(-ENOSPC or -EDQUOT) */ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len) @@ -475,7 +476,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, return ret; } -/** +/* * Release data and metadata space for delalloc * * @inode: inode we're releasing space for @@ -484,10 +485,10 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, * @len: length of the space already reserved * @qgroup_free: should qgroup reserved-space also be freed * - * This function will release the metadata space that was not used and will - * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes - * list if there are no delalloc bytes left. - * Also it will handle the qgroup reserved space. + * Release the metadata space that was not used and will decrement + * ->delalloc_bytes and remove it from the fs_info->delalloc_inodes list if + * there are no delalloc bytes left. Also it will handle the qgroup reserved + * space. */ void btrfs_delalloc_release_space(struct btrfs_inode *inode, struct extent_changeset *reserved, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 010cc16297d8..573ebab886e2 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -71,14 +71,14 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans) return btrfs_check_space_for_delayed_refs(trans->fs_info); } -/** - * Release a ref head's reservation +/* + * Release a ref head's reservation. * * @fs_info: the filesystem * @nr: number of items to drop * - * This drops the delayed ref head's count from the delayed refs rsv and frees - * any excess reservation we had. + * Drops the delayed ref head's count from the delayed refs rsv and free any + * excess reservation we had. */ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) { @@ -104,8 +104,7 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr) } /* - * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv - * @trans - the trans that may have generated delayed refs + * Adjust the size of the delayed refs rsv. * * This is to be called anytime we may have adjusted trans->delayed_ref_updates, * it'll calculate the additional size and add it to the delayed_refs_rsv. @@ -139,8 +138,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) trans->delayed_ref_updates = 0; } -/** - * Transfer bytes to our delayed refs rsv +/* + * Transfer bytes to our delayed refs rsv. 
* * @fs_info: the filesystem * @src: source block rsv to transfer from @@ -188,8 +187,8 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, delayed_refs_rsv->space_info, to_free); } -/** - * Refill based on our delayed refs usage +/* + * Refill based on our delayed refs usage. * * @fs_info: the filesystem * @flush: control how we can flush for this reservation. diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 51f0ef386046..ff2e524d9937 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -62,7 +62,7 @@ #define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL) #define BTRFS_DISCARD_MAX_IOPS (10U) -/* Montonically decreasing minimum length filters after index 0 */ +/* Monotonically decreasing minimum length filters after index 0 */ static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { 0, BTRFS_ASYNC_DISCARD_MAX_FILTER, @@ -147,10 +147,11 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, return running; } -/** - * find_next_block_group - find block_group that's up next for discarding - * @discard_ctl: discard control - * @now: current time +/* + * Find block_group that's up next for discarding. + * + * @discard_ctl: discard control + * @now: current time * * Iterate over the discard lists to find the next block_group up for * discarding checking the discard_eligible_time of block_group. @@ -185,17 +186,17 @@ static struct btrfs_block_group *find_next_block_group( return ret_block_group; } -/** - * Wrap find_next_block_group() +/* + * Look up next block group and set it for use. * * @discard_ctl: discard control * @discard_state: the discard_state of the block_group after state management * @discard_index: the discard_index of the block_group after state management * @now: time when discard was invoked, in ns * - * This wraps find_next_block_group() and sets the block_group to be in use. - * discard_state's control flow is managed here. Variables related to - * discard_state are reset here as needed (eg discard_cursor). @discard_state + * Wrap find_next_block_group() and set the block_group to be in use. + * @discard_state's control flow is managed here. Variables related to + * @discard_state are reset here as needed (eg. @discard_cursor). @discard_state * and @discard_index are remembered as it may change while we're discarding, * but we want the discard to execute in the context determined here. */ @@ -234,10 +235,11 @@ again: return block_group; } -/** - * btrfs_discard_check_filter - updates a block groups filters - * @block_group: block group of interest - * @bytes: recently freed region size after coalescing +/* + * Update a block group's filters. + * + * @block_group: block group of interest + * @bytes: recently freed region size after coalescing * * Async discard maintains multiple lists with progressively smaller filters * to prioritize discarding based on size. Should a free space that matches @@ -272,8 +274,9 @@ void btrfs_discard_check_filter(struct btrfs_block_group *block_group, } } -/** - * btrfs_update_discard_index - moves a block group along the discard lists +/* + * Move a block group along the discard lists. + * * @discard_ctl: discard control * @block_group: block_group of interest * @@ -292,13 +295,14 @@ static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl, add_to_discard_list(discard_ctl, block_group); } -/** - * btrfs_discard_cancel_work - remove a block_group from the discard lists +/* + * Remove a block_group from the discard lists. 
+ * * @discard_ctl: discard control * @block_group: block_group of interest * - * This removes @block_group from the discard lists. If necessary, it waits on - * the current work and then reschedules the delayed work. + * Remove @block_group from the discard lists. If necessary, wait on the + * current work and then reschedule the delayed work. */ void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) @@ -309,12 +313,13 @@ void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, } } -/** - * btrfs_discard_queue_work - handles queuing the block_groups +/* + * Handles queuing the block_groups. + * * @discard_ctl: discard control * @block_group: block_group of interest * - * This maintains the LRU order of the discard lists. + * Maintain the LRU order of the discard lists. */ void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) @@ -384,7 +389,8 @@ static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, } /* - * btrfs_discard_schedule_work - responsible for scheduling the discard work + * Responsible for scheduling the discard work. + * * @discard_ctl: discard control * @override: override the current timer * @@ -402,15 +408,16 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, spin_unlock(&discard_ctl->lock); } -/** - * btrfs_finish_discard_pass - determine next step of a block_group +/* + * Determine next step of a block_group. + * * @discard_ctl: discard control * @block_group: block_group of interest * - * This determines the next step for a block group after it's finished going - * through a pass on a discard list. If it is unused and fully trimmed, we can - * mark it unused and send it to the unused_bgs path. Otherwise, pass it onto - * the appropriate filter list or let it fall off. + * Determine the next step for a block group after it's finished going through + * a pass on a discard list. If it is unused and fully trimmed, we can mark it + * unused and send it to the unused_bgs path. Otherwise, pass it onto the + * appropriate filter list or let it fall off. */ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) @@ -427,12 +434,13 @@ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, } } -/** - * btrfs_discard_workfn - discard work function +/* + * Discard work queue callback + * * @work: work * - * This finds the next block_group to start discarding and then discards a - * single region. It does this in a two-pass fashion: first extents and second + * Find the next block_group to start discarding and then discard a single + * region. It does this in a two-pass fashion: first extents and second * bitmaps. Completely discarded block groups are sent to the unused_bgs path. */ static void btrfs_discard_workfn(struct work_struct *work) @@ -508,11 +516,12 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_unlock(&discard_ctl->lock); } -/** - * btrfs_run_discard_work - determines if async discard should be running +/* + * Determine if async discard should be running. + * * @discard_ctl: discard control * - * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. + * Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. 
*/ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) { @@ -524,8 +533,9 @@ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); } -/** - * btrfs_discard_calc_delay - recalculate the base delay +/* + * Recalculate the base delay. + * * @discard_ctl: discard control * * Recalculate the base delay which is based off the total number of @@ -546,7 +556,7 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) spin_lock(&discard_ctl->lock); /* - * The following is to fix a potential -1 discrepenancy that we're not + * The following is to fix a potential -1 discrepancy that we're not * sure how to reproduce. But given that this is the only place that * utilizes these numbers and this is only called by from * btrfs_finish_extent_commit() which is synchronized, we can correct @@ -579,13 +589,14 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) spin_unlock(&discard_ctl->lock); } -/** - * btrfs_discard_update_discardable - propagate discard counters +/* + * Propagate discard counters. + * * @block_group: block_group of interest * - * This propagates deltas of counters up to the discard_ctl. It maintains a - * current counter and a previous counter passing the delta up to the global - * stat. Then the current counter value becomes the previous counter value. + * Propagate deltas of counters up to the discard_ctl. It maintains a current + * counter and a previous counter passing the delta up to the global stat. + * Then the current counter value becomes the previous counter value. */ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) { @@ -620,8 +631,9 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group) } } -/** - * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists +/* + * Punt unused_bgs list to discard lists. + * * @fs_info: fs_info of interest * * The unused_bgs list needs to be punted to the discard lists because the @@ -645,8 +657,9 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) spin_unlock(&fs_info->unused_bgs_lock); } -/** - * btrfs_discard_purge_list - purge discard lists +/* + * Purge discard lists. + * * @discard_ctl: discard control * * If we are disabling async discard, we may have intercepted block groups that diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4b47bb8c590f..545d9e1f0f83 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1298,7 +1298,7 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) bio_put(bio); } -/** +/* * Populate every free slot in a provided array with pages. * * @nr_pages: number of pages to allocate @@ -1334,16 +1334,16 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) return 0; } -/** - * Attempt to add a page to bio +/* + * Attempt to add a page to bio. 
* - * @bio_ctrl: record both the bio, and its bio_flags - * @page: page to add to the bio - * @disk_bytenr: offset of the new bio or to check whether we are adding - * a contiguous page to the previous one - * @size: portion of page that we want to write - * @pg_offset: starting offset in the page - * @compress_type: compression type of the current bio to see if we can merge them + * @bio_ctrl: record both the bio, and its bio_flags + * @page: page to add to the bio + * @disk_bytenr: offset of the new bio or to check whether we are adding + * a contiguous page to the previous one + * @size: portion of page that we want to write + * @pg_offset: starting offset in the page + * @compress_type: compression type of the current bio to see if we can merge them * * Attempt to add a page to bio considering stripe alignment etc. * @@ -3066,7 +3066,7 @@ retry: return ret; } -/** +/* * Walk the list of dirty pages of the given address space and write all of them. * * @mapping: address space structure to write @@ -5460,11 +5460,12 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, *page_offset = offset_in_page(offset); } -/** - * extent_buffer_test_bit - determine whether a bit in a bitmap item is set - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @nr: bit number to test +/* + * Determine whether a bit in a bitmap item is set. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number to test */ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, unsigned long nr) @@ -5481,12 +5482,13 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } -/** - * extent_buffer_bitmap_set - set an area of a bitmap - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @pos: bit number of the first bit - * @len: number of bits to set +/* + * Set an area of a bitmap to 1. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to set */ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start, unsigned long pos, unsigned long len) @@ -5523,12 +5525,13 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star } -/** - * extent_buffer_bitmap_clear - clear an area of a bitmap - * @eb: the extent buffer - * @start: offset of the bitmap item in the extent buffer - * @pos: bit number of the first bit - * @len: number of bits to clear +/* + * Clear an area of a bitmap. + * + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to clear */ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, unsigned long start, unsigned long pos, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index f97508afb659..cd45303f344d 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -28,12 +28,9 @@ void __cold extent_map_exit(void) kmem_cache_destroy(extent_map_cache); } -/** - * extent_map_tree_init - initialize extent map tree - * @tree: tree to initialize - * - * Initialize the extent tree @tree. Should be called for each new inode - * or other user of the extent_map interface. +/* + * Initialize the extent tree @tree. Should be called for each new inode or + * other user of the extent_map interface. 
*/ void extent_map_tree_init(struct extent_map_tree *tree) { @@ -42,12 +39,9 @@ void extent_map_tree_init(struct extent_map_tree *tree) rwlock_init(&tree->lock); } -/** - * alloc_extent_map - allocate new extent map structure - * - * Allocate a new extent_map structure. The new structure is - * returned with a reference count of one and needs to be - * freed using free_extent_map() +/* + * Allocate a new extent_map structure. The new structure is returned with a + * reference count of one and needs to be freed using free_extent_map() */ struct extent_map *alloc_extent_map(void) { @@ -62,12 +56,9 @@ struct extent_map *alloc_extent_map(void) return em; } -/** - * free_extent_map - drop reference count of an extent_map - * @em: extent map being released - * - * Drops the reference out on @em by one and free the structure - * if the reference count hits zero. +/* + * Drop the reference out on @em by one and free the structure if the reference + * count hits zero. */ void free_extent_map(struct extent_map *em) { @@ -82,7 +73,7 @@ void free_extent_map(struct extent_map *em) } } -/* simple helper to do math around the end of an extent, handling wrap */ +/* Do the math around the end of an extent, handling wrapping. */ static u64 range_end(u64 start, u64 len) { if (start + len < start) @@ -138,8 +129,8 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em) } /* - * search through the tree for an extent_map with a given offset. If - * it can't be found, try to find some neighboring extents + * Search through the tree for an extent_map with a given offset. If it can't + * be found, try to find some neighboring extents */ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, struct rb_node **prev_or_next_ret) @@ -191,7 +182,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return NULL; } -/* check to see if two extent_map structs are adjacent and safe to merge */ +/* Check to see if two extent_map structs are adjacent and safe to merge. */ static int mergable_maps(struct extent_map *prev, struct extent_map *next) { if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) @@ -289,8 +280,9 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) } } -/** - * unpin_extent_cache - unpin an extent from the cache +/* + * Unpin an extent from the cache. + * * @tree: tree to unpin the extent in * @start: logical offset in the file * @len: length of the extent @@ -393,7 +385,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) } } -/** +/* * Add new extent map to the extent tree * * @tree: tree to insert new map in @@ -452,8 +444,9 @@ __lookup_extent_mapping(struct extent_map_tree *tree, return em; } -/** - * lookup_extent_mapping - lookup extent_map +/* + * Lookup extent_map that intersects @start + @len range. + * * @tree: tree to lookup in * @start: byte offset to start the search * @len: length of the lookup range @@ -469,8 +462,9 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, return __lookup_extent_mapping(tree, start, len, 1); } -/** - * search_extent_mapping - find a nearby extent map +/* + * Find a nearby extent map intersecting @start + @len (not an exact search). 
+ * * @tree: tree to lookup in * @start: byte offset to start the search * @len: length of the lookup range @@ -486,13 +480,14 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, return __lookup_extent_mapping(tree, start, len, 0); } -/** - * remove_extent_mapping - removes an extent_map from the extent tree +/* + * Remove an extent_map from the extent tree. + * * @tree: extent tree to remove from * @em: extent map being removed * - * Removes @em from @tree. No reference counts are dropped, and no checks - * are done to see if the range is in use + * Remove @em from @tree. No reference counts are dropped, and no checks + * are done to see if the range is in use. */ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { @@ -615,8 +610,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, return add_extent_mapping(em_tree, em, 0); } -/** - * Add extent mapping into em_tree +/* + * Add extent mapping into em_tree. * * @fs_info: the filesystem * @em_tree: extent tree into which we want to insert the extent mapping diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index bce6c8d31bc0..403a857d230e 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -27,8 +27,8 @@ #define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \ PAGE_SIZE)) -/** - * Set inode's size according to filesystem options +/* + * Set inode's size according to filesystem options. * * @inode: inode we want to update the disk_i_size for * @new_i_size: i_size we want to set to, 0 if we use i_size @@ -67,8 +67,8 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz spin_unlock(&inode->lock); } -/** - * Mark range within a file as having a new extent inserted +/* + * Mark range within a file as having a new extent inserted. * * @inode: inode being modified * @start: start file offset of the file extent we've inserted @@ -95,8 +95,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, EXTENT_DIRTY); } -/** - * Marks an inode range as not having a backing extent +/* + * Mark an inode range as not having a backing extent. * * @inode: inode being modified * @start: start file offset of the file extent we've inserted @@ -257,7 +257,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, /* * Find checksums for logical bytenr range [disk_bytenr, disk_bytenr + len) and - * estore the result to @dst. + * store the result to @dst. * * Return >0 for the number of sectors we found. * Return 0 for the range [disk_bytenr, disk_bytenr + sectorsize) has no csum @@ -363,15 +363,15 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode, return ret; } -/** +/* * Lookup the checksum for the read bio in csum tree. * - * @inode: inode that the bio is for. - * @bio: bio to look up. - * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return - * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If - * NULL, the checksum buffer is allocated and returned in - * btrfs_bio(bio)->csum instead. + * @inode: inode that the bio is for. + * @bio: bio to look up. + * @dst: Buffer of size nblocks * btrfs_super_csum_size() used to return + * checksum (nblocks = bio->bi_iter.bi_size / fs_info->sectorsize). If + * NULL, the checksum buffer is allocated and returned in + * btrfs_bio(bio)->csum instead. * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. 
*/ @@ -633,8 +633,8 @@ fail: return ret; } -/** - * Calculate checksums of the data contained inside a bio +/* + * Calculate checksums of the data contained inside a bio. * * @inode: Owner of the data inside the bio * @bio: Contains the data to be checksummed @@ -749,15 +749,16 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, } /* - * helper function for csum removal, this expects the - * key to describe the csum pointed to by the path, and it expects - * the csum to overlap the range [bytenr, len] + * Remove one checksum overlapping a range. * - * The csum should not be entirely contained in the range and the - * range should not be entirely contained in the csum. + * This expects the key to describe the csum pointed to by the path, and it + * expects the csum to overlap the range [bytenr, len] * - * This calls btrfs_truncate_item with the correct args based on the - * overlap, and fixes up the key as required. + * The csum should not be entirely contained in the range and the range should + * not be entirely contained in the csum. + * + * This calls btrfs_truncate_item with the correct args based on the overlap, + * and fixes up the key as required. */ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, struct btrfs_path *path, @@ -806,8 +807,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info, } /* - * deletes the csum items from the csum tree for a given - * range of bytes. + * Delete the csum items from the csum tree for a given range of bytes. */ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 26ff1a5100b9..599b41523479 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1369,8 +1369,8 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans, path, block_group->start); } -/** - * Write out cached info to an inode +/* + * Write out cached info to an inode. * * @root: root the inode belongs to * @inode: freespace inode we are writing out @@ -3034,10 +3034,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) } -/** - * btrfs_is_free_space_trimmed - see if everything is trimmed - * @block_group: block_group of interest - * +/* * Walk @block_group's free space rb_tree to determine if everything is trimmed. */ bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ec9f0f4afd12..5b9857e30a08 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6895,18 +6895,18 @@ static noinline int uncompress_inline(struct btrfs_path *path, return ret; } -/** - * btrfs_get_extent - Lookup the first extent overlapping a range in a file. +/* + * Lookup the first extent overlapping a range in a file. + * * @inode: file to search in * @page: page to read extent data into if the extent is inline * @pg_offset: offset into @page to copy to * @start: file offset * @len: length of range starting at @start * - * This returns the first &struct extent_map which overlaps with the given - * range, reading it from the B-tree and caching it if necessary. Note that - * there may be more extents which overlap the given range after the returned - * extent_map. + * Return the first &struct extent_map which overlaps the given range, reading + * it from the B-tree and caching it if necessary. Note that there may be more + * extents which overlap the given range after the returned extent_map. 
 *
 * If @page is not NULL and the extent is inline, this also reads the extent
 * data directly into the page and marks the extent up to date in the io_tree.
@@ -11310,7 +11310,7 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode,
 	spin_unlock(&inode->lock);
 }
 
-/**
+/*
  * Verify that there are no ordered extents for a given file range.
  *
  * @inode:   The target inode.
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 52a268e05d4a..96858240588d 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -4346,7 +4346,7 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
 	spin_unlock(&fs_info->balance_lock);
 }
 
-/**
+/*
  * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as
  * required.
  *
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 1cbaacdc50da..1c36407803ca 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -144,7 +144,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
 	return ret;
 }
 
-/**
+/*
  * Add an ordered extent to the per-inode tree.
  *
  * @inode:        Inode that this extent is for.
@@ -1020,17 +1020,18 @@ out:
 }
 
 /*
- * btrfs_flush_ordered_range - Lock the passed range and ensures all pending
- * ordered extents in it are run to completion.
+ * Lock the passed range and ensure all pending ordered extents in it are run
+ * to completion.
  *
  * @inode:        Inode whose ordered tree is to be searched
  * @start:        Beginning of range to flush
  * @end:          Last byte of range to lock
  * @cached_state: If passed, will return the extent state responsible for the
- * locked range. It's the caller's responsibility to free the cached state.
+ *                locked range. It's the caller's responsibility to free the
+ *                cached state.
  *
- * This function always returns with the given range locked, ensuring after it's
- * called no order extent can be pending.
+ * Always return with the given range locked, ensuring after it's called no
+ * ordered extent can be pending.
  */
 void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
 					u64 end,
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 349270537c2e..ef272e7d932c 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -908,8 +908,8 @@ static void raid_write_end_io(struct bio *bio)
 	rbio_orig_end_io(rbio, err);
 }
 
-/**
- * Get a sector pointer specified by its @stripe_nr and @sector_nr
+/*
+ * Get a sector pointer specified by its @stripe_nr and @sector_nr.
  *
  * @rbio:      The raid bio
  * @stripe_nr: Stripe number, valid range [0, real_stripe)
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index f0243eb33df7..9ba7914c7169 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -321,16 +321,16 @@ copy_to_page:
 	goto out;
 }
 
-/**
- * btrfs_clone() - clone a range from inode file to another
+/*
+ * Clone a range from one inode file to another.
* - * @src: Inode to clone from - * @inode: Inode to clone to - * @off: Offset within source to start clone from - * @olen: Original length, passed by user, of range to clone - * @olen_aligned: Block-aligned value of olen - * @destoff: Offset within @inode to start clone - * @no_time_update: Whether to update mtime/ctime on the target inode + * @src: Inode to clone from + * @inode: Inode to clone to + * @off: Offset within source to start clone from + * @olen: Original length, passed by user, of range to clone + * @olen_aligned: Block-aligned value of olen + * @destoff: Offset within @inode to start clone + * @no_time_update: Whether to update mtime/ctime on the target inode */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index e5f9f43ffa4c..11e5b5a1eb5a 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1493,8 +1493,8 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); } -/** - * Do the appropriate flushing and waiting for a ticket +/* + * Do the appropriate flushing and waiting for a ticket. * * @fs_info: the filesystem * @space_info: space info for the reservation @@ -1596,8 +1596,8 @@ static inline bool can_ticket(enum btrfs_reserve_flush_enum flush) flush != BTRFS_RESERVE_FLUSH_EMERGENCY); } -/** - * Try to reserve bytes from the block_rsv's space +/* + * Try to reserve bytes from the block_rsv's space. * * @fs_info: the filesystem * @space_info: space info we want to allocate from @@ -1736,8 +1736,8 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, orig_bytes, flush); } -/** - * Trye to reserve metadata bytes from the block_rsv's space +/* + * Try to reserve metadata bytes from the block_rsv's space. * * @fs_info: the filesystem * @block_rsv: block_rsv we're allocating for @@ -1771,8 +1771,8 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, return ret; } -/** - * Try to reserve data bytes for an allocation +/* + * Try to reserve data bytes for an allocation. * * @fs_info: the filesystem * @bytes: number of bytes we need diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f185cd286243..e358df3668e7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -7381,7 +7381,7 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, mutex_unlock(&dir->log_mutex); } -/** +/* * Update the log after adding a new name for an inode. * * @trans: Transaction handle. diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index f2f20c8d84aa..13af1e41f9d7 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -38,8 +38,9 @@ * loop would be similar to the above. */ -/** - * ulist_init - freshly initialize a ulist +/* + * Freshly initialize a ulist. + * * @ulist: the ulist to initialize * * Note: don't use this function to init an already used ulist, use @@ -52,8 +53,9 @@ void ulist_init(struct ulist *ulist) ulist->nnodes = 0; } -/** - * ulist_release - free up additionally allocated memory for the ulist +/* + * Free up additionally allocated memory for the ulist. + * * @ulist: the ulist from which to free the additional memory * * This is useful in cases where the base 'struct ulist' has been statically @@ -71,8 +73,9 @@ void ulist_release(struct ulist *ulist) INIT_LIST_HEAD(&ulist->nodes); } -/** - * ulist_reinit - prepare a ulist for reuse +/* + * Prepare a ulist for reuse. 
+ * * @ulist: ulist to be reused * * Free up all additional memory allocated for the list elements and reinit @@ -84,8 +87,9 @@ void ulist_reinit(struct ulist *ulist) ulist_init(ulist); } -/** - * ulist_alloc - dynamically allocate a ulist +/* + * Dynamically allocate a ulist. + * * @gfp_mask: allocation flags to for base allocation * * The allocated ulist will be returned in an initialized state. @@ -102,8 +106,9 @@ struct ulist *ulist_alloc(gfp_t gfp_mask) return ulist; } -/** - * ulist_free - free dynamically allocated ulist +/* + * Free dynamically allocated ulist. + * * @ulist: ulist to free * * It is not necessary to call ulist_release before. @@ -164,8 +169,9 @@ static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) return 0; } -/** - * ulist_add - add an element to the ulist +/* + * Add an element to the ulist. + * * @ulist: ulist to add the element to * @val: value to add to ulist * @aux: auxiliary value to store along with val @@ -243,8 +249,9 @@ int ulist_del(struct ulist *ulist, u64 val, u64 aux) return 0; } -/** - * ulist_next - iterate ulist +/* + * Iterate ulist. + * * @ulist: ulist to iterate * @uiter: iterator variable, initialized with ULIST_ITER_INIT(&iterator) * diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 767c56be6b96..f61bb79d4a7e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -531,14 +531,14 @@ error: return ret; } -/** - * Search and remove all stale devices (which are not mounted). - * When both inputs are NULL, it will search and release all stale devices. +/* + * Search and remove all stale devices (which are not mounted). When both + * inputs are NULL, it will search and release all stale devices. * - * @devt: Optional. When provided will it release all unmounted devices - * matching this devt only. + * @devt: Optional. When provided will it release all unmounted devices + * matching this devt only. * @skip_device: Optional. Will skip this device when searching for the stale - * devices. + * devices. * * Return: 0 for success or if @devt is 0. * -EBUSY if @devt is a mounted device. @@ -1478,8 +1478,9 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, return changed; } -/** - * dev_extent_hole_check - check if specified hole is suitable for allocation +/* + * Check if specified hole is suitable for allocation. + * * @device: the device which we have the hole * @hole_start: starting position of the hole * @hole_size: the size of the hole @@ -1533,7 +1534,8 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, } /* - * find_free_dev_extent_start - find free space in the specified device + * Find free space in the specified device. + * * @device: the device which we search the free space in * @num_bytes: the size of the free space that we need * @search_start: the position from which to begin the search @@ -1541,9 +1543,8 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, * @len: the size of the free space. that we find, or the size * of the max free space if we don't find suitable free space * - * this uses a pretty simple search, the expectation is that it is - * called very infrequently and that a given device has a small number - * of extents + * This does a pretty simple search, the expectation is that it is called very + * infrequently and that a given device has a small number of extents. * * @start is used to store the start of the free space if we find. 
But if we * don't find suitable free space, it will be used to store the start position @@ -2322,8 +2323,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) btrfs_free_device(tgtdev); } -/** - * Populate args from device at path +/* + * Populate args from device at path. * * @fs_info: the filesystem * @args: the args to populate @@ -4031,10 +4032,11 @@ error: return ret; } -/** - * alloc_profile_is_valid - see if a given profile is valid and reduced - * @flags: profile to validate - * @extended: if true @flags is treated as an extended profile +/* + * See if a given profile is valid and reduced. + * + * @flags: profile to validate + * @extended: if true @flags is treated as an extended profile */ static int alloc_profile_is_valid(u64 flags, int extended) { @@ -7009,8 +7011,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, return device; } -/** - * btrfs_alloc_device - allocate struct btrfs_device +/* + * Allocate new device struct, set up devid and UUID. + * * @fs_info: used only for generating a new devid, can be NULL if * devid is provided (i.e. @devid != NULL). * @devid: a pointer to devid for this device. If NULL a new devid diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index fe66cb929af9..13034c3bbb65 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1020,8 +1020,8 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); } -/** - * btrfs_find_allocatable_zones - find allocatable zones within a given region +/* + * Find allocatable zones within a given region. * * @device: the device to allocate a region on * @hole_start: the position of the hole to allocate the region @@ -1864,7 +1864,7 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, return device; } -/** +/* * Activate block group and underlying device zones * * @block_group: the block group to activate diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 35a0224d4eb7..4575b3703e74 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -94,7 +94,7 @@ static inline struct workspace *list_to_workspace(struct list_head *list) void zstd_free_workspace(struct list_head *ws); struct list_head *zstd_alloc_workspace(unsigned int level); -/** +/* * Timer callback to free unused workspaces. * * @t: timer From cb9a10a6504bb35095e97f4839fcf353bf1458ea Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:16 -0400 Subject: [PATCH 098/249] btrfs: convert discard stat defs to enum Do away with the defines and use an enum as it's cleaner. Suggested-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index cab954a9d97b..a855e0483e03 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -48,9 +48,11 @@ static inline bool btrfs_free_space_trimming_bitmap( * to make it clear what we're doing. An example is discard_extents in * btrfs_free_space_ctl. 
 */
-#define BTRFS_STAT_NR_ENTRIES	2
-#define BTRFS_STAT_CURR		0
-#define BTRFS_STAT_PREV		1
+enum {
+	BTRFS_STAT_CURR,
+	BTRFS_STAT_PREV,
+	BTRFS_STAT_NR_ENTRIES,
+};
 
 struct btrfs_free_space_ctl {
 	spinlock_t tree_lock;

From b31bed170d5241d4c9e2fc572f31fc15b274a6c9 Mon Sep 17 00:00:00 2001
From: Josef Bacik 
Date: Wed, 26 Oct 2022 15:08:17 -0400
Subject: [PATCH 099/249] btrfs: move btrfs_chunk_item_size out of ctree.h

This is used by the volumes code and the tree checker code. We want to
keep it inline, however, so simply move it to volumes.h.

Signed-off-by: Josef Bacik 
Reviewed-by: David Sterba 
Signed-off-by: David Sterba 
---
 fs/btrfs/ctree.h   | 7 -------
 fs/btrfs/volumes.h | 8 ++++++++
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 09005bd9bfef..1e0944c37547 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -54,13 +54,6 @@ struct btrfs_balance_control;
 struct btrfs_delayed_root;
 struct reloc_control;
 
-static inline unsigned long btrfs_chunk_item_size(int num_stripes)
-{
-	BUG_ON(num_stripes == 0);
-	return sizeof(struct btrfs_chunk) +
-		sizeof(struct btrfs_stripe) * (num_stripes - 1);
-}
-
 /* Read ahead values for struct btrfs_path.reada */
 enum {
 	READA_NONE,
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index a20ee7d57831..b05124e62412 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include "async-thread.h"
+#include "messages.h"
 
 #define BTRFS_MAX_DATA_CHUNK_SIZE	(10ULL * SZ_1G)
 
@@ -605,6 +606,13 @@ static inline enum btrfs_map_op btrfs_op(struct bio *bio)
 	}
 }
 
+static inline unsigned long btrfs_chunk_item_size(int num_stripes)
+{
+	ASSERT(num_stripes);
+	return sizeof(struct btrfs_chunk) +
+		sizeof(struct btrfs_stripe) * (num_stripes - 1);
+}
+
 void btrfs_get_bioc(struct btrfs_io_context *bioc);
 void btrfs_put_bioc(struct btrfs_io_context *bioc);
 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,

From 3683fbbc2314d6721a353f8bf4285a819ea9b512 Mon Sep 17 00:00:00 2001
From: Josef Bacik 
Date: Wed, 26 Oct 2022 15:08:18 -0400
Subject: [PATCH 100/249] btrfs: add dependencies to fs.h and block-rsv.h

There are several structures embedded inside of fs_info, so if we don't
have all the proper includes when we include fs.h we'll get a variety
of compile errors. I fixed this by adding a temporary C file that just
had #include "fs.h" and then added include files until the compiler
stopped complaining.
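A minimal sketch of that probe (the file name here is made up, and the
unit is deleted once fs.h builds standalone):

	/* fs/btrfs/fs-include-test.c: throwaway unit that must compile alone. */
	#include "fs.h"

Temporarily listing the object in the btrfs Makefile and rebuilding just
that file after every added include shows which dependency is still
missing.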
Reviewed-by: Johannes Thumshirn 
Signed-off-by: Josef Bacik 
Reviewed-by: David Sterba 
Signed-off-by: David Sterba 
---
 fs/btrfs/block-rsv.h | 1 +
 fs/btrfs/fs.h        | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 7e9016a9e193..e2cf0dd43203 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -4,6 +4,7 @@
 #define BTRFS_BLOCK_RSV_H
 
 struct btrfs_trans_handle;
+struct btrfs_root;
 enum btrfs_reserve_flush_enum;
 
 /*
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index d8c931014c2a..a49d11127bf7 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -3,6 +3,14 @@
 #ifndef BTRFS_FS_H
 #define BTRFS_FS_H
 
+#include 
+#include 
+#include 
+#include "extent-io-tree.h"
+#include "extent_map.h"
+#include "async-thread.h"
+#include "block-rsv.h"
+
 #define BTRFS_MAX_EXTENT_SIZE SZ_128M
 
 #define BTRFS_OLDEST_GENERATION	0ULL

From 5034388342564877879e2a94faf72eff468a7437 Mon Sep 17 00:00:00 2001
From: Josef Bacik 
Date: Wed, 26 Oct 2022 15:08:19 -0400
Subject: [PATCH 101/249] btrfs: add blk_types.h include to compression.h

When moving the printk messages into their own file I got a compiler
error because the includes grabbed compression.h, but nothing pulled in
the blk_types.h dependency that compression.h has because it uses
blk_status_t. Add blk_types.h to compression.h so that this sort of
thing doesn't happen in the future.

Reviewed-by: Johannes Thumshirn 
Signed-off-by: Josef Bacik 
Reviewed-by: David Sterba 
Signed-off-by: David Sterba 
---
 fs/btrfs/compression.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 9da2343eeff5..b961462399ae 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -6,6 +6,7 @@
 #ifndef BTRFS_COMPRESSION_H
 #define BTRFS_COMPRESSION_H
 
+#include <linux/blk_types.h>
 #include 
 
 struct btrfs_inode;

From 083bd7e54e8e2c1d169f21236873272cdf6f409a Mon Sep 17 00:00:00 2001
From: Josef Bacik 
Date: Wed, 26 Oct 2022 15:08:20 -0400
Subject: [PATCH 102/249] btrfs: move the printk and assert helpers to messages.c

These helpers are core to btrfs, and in order to more easily sync
various parts of the btrfs kernel code into btrfs-progs we need to be
able to carry these helpers with us. However, we want our own
implementation of the helpers in btrfs-progs; currently they are
implemented in different files that we want to sync inside of
btrfs-progs itself. Move these into their own C file; this will allow
us to contain our overrides in btrfs-progs in its own file without
messing with the rest of the codebase.

In copying things over I fixed up a few whitespace errors that already
existed.
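As a rough sketch of the kind of override this enables (hypothetical
btrfs-progs code, not part of this patch), userspace can ship its own
messages.c that maps the same helper onto stderr:

	/* Hypothetical userspace stand-in for the kernel helper of the
	 * same name; abort() plays the role of the kernel's BUG(). */
	#include <stdio.h>
	#include <stdlib.h>

	void btrfs_assertfail(const char *expr, const char *file, int line)
	{
		fprintf(stderr, "assertion failed: %s, in %s:%d\n",
			expr, file, line);
		abort();
	}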
Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/messages.c | 352 ++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/super.c | 346 ------------------------------------------- 3 files changed, 353 insertions(+), 347 deletions(-) create mode 100644 fs/btrfs/messages.c diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 76f90dcfb14d..bb170c9d030d 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o extent-io-tree.o fs.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c new file mode 100644 index 000000000000..196757ee16f1 --- /dev/null +++ b/fs/btrfs/messages.c @@ -0,0 +1,352 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "fs.h" +#include "messages.h" +#include "discard.h" +#include "transaction.h" +#include "space-info.h" + +#ifdef CONFIG_PRINTK + +#define STATE_STRING_PREFACE ": state " +#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) + +/* + * Characters to print to indicate error conditions or uncommon filesystem state. + * RO is not an error. + */ +static const char fs_state_chars[] = { + [BTRFS_FS_STATE_ERROR] = 'E', + [BTRFS_FS_STATE_REMOUNTING] = 'M', + [BTRFS_FS_STATE_RO] = 0, + [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', + [BTRFS_FS_STATE_DEV_REPLACING] = 'R', + [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, + [BTRFS_FS_STATE_NO_CSUMS] = 'C', + [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', +}; + +static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) +{ + unsigned int bit; + bool states_printed = false; + unsigned long fs_state = READ_ONCE(info->fs_state); + char *curr = buf; + + memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); + curr += sizeof(STATE_STRING_PREFACE) - 1; + + for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { + WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); + if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { + *curr++ = fs_state_chars[bit]; + states_printed = true; + } + } + + /* If no states were printed, reset the buffer */ + if (!states_printed) + curr = buf; + + *curr++ = 0; +} +#endif + +/* + * Generally the error codes correspond to their respective errors, but there + * are a few special cases. + * + * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for + * instance will return EUCLEAN if any of the blocks are corrupted in + * a way that is problematic. We want to reserve EUCLEAN for these + * sort of corruptions. + * + * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we + * need to use EROFS for this case. We will have no idea of the + * original failure, that will have been reported at the time we tripped + * over the error. Each subsequent error that doesn't have any context + * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. 
+ */ +const char * __attribute_const__ btrfs_decode_error(int errno) +{ + char *errstr = "unknown"; + + switch (errno) { + case -ENOENT: /* -2 */ + errstr = "No such entry"; + break; + case -EIO: /* -5 */ + errstr = "IO failure"; + break; + case -ENOMEM: /* -12*/ + errstr = "Out of memory"; + break; + case -EEXIST: /* -17 */ + errstr = "Object already exists"; + break; + case -ENOSPC: /* -28 */ + errstr = "No space left"; + break; + case -EROFS: /* -30 */ + errstr = "Readonly filesystem"; + break; + case -EOPNOTSUPP: /* -95 */ + errstr = "Operation not supported"; + break; + case -EUCLEAN: /* -117 */ + errstr = "Filesystem corrupted"; + break; + case -EDQUOT: /* -122 */ + errstr = "Quota exceeded"; + break; + } + + return errstr; +} + +/* + * __btrfs_handle_fs_error decodes expected errors from the caller and + * invokes the appropriate error response. + */ +__cold +void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + struct super_block *sb = fs_info->sb; +#ifdef CONFIG_PRINTK + char statestr[STATE_STRING_BUF_LEN]; + const char *errstr; +#endif + +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit( + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", KERN_CRIT, fmt); +#endif + + /* + * Special case: if the error is EROFS, and we're already under + * SB_RDONLY, then it is safe here. + */ + if (errno == -EROFS && sb_rdonly(sb)) + return; + +#ifdef CONFIG_PRINTK + errstr = btrfs_decode_error(errno); + btrfs_state_to_string(fs_info, statestr); + if (fmt) { + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", + sb->s_id, statestr, function, line, errno, errstr, &vaf); + va_end(args); + } else { + pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", + sb->s_id, statestr, function, line, errno, errstr); + } +#endif + + /* + * Today we only save the error info to memory. Long term we'll also + * send it down to the disk. + */ + set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); + + /* Don't go through full error handling during mount. */ + if (!(sb->s_flags & SB_BORN)) + return; + + if (sb_rdonly(sb)) + return; + + btrfs_discard_stop(fs_info); + + /* Handle error by forcing the filesystem readonly. */ + btrfs_set_sb_rdonly(sb); + btrfs_info(fs_info, "forced readonly"); + /* + * Note that a running device replace operation is not canceled here + * although there is no way to update the progress. It would add the + * risk of a deadlock, therefore the canceling is omitted. The only + * penalty is that some I/O remains active until the procedure + * completes. The next time when the filesystem is mounted writable + * again, the device replace operation continues. + */ +} + +#ifdef CONFIG_PRINTK +static const char * const logtypes[] = { + "emergency", + "alert", + "critical", + "error", + "warning", + "notice", + "info", + "debug", +}; + +/* + * Use one ratelimit state per log level so that a flood of less important + * messages doesn't cause more important ones to be dropped. 
+ */ +static struct ratelimit_state printk_limits[] = { + RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), + RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), +}; + +void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +{ + char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; + struct va_format vaf; + va_list args; + int kern_level; + const char *type = logtypes[4]; + struct ratelimit_state *ratelimit = &printk_limits[4]; + +#ifdef CONFIG_PRINTK_INDEX + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); +#endif + + va_start(args, fmt); + + while ((kern_level = printk_get_level(fmt)) != 0) { + size_t size = printk_skip_level(fmt) - fmt; + + if (kern_level >= '0' && kern_level <= '7') { + memcpy(lvl, fmt, size); + lvl[size] = '\0'; + type = logtypes[kern_level - '0']; + ratelimit = &printk_limits[kern_level - '0']; + } + fmt += size; + } + + vaf.fmt = fmt; + vaf.va = &args; + + if (__ratelimit(ratelimit)) { + if (fs_info) { + char statestr[STATE_STRING_BUF_LEN]; + + btrfs_state_to_string(fs_info, statestr); + _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, + fs_info->sb->s_id, statestr, &vaf); + } else { + _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + } + } + + va_end(args); +} +#endif + +#ifdef CONFIG_BTRFS_ASSERT +void __cold btrfs_assertfail(const char *expr, const char *file, int line) +{ + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); + BUG(); +} +#endif + +void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) +{ + btrfs_err(fs_info, +"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); +} + +#if BITS_PER_LONG == 32 +void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) { + btrfs_warn(fs_info, "reaching 32bit limit for logical addresses"); + btrfs_warn(fs_info, +"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_warn(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} + +void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) { + btrfs_err(fs_info, "reached 32bit limit for logical addresses"); + btrfs_err(fs_info, +"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_err(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} +#endif + +/* + * We only mark the transaction aborted and then set the file system read-only. + * This will prevent new transactions from starting or trying to join this + * one. + * + * This means that error recovery at the call site is limited to freeing + * any local memory allocations and passing the error code up without + * further cleanup. The transaction should complete as it normally would + * in the call path but will return -EIO. 
+ * + * We'll complete the cleanup in btrfs_end_transaction and + * btrfs_commit_transaction. + */ +__cold +void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, + const char *function, + unsigned int line, int errno, bool first_hit) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + + WRITE_ONCE(trans->aborted, errno); + WRITE_ONCE(trans->transaction->aborted, errno); + if (first_hit && errno == -ENOSPC) + btrfs_dump_space_info_for_trans_abort(fs_info); + /* Wake up anybody who may be waiting on this transaction */ + wake_up(&fs_info->transaction_wait); + wake_up(&fs_info->transaction_blocked_wait); + __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); +} + +/* + * __btrfs_panic decodes unexpected, fatal errors from the caller, issues an + * alert, and either panics or BUGs, depending on mount options. + */ +__cold +void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, + unsigned int line, int errno, const char *fmt, ...) +{ + char *s_id = ""; + const char *errstr; + struct va_format vaf = { .fmt = fmt }; + va_list args; + + if (fs_info) + s_id = fs_info->sb->s_id; + + va_start(args, fmt); + vaf.va = &args; + + errstr = btrfs_decode_error(errno); + if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) + panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", + s_id, function, line, &vaf, errno, errstr); + + btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", + function, line, &vaf, errno, errstr); + va_end(args); + /* Caller calls BUG() */ +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 33bf211699f8..a4030dfeb2f2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -70,352 +70,6 @@ static struct file_system_type btrfs_root_fs_type; static int btrfs_remount(struct super_block *sb, int *flags, char *data); -#ifdef CONFIG_PRINTK - -#define STATE_STRING_PREFACE ": state " -#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) - -/* - * Characters to print to indicate error conditions or uncommon filesystem state. - * RO is not an error. - */ -static const char fs_state_chars[] = { - [BTRFS_FS_STATE_ERROR] = 'E', - [BTRFS_FS_STATE_REMOUNTING] = 'M', - [BTRFS_FS_STATE_RO] = 0, - [BTRFS_FS_STATE_TRANS_ABORTED] = 'A', - [BTRFS_FS_STATE_DEV_REPLACING] = 'R', - [BTRFS_FS_STATE_DUMMY_FS_INFO] = 0, - [BTRFS_FS_STATE_NO_CSUMS] = 'C', - [BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L', -}; - -static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) -{ - unsigned int bit; - bool states_printed = false; - unsigned long fs_state = READ_ONCE(info->fs_state); - char *curr = buf; - - memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); - curr += sizeof(STATE_STRING_PREFACE) - 1; - - for_each_set_bit(bit, &fs_state, sizeof(fs_state)) { - WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT); - if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) { - *curr++ = fs_state_chars[bit]; - states_printed = true; - } - } - - /* If no states were printed, reset the buffer */ - if (!states_printed) - curr = buf; - - *curr++ = 0; -} -#endif - -/* - * Generally the error codes correspond to their respective errors, but there - * are a few special cases. - * - * EUCLEAN: Any sort of corruption that we encounter. The tree-checker for - * instance will return EUCLEAN if any of the blocks are corrupted in - * a way that is problematic. We want to reserve EUCLEAN for these - * sort of corruptions. 
- * - * EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we - * need to use EROFS for this case. We will have no idea of the - * original failure, that will have been reported at the time we tripped - * over the error. Each subsequent error that doesn't have any context - * of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR. - */ -const char * __attribute_const__ btrfs_decode_error(int errno) -{ - char *errstr = "unknown"; - - switch (errno) { - case -ENOENT: /* -2 */ - errstr = "No such entry"; - break; - case -EIO: /* -5 */ - errstr = "IO failure"; - break; - case -ENOMEM: /* -12*/ - errstr = "Out of memory"; - break; - case -EEXIST: /* -17 */ - errstr = "Object already exists"; - break; - case -ENOSPC: /* -28 */ - errstr = "No space left"; - break; - case -EROFS: /* -30 */ - errstr = "Readonly filesystem"; - break; - case -EOPNOTSUPP: /* -95 */ - errstr = "Operation not supported"; - break; - case -EUCLEAN: /* -117 */ - errstr = "Filesystem corrupted"; - break; - case -EDQUOT: /* -122 */ - errstr = "Quota exceeded"; - break; - } - - return errstr; -} - -/* - * __btrfs_handle_fs_error decodes expected errors from the caller and - * invokes the appropriate error response. - */ -__cold -void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - struct super_block *sb = fs_info->sb; -#ifdef CONFIG_PRINTK - char statestr[STATE_STRING_BUF_LEN]; - const char *errstr; -#endif - -#ifdef CONFIG_PRINTK_INDEX - printk_index_subsys_emit( - "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", - KERN_CRIT, fmt); -#endif - - /* - * Special case: if the error is EROFS, and we're already - * under SB_RDONLY, then it is safe here. - */ - if (errno == -EROFS && sb_rdonly(sb)) - return; - -#ifdef CONFIG_PRINTK - errstr = btrfs_decode_error(errno); - btrfs_state_to_string(fs_info, statestr); - if (fmt) { - struct va_format vaf; - va_list args; - - va_start(args, fmt); - vaf.fmt = fmt; - vaf.va = &args; - - pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n", - sb->s_id, statestr, function, line, errno, errstr, &vaf); - va_end(args); - } else { - pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n", - sb->s_id, statestr, function, line, errno, errstr); - } -#endif - - /* - * Today we only save the error info to memory. Long term we'll - * also send it down to the disk - */ - set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); - - /* Don't go through full error handling during mount */ - if (!(sb->s_flags & SB_BORN)) - return; - - if (sb_rdonly(sb)) - return; - - btrfs_discard_stop(fs_info); - - /* btrfs handle error by forcing the filesystem readonly */ - btrfs_set_sb_rdonly(sb); - btrfs_info(fs_info, "forced readonly"); - /* - * Note that a running device replace operation is not canceled here - * although there is no way to update the progress. It would add the - * risk of a deadlock, therefore the canceling is omitted. The only - * penalty is that some I/O remains active until the procedure - * completes. The next time when the filesystem is mounted writable - * again, the device replace operation continues. - */ -} - -#ifdef CONFIG_PRINTK -static const char * const logtypes[] = { - "emergency", - "alert", - "critical", - "error", - "warning", - "notice", - "info", - "debug", -}; - - -/* - * Use one ratelimit state per log level so that a flood of less important - * messages doesn't cause more important ones to be dropped. 
- */ -static struct ratelimit_state printk_limits[] = { - RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100), - RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), -}; - -void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) -{ - char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; - struct va_format vaf; - va_list args; - int kern_level; - const char *type = logtypes[4]; - struct ratelimit_state *ratelimit = &printk_limits[4]; - -#ifdef CONFIG_PRINTK_INDEX - printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); -#endif - - va_start(args, fmt); - - while ((kern_level = printk_get_level(fmt)) != 0) { - size_t size = printk_skip_level(fmt) - fmt; - - if (kern_level >= '0' && kern_level <= '7') { - memcpy(lvl, fmt, size); - lvl[size] = '\0'; - type = logtypes[kern_level - '0']; - ratelimit = &printk_limits[kern_level - '0']; - } - fmt += size; - } - - vaf.fmt = fmt; - vaf.va = &args; - - if (__ratelimit(ratelimit)) { - if (fs_info) { - char statestr[STATE_STRING_BUF_LEN]; - - btrfs_state_to_string(fs_info, statestr); - _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, - fs_info->sb->s_id, statestr, &vaf); - } else { - _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); - } - } - - va_end(args); -} -#endif - -#ifdef CONFIG_BTRFS_ASSERT -void __cold btrfs_assertfail(const char *expr, const char *file, int line) -{ - pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); - BUG(); -} -#endif - -void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info) -{ - btrfs_err(fs_info, -"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel"); -} - -#if BITS_PER_LONG == 32 -void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) -{ - if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) { - btrfs_warn(fs_info, "reaching 32bit limit for logical addresses"); - btrfs_warn(fs_info, -"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT", - BTRFS_32BIT_MAX_FILE_SIZE >> 40); - btrfs_warn(fs_info, - "please consider upgrading to 64bit kernel/hardware"); - } -} - -void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) -{ - if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) { - btrfs_err(fs_info, "reached 32bit limit for logical addresses"); - btrfs_err(fs_info, -"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed", - BTRFS_32BIT_MAX_FILE_SIZE >> 40); - btrfs_err(fs_info, - "please consider upgrading to 64bit kernel/hardware"); - } -} -#endif - -/* - * We only mark the transaction aborted and then set the file system read-only. - * This will prevent new transactions from starting or trying to join this - * one. - * - * This means that error recovery at the call site is limited to freeing - * any local memory allocations and passing the error code up without - * further cleanup. The transaction should complete as it normally would - * in the call path but will return -EIO. 
- * - * We'll complete the cleanup in btrfs_end_transaction and - * btrfs_commit_transaction. - */ -__cold -void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, - const char *function, - unsigned int line, int errno, bool first_hit) -{ - struct btrfs_fs_info *fs_info = trans->fs_info; - - WRITE_ONCE(trans->aborted, errno); - WRITE_ONCE(trans->transaction->aborted, errno); - if (first_hit && errno == -ENOSPC) - btrfs_dump_space_info_for_trans_abort(fs_info); - /* Wake up anybody who may be waiting on this transaction */ - wake_up(&fs_info->transaction_wait); - wake_up(&fs_info->transaction_blocked_wait); - __btrfs_handle_fs_error(fs_info, function, line, errno, NULL); -} -/* - * __btrfs_panic decodes unexpected, fatal errors from the caller, - * issues an alert, and either panics or BUGs, depending on mount options. - */ -__cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, - unsigned int line, int errno, const char *fmt, ...) -{ - char *s_id = ""; - const char *errstr; - struct va_format vaf = { .fmt = fmt }; - va_list args; - - if (fs_info) - s_id = fs_info->sb->s_id; - - va_start(args, fmt); - vaf.va = &args; - - errstr = btrfs_decode_error(errno); - if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR))) - panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n", - s_id, function, line, &vaf, errno, errstr); - - btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)", - function, line, &vaf, errno, errstr); - va_end(args); - /* Caller calls BUG() */ -} - static void btrfs_put_super(struct super_block *sb) { close_ctree(btrfs_sb(sb)); From 2885fd632050ef47317a97a3a68bc98634a6e11e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:21 -0400 Subject: [PATCH 103/249] btrfs: move inode prototypes to btrfs_inode.h I initially wanted to make a new header file for this, but these prototypes do naturally fit into btrfs_inode.h. If we want to extract vfs from pure btrfs code in the future we may need to split this up, but btrfs_inode embeds the vfs_inode, so it makes sense to put the prototypes in this header for now. 
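For context, the embedding looks roughly like this (a simplified sketch,
most fields elided):

	struct btrfs_inode {
		/* ... btrfs specific state ... */
		struct inode vfs_inode;	/* the embedded VFS inode */
	};

	static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
	{
		return container_of(inode, struct btrfs_inode, vfs_inode);
	}

Since every VFS inode we hand around lives inside a btrfs_inode, the
prototypes that take either type sit naturally next to the structure.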
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 136 +++++++++++++++++++++++++++++++++++++++ fs/btrfs/ctree.h | 140 ----------------------------------------- 2 files changed, 136 insertions(+), 140 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 530a0ebfab3f..d21c30bf7053 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -410,4 +410,140 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes + +void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type); +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected); +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); +unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, + u64 start, u64 end); +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); +noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, + u64 *orig_start, u64 *orig_block_len, + u64 *ram_bytes, bool nowait, bool strict); + +void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode); +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); +int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_inode *dir, struct btrfs_inode *inode, + const struct fscrypt_str *name); +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct btrfs_inode *parent_inode, struct btrfs_inode *inode, + const struct fscrypt_str *name, int add_backref, u64 index); +int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); +int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, + int front); + +int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, + bool in_reclaim_context); +int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, + unsigned int extra_bits, + struct extent_state **cached_state); + +struct btrfs_new_inode_args { + /* Input */ + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool orphan; + bool subvol; + + /* Output from btrfs_new_inode_prepare(), input to btrfs_create_new_inode(). 
*/ + struct posix_acl *default_acl; + struct posix_acl *acl; + struct fscrypt_name fname; +}; + +int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, + unsigned int *trans_num_items); +int btrfs_create_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args); +void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); +struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, + struct inode *dir); + void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, + u32 bits); +void btrfs_clear_delalloc_extent(struct inode *inode, + struct extent_state *state, u32 bits); +void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, + struct extent_state *other); +void btrfs_split_delalloc_extent(struct inode *inode, + struct extent_state *orig, u64 split); +void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); +vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); +void btrfs_evict_inode(struct inode *inode); +struct inode *btrfs_alloc_inode(struct super_block *sb); +void btrfs_destroy_inode(struct inode *inode); +void btrfs_free_inode(struct inode *inode); +int btrfs_drop_inode(struct inode *inode); +int __init btrfs_init_cachep(void); +void __cold btrfs_destroy_cachep(void); +struct inode *btrfs_iget_path(struct super_block *s, u64 ino, + struct btrfs_root *root, struct btrfs_path *path); +struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); +struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, + struct page *page, size_t pg_offset, + u64 start, u64 end); +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode); +int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode); +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); +int btrfs_orphan_cleanup(struct btrfs_root *root); +int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); +void btrfs_add_delayed_iput(struct inode *inode); +void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); +int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); +int btrfs_prealloc_file_range(struct inode *inode, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_prealloc_file_range_trans(struct inode *inode, + struct btrfs_trans_handle *trans, int mode, + u64 start, u64 num_bytes, u64 min_size, + loff_t actual_len, u64 *alloc_hint); +int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, struct writeback_control *wbc); +int btrfs_writepage_cow_fixup(struct page *page); +void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, + struct page *page, u64 start, + u64 end, bool uptodate); +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type); +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, + struct page **pages); +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ioctl_encoded_io_args *encoded); +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); + +ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); 
+struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); + +extern const struct dentry_operations btrfs_dentry_operations; + +/* Inode locking type flags, by default the exclusive lock is taken. */ +enum btrfs_ilock_type { + ENUM_BIT(BTRFS_ILOCK_SHARED), + ENUM_BIT(BTRFS_ILOCK_TRY), + ENUM_BIT(BTRFS_ILOCK_MMAP), +}; + +int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); +void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); +void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, + const u64 del_bytes); +void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); + #endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1e0944c37547..a9267b117860 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -764,146 +764,6 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); u64 btrfs_file_extent_end(const struct btrfs_path *path); -/* inode.c */ -void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); -void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, - int mirror_num, enum btrfs_compression_type compress_type); -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected); -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); -unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, - u64 start, u64 end); -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, - u64 *orig_start, u64 *orig_block_len, - u64 *ram_bytes, bool nowait, bool strict); - -void __btrfs_del_delalloc_inode(struct btrfs_root *root, - struct btrfs_inode *inode); -struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); -int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index); -int btrfs_unlink_inode(struct btrfs_trans_handle *trans, - struct btrfs_inode *dir, struct btrfs_inode *inode, - const struct fscrypt_str *name); -int btrfs_add_link(struct btrfs_trans_handle *trans, - struct btrfs_inode *parent_inode, struct btrfs_inode *inode, - const struct fscrypt_str *name, int add_backref, u64 index); -int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); -int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - int front); - -int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, - bool in_reclaim_context); -int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, - unsigned int extra_bits, - struct extent_state **cached_state); -struct btrfs_new_inode_args { - /* Input */ - struct inode *dir; - struct dentry *dentry; - struct inode *inode; - bool orphan; - bool subvol; - - /* - * Output from btrfs_new_inode_prepare(), input to - * btrfs_create_new_inode(). 
- */ - struct posix_acl *default_acl; - struct posix_acl *acl; - struct fscrypt_name fname; -}; -int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, - unsigned int *trans_num_items); -int btrfs_create_new_inode(struct btrfs_trans_handle *trans, - struct btrfs_new_inode_args *args); -void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); -struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, - struct inode *dir); - void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, - u32 bits); -void btrfs_clear_delalloc_extent(struct inode *inode, - struct extent_state *state, u32 bits); -void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, - struct extent_state *other); -void btrfs_split_delalloc_extent(struct inode *inode, - struct extent_state *orig, u64 split); -void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); -vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); -void btrfs_evict_inode(struct inode *inode); -struct inode *btrfs_alloc_inode(struct super_block *sb); -void btrfs_destroy_inode(struct inode *inode); -void btrfs_free_inode(struct inode *inode); -int btrfs_drop_inode(struct inode *inode); -int __init btrfs_init_cachep(void); -void __cold btrfs_destroy_cachep(void); -struct inode *btrfs_iget_path(struct super_block *s, u64 ino, - struct btrfs_root *root, struct btrfs_path *path); -struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root); -struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 end); -int btrfs_update_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); -int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode); -int btrfs_orphan_add(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode); -int btrfs_orphan_cleanup(struct btrfs_root *root); -int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); -void btrfs_add_delayed_iput(struct inode *inode); -void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); -int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); -int btrfs_prealloc_file_range(struct inode *inode, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint); -int btrfs_prealloc_file_range_trans(struct inode *inode, - struct btrfs_trans_handle *trans, int mode, - u64 start, u64 num_bytes, u64 min_size, - loff_t actual_len, u64 *alloc_hint); -int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page, - u64 start, u64 end, int *page_started, unsigned long *nr_written, - struct writeback_control *wbc); -int btrfs_writepage_cow_fixup(struct page *page); -void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, - struct page *page, u64 start, - u64 end, bool uptodate); -int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, - int compress_type); -int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, u64 disk_bytenr, - u64 disk_io_size, - struct page **pages); -ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, - struct btrfs_ioctl_encoded_io_args *encoded); -ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, - const struct btrfs_ioctl_encoded_io_args *encoded); - -ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before); 
-struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, - size_t done_before); - -extern const struct dentry_operations btrfs_dentry_operations; - -/* Inode locking type flags, by default the exclusive lock is taken */ -enum btrfs_ilock_type { - ENUM_BIT(BTRFS_ILOCK_SHARED), - ENUM_BIT(BTRFS_ILOCK_TRY), - ENUM_BIT(BTRFS_ILOCK_MMAP), -}; - -int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); -void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); -void btrfs_update_inode_bytes(struct btrfs_inode *inode, - const u64 add_bytes, - const u64 del_bytes); -void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); - /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); From 778dd695dd4d5a21eff07bb1570b570da69dfbd9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:22 -0400 Subject: [PATCH 104/249] btrfs: rename tree-defrag.c to defrag.c This currently has only one helper in it, and it's for tree based defrag. We have the various defrag code in 3 different places, so rename this to defrag.c. Followup patches will move the code into this new file. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/{tree-defrag.c => defrag.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename fs/btrfs/{tree-defrag.c => defrag.c} (100%) diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index bb170c9d030d..84fb3b4c35b0 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -23,7 +23,7 @@ obj-$(CONFIG_BTRFS_FS) := btrfs.o btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ file-item.o inode-item.o disk-io.o \ - transaction.o inode.o file.o tree-defrag.o \ + transaction.o inode.o file.o defrag.o \ extent_map.o sysfs.o accessors.o xattr.o ordered-data.o \ extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \ diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/defrag.c similarity index 100% rename from fs/btrfs/tree-defrag.c rename to fs/btrfs/defrag.c From 6e3df18ba7e8e68015dd66bcab326a4b7aaed085 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:23 -0400 Subject: [PATCH 105/249] btrfs: move the auto defrag code to defrag.c This currently exists in file.c, move it to the more natural location in defrag.c. Signed-off-by: Josef Bacik [ reformat comments ] Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 337 +++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/file.c | 340 ---------------------------------------------- 2 files changed, 337 insertions(+), 340 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 5df604846de6..4c8c03185ce7 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -11,6 +11,326 @@ #include "locking.h" #include "accessors.h" +static struct kmem_cache *btrfs_inode_defrag_cachep; + +/* + * When auto defrag is enabled we queue up these defrag structs to remember + * which inodes need defragging passes. + */ +struct inode_defrag { + struct rb_node rb_node; + /* Inode number */ + u64 ino; + /* + * Transid where the defrag was added, we search for extents newer than + * this. + */ + u64 transid; + + /* Root objectid */ + u64 root; + + /* + * The extent size threshold for autodefrag. 
+ *
+ * This value is different for compressed/non-compressed extents, thus
+ * needs to be passed from higher layer.
+ * (aka, inode_should_defrag())
+ */
+	u32 extent_thresh;
+};
+
+static int __compare_inode_defrag(struct inode_defrag *defrag1,
+				  struct inode_defrag *defrag2)
+{
+	if (defrag1->root > defrag2->root)
+		return 1;
+	else if (defrag1->root < defrag2->root)
+		return -1;
+	else if (defrag1->ino > defrag2->ino)
+		return 1;
+	else if (defrag1->ino < defrag2->ino)
+		return -1;
+	else
+		return 0;
+}
+
+/*
+ * Insert a record for an inode into the defrag tree. The lock must be held
+ * already.
+ *
+ * If you're inserting a record for an older transid than an existing record,
+ * the transid already in the tree is lowered.
+ *
+ * If an existing record is found, the defrag item you pass in is freed.
+ */
+static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
+				    struct inode_defrag *defrag)
+{
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct inode_defrag *entry;
+	struct rb_node **p;
+	struct rb_node *parent = NULL;
+	int ret;
+
+	p = &fs_info->defrag_inodes.rb_node;
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct inode_defrag, rb_node);
+
+		ret = __compare_inode_defrag(defrag, entry);
+		if (ret < 0)
+			p = &parent->rb_left;
+		else if (ret > 0)
+			p = &parent->rb_right;
+		else {
+			/*
+			 * If we're reinserting an entry for an old defrag run,
+			 * make sure to lower the transid of our existing
+			 * record.
+			 */
+			if (defrag->transid < entry->transid)
+				entry->transid = defrag->transid;
+			entry->extent_thresh = min(defrag->extent_thresh,
+						   entry->extent_thresh);
+			return -EEXIST;
+		}
+	}
+	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
+	rb_link_node(&defrag->rb_node, parent, p);
+	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
+	return 0;
+}
+
+static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
+{
+	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
+		return 0;
+
+	if (btrfs_fs_closing(fs_info))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Insert a defrag record for this inode if auto defrag is enabled.
+ */
+int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
+			   struct btrfs_inode *inode, u32 extent_thresh)
+{
+	struct btrfs_root *root = inode->root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct inode_defrag *defrag;
+	u64 transid;
+	int ret;
+
+	if (!__need_auto_defrag(fs_info))
+		return 0;
+
+	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
+		return 0;
+
+	if (trans)
+		transid = trans->transid;
+	else
+		transid = inode->root->last_trans;
+
+	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
+	if (!defrag)
+		return -ENOMEM;
+
+	defrag->ino = btrfs_ino(inode);
+	defrag->transid = transid;
+	defrag->root = root->root_key.objectid;
+	defrag->extent_thresh = extent_thresh;
+
+	spin_lock(&fs_info->defrag_inodes_lock);
+	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
+		/*
+		 * If we set the IN_DEFRAG flag and evict the inode from
+		 * memory, and then re-read this inode, this new inode
+		 * doesn't have IN_DEFRAG set. In that case, we may find an
+		 * existing defrag record.
+		 */
+		ret = __btrfs_add_inode_defrag(inode, defrag);
+		if (ret)
+			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	} else {
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	}
+	spin_unlock(&fs_info->defrag_inodes_lock);
+	return 0;
+}
+
+/*
+ * Pick the defraggable inode that we want; if it doesn't exist, we will get
+ * the next one.
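+ *
+ * For illustration: records sort by (root, ino), so with records (5, 257)
+ * and (5, 512) in the tree, a lookup for root 5 inode 300 misses and falls
+ * through to the successor, returning (and erasing) record (5, 512).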
+ */ +static struct inode_defrag *btrfs_pick_defrag_inode( + struct btrfs_fs_info *fs_info, u64 root, u64 ino) +{ + struct inode_defrag *entry = NULL; + struct inode_defrag tmp; + struct rb_node *p; + struct rb_node *parent = NULL; + int ret; + + tmp.ino = ino; + tmp.root = root; + + spin_lock(&fs_info->defrag_inodes_lock); + p = fs_info->defrag_inodes.rb_node; + while (p) { + parent = p; + entry = rb_entry(parent, struct inode_defrag, rb_node); + + ret = __compare_inode_defrag(&tmp, entry); + if (ret < 0) + p = parent->rb_left; + else if (ret > 0) + p = parent->rb_right; + else + goto out; + } + + if (parent && __compare_inode_defrag(&tmp, entry) > 0) { + parent = rb_next(parent); + if (parent) + entry = rb_entry(parent, struct inode_defrag, rb_node); + else + entry = NULL; + } +out: + if (entry) + rb_erase(parent, &fs_info->defrag_inodes); + spin_unlock(&fs_info->defrag_inodes_lock); + return entry; +} + +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + struct rb_node *node; + + spin_lock(&fs_info->defrag_inodes_lock); + node = rb_first(&fs_info->defrag_inodes); + while (node) { + rb_erase(node, &fs_info->defrag_inodes); + defrag = rb_entry(node, struct inode_defrag, rb_node); + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + + cond_resched_lock(&fs_info->defrag_inodes_lock); + + node = rb_first(&fs_info->defrag_inodes); + } + spin_unlock(&fs_info->defrag_inodes_lock); +} + +#define BTRFS_DEFRAG_BATCH 1024 + +static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, + struct inode_defrag *defrag) +{ + struct btrfs_root *inode_root; + struct inode *inode; + struct btrfs_ioctl_defrag_range_args range; + int ret = 0; + u64 cur = 0; + +again: + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + goto cleanup; + if (!__need_auto_defrag(fs_info)) + goto cleanup; + + /* Get the inode */ + inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); + if (IS_ERR(inode_root)) { + ret = PTR_ERR(inode_root); + goto cleanup; + } + + inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root); + btrfs_put_root(inode_root); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto cleanup; + } + + if (cur >= i_size_read(inode)) { + iput(inode); + goto cleanup; + } + + /* Do a chunk of defrag */ + clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + memset(&range, 0, sizeof(range)); + range.len = (u64)-1; + range.start = cur; + range.extent_thresh = defrag->extent_thresh; + + sb_start_write(fs_info->sb); + ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, + BTRFS_DEFRAG_BATCH); + sb_end_write(fs_info->sb); + iput(inode); + + if (ret < 0) + goto cleanup; + + cur = max(cur + fs_info->sectorsize, range.start); + goto again; + +cleanup: + kmem_cache_free(btrfs_inode_defrag_cachep, defrag); + return ret; +} + +/* + * Run through the list of inodes in the FS that need defragging. + */ +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) +{ + struct inode_defrag *defrag; + u64 first_ino = 0; + u64 root_objectid = 0; + + atomic_inc(&fs_info->defrag_running); + while (1) { + /* Pause the auto defragger. 
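+		 * The (root_objectid, first_ino) cursor below resumes the
+		 * scan where the previous iteration stopped; when the tree is
+		 * exhausted it wraps back to (0, 0) and gives up only once a
+		 * full pass finds nothing.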
*/ + if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) + break; + + if (!__need_auto_defrag(fs_info)) + break; + + /* find an inode to defrag */ + defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, first_ino); + if (!defrag) { + if (root_objectid || first_ino) { + root_objectid = 0; + first_ino = 0; + continue; + } else { + break; + } + } + + first_ino = defrag->ino + 1; + root_objectid = defrag->root; + + __btrfs_run_defrag_inode(fs_info, defrag); + } + atomic_dec(&fs_info->defrag_running); + + /* + * During unmount, we use the transaction_wait queue to wait for the + * defragger to stop. + */ + wake_up(&fs_info->transaction_wait); + return 0; +} + /* * Defrag all the leaves in a given btree. * Read all the leaves and try to get key order to @@ -131,3 +451,20 @@ done: return ret; } + +void __cold btrfs_auto_defrag_exit(void) +{ + kmem_cache_destroy(btrfs_inode_defrag_cachep); +} + +int __init btrfs_auto_defrag_init(void) +{ + btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", + sizeof(struct inode_defrag), 0, + SLAB_MEM_SPREAD, + NULL); + if (!btrfs_inode_defrag_cachep) + return -ENOMEM; + + return 0; +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index bad676d2e23f..8500472aa6ef 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -34,329 +34,6 @@ #include "accessors.h" #include "extent-tree.h" -static struct kmem_cache *btrfs_inode_defrag_cachep; -/* - * when auto defrag is enabled we - * queue up these defrag structs to remember which - * inodes need defragging passes - */ -struct inode_defrag { - struct rb_node rb_node; - /* objectid */ - u64 ino; - /* - * transid where the defrag was added, we search for - * extents newer than this - */ - u64 transid; - - /* root objectid */ - u64 root; - - /* - * The extent size threshold for autodefrag. - * - * This value is different for compressed/non-compressed extents, - * thus needs to be passed from higher layer. - * (aka, inode_should_defrag()) - */ - u32 extent_thresh; -}; - -static int __compare_inode_defrag(struct inode_defrag *defrag1, - struct inode_defrag *defrag2) -{ - if (defrag1->root > defrag2->root) - return 1; - else if (defrag1->root < defrag2->root) - return -1; - else if (defrag1->ino > defrag2->ino) - return 1; - else if (defrag1->ino < defrag2->ino) - return -1; - else - return 0; -} - -/* pop a record for an inode into the defrag tree. 
The lock - * must be held already - * - * If you're inserting a record for an older transid than an - * existing record, the transid already in the tree is lowered - * - * If an existing record is found the defrag item you - * pass in is freed - */ -static int __btrfs_add_inode_defrag(struct btrfs_inode *inode, - struct inode_defrag *defrag) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct inode_defrag *entry; - struct rb_node **p; - struct rb_node *parent = NULL; - int ret; - - p = &fs_info->defrag_inodes.rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct inode_defrag, rb_node); - - ret = __compare_inode_defrag(defrag, entry); - if (ret < 0) - p = &parent->rb_left; - else if (ret > 0) - p = &parent->rb_right; - else { - /* if we're reinserting an entry for - * an old defrag run, make sure to - * lower the transid of our existing record - */ - if (defrag->transid < entry->transid) - entry->transid = defrag->transid; - entry->extent_thresh = min(defrag->extent_thresh, - entry->extent_thresh); - return -EEXIST; - } - } - set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); - rb_link_node(&defrag->rb_node, parent, p); - rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes); - return 0; -} - -static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info) -{ - if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) - return 0; - - if (btrfs_fs_closing(fs_info)) - return 0; - - return 1; -} - -/* - * insert a defrag record for this inode if auto defrag is - * enabled - */ -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh) -{ - struct btrfs_root *root = inode->root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct inode_defrag *defrag; - u64 transid; - int ret; - - if (!__need_auto_defrag(fs_info)) - return 0; - - if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) - return 0; - - if (trans) - transid = trans->transid; - else - transid = inode->root->last_trans; - - defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS); - if (!defrag) - return -ENOMEM; - - defrag->ino = btrfs_ino(inode); - defrag->transid = transid; - defrag->root = root->root_key.objectid; - defrag->extent_thresh = extent_thresh; - - spin_lock(&fs_info->defrag_inodes_lock); - if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) { - /* - * If we set IN_DEFRAG flag and evict the inode from memory, - * and then re-read this inode, this new inode doesn't have - * IN_DEFRAG flag. At the case, we may find the existed defrag. - */ - ret = __btrfs_add_inode_defrag(inode, defrag); - if (ret) - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - } else { - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - } - spin_unlock(&fs_info->defrag_inodes_lock); - return 0; -} - -/* - * pick the defragable inode that we want, if it doesn't exist, we will get - * the next one. 
- */ -static struct inode_defrag * -btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino) -{ - struct inode_defrag *entry = NULL; - struct inode_defrag tmp; - struct rb_node *p; - struct rb_node *parent = NULL; - int ret; - - tmp.ino = ino; - tmp.root = root; - - spin_lock(&fs_info->defrag_inodes_lock); - p = fs_info->defrag_inodes.rb_node; - while (p) { - parent = p; - entry = rb_entry(parent, struct inode_defrag, rb_node); - - ret = __compare_inode_defrag(&tmp, entry); - if (ret < 0) - p = parent->rb_left; - else if (ret > 0) - p = parent->rb_right; - else - goto out; - } - - if (parent && __compare_inode_defrag(&tmp, entry) > 0) { - parent = rb_next(parent); - if (parent) - entry = rb_entry(parent, struct inode_defrag, rb_node); - else - entry = NULL; - } -out: - if (entry) - rb_erase(parent, &fs_info->defrag_inodes); - spin_unlock(&fs_info->defrag_inodes_lock); - return entry; -} - -void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info) -{ - struct inode_defrag *defrag; - struct rb_node *node; - - spin_lock(&fs_info->defrag_inodes_lock); - node = rb_first(&fs_info->defrag_inodes); - while (node) { - rb_erase(node, &fs_info->defrag_inodes); - defrag = rb_entry(node, struct inode_defrag, rb_node); - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - - cond_resched_lock(&fs_info->defrag_inodes_lock); - - node = rb_first(&fs_info->defrag_inodes); - } - spin_unlock(&fs_info->defrag_inodes_lock); -} - -#define BTRFS_DEFRAG_BATCH 1024 - -static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, - struct inode_defrag *defrag) -{ - struct btrfs_root *inode_root; - struct inode *inode; - struct btrfs_ioctl_defrag_range_args range; - int ret = 0; - u64 cur = 0; - -again: - if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state)) - goto cleanup; - if (!__need_auto_defrag(fs_info)) - goto cleanup; - - /* get the inode */ - inode_root = btrfs_get_fs_root(fs_info, defrag->root, true); - if (IS_ERR(inode_root)) { - ret = PTR_ERR(inode_root); - goto cleanup; - } - - inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root); - btrfs_put_root(inode_root); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - goto cleanup; - } - - if (cur >= i_size_read(inode)) { - iput(inode); - goto cleanup; - } - - /* do a chunk of defrag */ - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); - memset(&range, 0, sizeof(range)); - range.len = (u64)-1; - range.start = cur; - range.extent_thresh = defrag->extent_thresh; - - sb_start_write(fs_info->sb); - ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid, - BTRFS_DEFRAG_BATCH); - sb_end_write(fs_info->sb); - iput(inode); - - if (ret < 0) - goto cleanup; - - cur = max(cur + fs_info->sectorsize, range.start); - goto again; - -cleanup: - kmem_cache_free(btrfs_inode_defrag_cachep, defrag); - return ret; -} - -/* - * run through the list of inodes in the FS that need - * defragging - */ -int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) -{ - struct inode_defrag *defrag; - u64 first_ino = 0; - u64 root_objectid = 0; - - atomic_inc(&fs_info->defrag_running); - while (1) { - /* Pause the auto defragger. 
*/ - if (test_bit(BTRFS_FS_STATE_REMOUNTING, - &fs_info->fs_state)) - break; - - if (!__need_auto_defrag(fs_info)) - break; - - /* find an inode to defrag */ - defrag = btrfs_pick_defrag_inode(fs_info, root_objectid, - first_ino); - if (!defrag) { - if (root_objectid || first_ino) { - root_objectid = 0; - first_ino = 0; - continue; - } else { - break; - } - } - - first_ino = defrag->ino + 1; - root_objectid = defrag->root; - - __btrfs_run_defrag_inode(fs_info, defrag); - } - atomic_dec(&fs_info->defrag_running); - - /* - * during unmount, we use the transaction_wait queue to - * wait for the defragger to stop - */ - wake_up(&fs_info->transaction_wait); - return 0; -} - /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. */ @@ -4153,23 +3830,6 @@ const struct file_operations btrfs_file_operations = { .remap_file_range = btrfs_remap_file_range, }; -void __cold btrfs_auto_defrag_exit(void) -{ - kmem_cache_destroy(btrfs_inode_defrag_cachep); -} - -int __init btrfs_auto_defrag_init(void) -{ - btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag", - sizeof(struct inode_defrag), 0, - SLAB_MEM_SPREAD, - NULL); - if (!btrfs_inode_defrag_cachep) - return -ENOMEM; - - return 0; -} - int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end) { int ret; From a6a01ca61f4949037fa7b94278ab260eab02a289 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:24 -0400 Subject: [PATCH 106/249] btrfs: move the file defrag code into defrag.c This is the other big portion of defrag code that has existed in ioctl.c. Move it to its new home in defrag.c. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/defrag.c | 903 ++++++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/ioctl.c | 902 --------------------------------------------- 2 files changed, 903 insertions(+), 902 deletions(-) diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 4c8c03185ce7..2c260eef40d4 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -10,6 +10,9 @@ #include "transaction.h" #include "locking.h" #include "accessors.h" +#include "messages.h" +#include "delalloc-space.h" +#include "subpage.h" static struct kmem_cache *btrfs_inode_defrag_cachep; @@ -452,6 +455,906 @@ done: return ret; } +/* + * Defrag specific helper to get an extent map. + * + * Differences between this and btrfs_get_extent() are: + * + * - No extent_map will be added to inode->extent_tree + * To reduce memory usage in the long run. + * + * - Extra optimization to skip file extents older than @newer_than + * By using btrfs_search_forward() we can skip entire file ranges that + * have extents created in past transactions, because btrfs_search_forward() + * will not visit leaves and nodes with a generation smaller than given + * minimal generation threshold (@newer_than). + * + * Return valid em if we find a file extent matching the requirement. + * Return NULL if we can not find a file extent matching the requirement. + * + * Return ERR_PTR() for error. 
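+ *
+ * A minimal usage sketch (the 1M offset and the last_transid cutoff are
+ * example values only):
+ *
+ *	em = defrag_get_extent(inode, SZ_1M, last_transid);
+ *	if (IS_ERR(em))
+ *		return PTR_ERR(em);
+ *	if (em) {
+ *		... inspect em->block_start and em->len ...
+ *		free_extent_map(em);
+ *	}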
+ */
+static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
+					    u64 start, u64 newer_than)
+{
+	struct btrfs_root *root = inode->root;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_path path = { 0 };
+	struct extent_map *em;
+	struct btrfs_key key;
+	u64 ino = btrfs_ino(inode);
+	int ret;
+
+	em = alloc_extent_map();
+	if (!em) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	key.objectid = ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = start;
+
+	if (newer_than) {
+		ret = btrfs_search_forward(root, &key, &path, newer_than);
+		if (ret < 0)
+			goto err;
+		/* Can't find anything newer */
+		if (ret > 0)
+			goto not_found;
+	} else {
+		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+		if (ret < 0)
+			goto err;
+	}
+	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+		/*
+		 * If btrfs_search_slot() makes the path point beyond nritems,
+		 * we should not have an empty leaf, as this inode must at
+		 * least have its INODE_ITEM.
+		 */
+		ASSERT(btrfs_header_nritems(path.nodes[0]));
+		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
+	}
+	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+	/* Perfect match, no need to go one slot back */
+	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
+	    key.offset == start)
+		goto iterate;
+
+	/* We didn't find a perfect match, we need to go one slot back */
+	if (path.slots[0] > 0) {
+		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+			path.slots[0]--;
+	}
+
+iterate:
+	/* Iterate through the path to find a file extent covering @start */
+	while (true) {
+		u64 extent_end;
+
+		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
+			goto next;
+
+		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+
+		/*
+		 * We may go one slot back to INODE_REF/XATTR item, then
+		 * need to go forward until we reach an EXTENT_DATA.
+		 * But we should still have the correct ino as key.objectid.
+		 */
+		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
+			goto next;
+
+		/* It's beyond our target range, definitely not the extent we want */
+		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
+			goto not_found;
+
+		/*
+		 *	|	|<- File extent ->|
+		 *	\- start
+		 *
+		 * This means there is a hole between start and key.offset.
+		 */
+		if (key.offset > start) {
+			em->start = start;
+			em->orig_start = start;
+			em->block_start = EXTENT_MAP_HOLE;
+			em->len = key.offset - start;
+			break;
+		}
+
+		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
+				    struct btrfs_file_extent_item);
+		extent_end = btrfs_file_extent_end(&path);
+
+		/*
+		 *	|<- file extent ->|	|
+		 *				\- start
+		 *
+		 * We haven't reached start, search next slot.
+		 */
+		if (extent_end <= start)
+			goto next;
+
+		/* Now this extent covers @start, convert it to em */
+		btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
+		break;
+next:
+		ret = btrfs_next_item(root, &path);
+		if (ret < 0)
+			goto err;
+		if (ret > 0)
+			goto not_found;
+	}
+	btrfs_release_path(&path);
+	return em;
+
+not_found:
+	btrfs_release_path(&path);
+	free_extent_map(em);
+	return NULL;
+
+err:
+	btrfs_release_path(&path);
+	free_extent_map(em);
+	return ERR_PTR(ret);
+}
+
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
+					       u64 newer_than, bool locked)
+{
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map *em;
+	const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
+
+	/*
+	 * Hopefully we have this extent in the tree already, try without the
+	 * full extent lock.
+	 */
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, start, sectorsize);
+	read_unlock(&em_tree->lock);
+
+	/*
+	 * We can get a merged extent, in that case, we need to re-search the
+	 * tree to get the original em for defrag.
+	 *
+	 * If @newer_than is 0 or em::generation < newer_than, we can trust
+	 * this em, as either we don't care about the generation, or the
+	 * merged extent map will be rejected anyway.
+	 */
+	if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+	    newer_than && em->generation >= newer_than) {
+		free_extent_map(em);
+		em = NULL;
+	}
+
+	if (!em) {
+		struct extent_state *cached = NULL;
+		u64 end = start + sectorsize - 1;
+
+		/* Get the big lock and read metadata off disk. */
+		if (!locked)
+			lock_extent(io_tree, start, end, &cached);
+		em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
+		if (!locked)
+			unlock_extent(io_tree, start, end, &cached);
+
+		if (IS_ERR(em))
+			return NULL;
+	}
+
+	return em;
+}
+
+static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
+				   const struct extent_map *em)
+{
+	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+		return BTRFS_MAX_COMPRESSED;
+	return fs_info->max_extent_size;
+}
+
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
+				     u32 extent_thresh, u64 newer_than, bool locked)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct extent_map *next;
+	bool ret = false;
+
+	/* This is the last extent */
+	if (em->start + em->len >= i_size_read(inode))
+		return false;
+
+	/*
+	 * Here we need to pass @newer_than when checking the next extent, or
+	 * we will hit a case where we mark the current extent for defrag but
+	 * the next one will not be a target.
+	 * This will just cause extra IO without really reducing the fragments.
+	 */
+	next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked);
+	/* No more em or hole */
+	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+		goto out;
+	if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+		goto out;
+	/*
+	 * If the next extent is at its max capacity, defragging the current
+	 * extent makes no sense, as the total number of extents won't change.
+	 */
+	if (next->len >= get_extent_max_capacity(fs_info, em))
+		goto out;
+	/* Skip older extent */
+	if (next->generation < newer_than)
+		goto out;
+	/* Also check extent size */
+	if (next->len >= extent_thresh)
+		goto out;
+
+	ret = true;
+out:
+	free_extent_map(next);
+	return ret;
+}
+
+/*
+ * Prepare one page to be defragged.
+ *
+ * This will ensure:
+ *
+ * - Returned page is locked and has been set up properly.
+ * - No ordered extent exists in the page. + * - The page is uptodate. + * + * NOTE: Caller should also wait for page writeback after the cluster is + * prepared, here we don't do writeback wait for each page. + */ +static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, pgoff_t index) +{ + struct address_space *mapping = inode->vfs_inode.i_mapping; + gfp_t mask = btrfs_alloc_write_mask(mapping); + u64 page_start = (u64)index << PAGE_SHIFT; + u64 page_end = page_start + PAGE_SIZE - 1; + struct extent_state *cached_state = NULL; + struct page *page; + int ret; + +again: + page = find_or_create_page(mapping, index, mask); + if (!page) + return ERR_PTR(-ENOMEM); + + /* + * Since we can defragment files opened read-only, we can encounter + * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We + * can't do I/O using huge pages yet, so return an error for now. + * Filesystem transparent huge pages are typically only used for + * executables that explicitly enable them, so this isn't very + * restrictive. + */ + if (PageCompound(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ETXTBSY); + } + + ret = set_page_extent_mapped(page); + if (ret < 0) { + unlock_page(page); + put_page(page); + return ERR_PTR(ret); + } + + /* Wait for any existing ordered extent in the range */ + while (1) { + struct btrfs_ordered_extent *ordered; + + lock_extent(&inode->io_tree, page_start, page_end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); + unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); + if (!ordered) + break; + + unlock_page(page); + btrfs_start_ordered_extent(ordered, 1); + btrfs_put_ordered_extent(ordered); + lock_page(page); + /* + * We unlocked the page above, so we need check if it was + * released or not. + */ + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + } + + /* + * Now the page range has no ordered extent any more. Read the page to + * make it uptodate. + */ + if (!PageUptodate(page)) { + btrfs_read_folio(NULL, page_folio(page)); + lock_page(page); + if (page->mapping != mapping || !PagePrivate(page)) { + unlock_page(page); + put_page(page); + goto again; + } + if (!PageUptodate(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-EIO); + } + } + return page; +} + +struct defrag_target_range { + struct list_head list; + u64 start; + u64 len; +}; + +/* + * Collect all valid target extents. + * + * @start: file offset to lookup + * @len: length to lookup + * @extent_thresh: file extent size threshold, any extent size >= this value + * will be ignored + * @newer_than: only defrag extents newer than this value + * @do_compress: whether the defrag is doing compression + * if true, @extent_thresh will be ignored and all regular + * file extents meeting @newer_than will be targets. 
+ * @locked: if the range has already held extent lock + * @target_list: list of targets file extents + */ +static int defrag_collect_targets(struct btrfs_inode *inode, + u64 start, u64 len, u32 extent_thresh, + u64 newer_than, bool do_compress, + bool locked, struct list_head *target_list, + u64 *last_scanned_ret) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + bool last_is_target = false; + u64 cur = start; + int ret = 0; + + while (cur < start + len) { + struct extent_map *em; + struct defrag_target_range *new; + bool next_mergeable = true; + u64 range_len; + + last_is_target = false; + em = defrag_lookup_extent(&inode->vfs_inode, cur, newer_than, locked); + if (!em) + break; + + /* + * If the file extent is an inlined one, we may still want to + * defrag it (fallthrough) if it will cause a regular extent. + * This is for users who want to convert inline extents to + * regular ones through max_inline= mount option. + */ + if (em->block_start == EXTENT_MAP_INLINE && + em->len <= inode->root->fs_info->max_inline) + goto next; + + /* Skip hole/delalloc/preallocated extents */ + if (em->block_start == EXTENT_MAP_HOLE || + em->block_start == EXTENT_MAP_DELALLOC || + test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + goto next; + + /* Skip older extent */ + if (em->generation < newer_than) + goto next; + + /* This em is under writeback, no need to defrag */ + if (em->generation == (u64)-1) + goto next; + + /* + * Our start offset might be in the middle of an existing extent + * map, so take that into account. + */ + range_len = em->len - (cur - em->start); + /* + * If this range of the extent map is already flagged for delalloc, + * skip it, because: + * + * 1) We could deadlock later, when trying to reserve space for + * delalloc, because in case we can't immediately reserve space + * the flusher can start delalloc and wait for the respective + * ordered extents to complete. The deadlock would happen + * because we do the space reservation while holding the range + * locked, and starting writeback, or finishing an ordered + * extent, requires locking the range; + * + * 2) If there's delalloc there, it means there's dirty pages for + * which writeback has not started yet (we clean the delalloc + * flag when starting writeback and after creating an ordered + * extent). If we mark pages in an adjacent range for defrag, + * then we will have a larger contiguous range for delalloc, + * very likely resulting in a larger extent after writeback is + * triggered (except in a case of free space fragmentation). + */ + if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC, 0, NULL)) + goto next; + + /* + * For do_compress case, we want to compress all valid file + * extents, thus no @extent_thresh or mergeable check. + */ + if (do_compress) + goto add; + + /* Skip too large extent */ + if (range_len >= extent_thresh) + goto next; + + /* + * Skip extents already at its max capacity, this is mostly for + * compressed extents, which max cap is only 128K. + */ + if (em->len >= get_extent_max_capacity(fs_info, em)) + goto next; + + /* + * Normally there are no more extents after an inline one, thus + * @next_mergeable will normally be false and not defragged. + * So if an inline extent passed all above checks, just add it + * for defrag, and be converted to regular extents. 
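+		 * For example, with the default max_inline=2048, a 1K inline
+		 * extent that is new enough is picked up here and gets
+		 * rewritten as a regular (or compressed) extent by the
+		 * following writeback.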
+ */ + if (em->block_start == EXTENT_MAP_INLINE) + goto add; + + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, + extent_thresh, newer_than, locked); + if (!next_mergeable) { + struct defrag_target_range *last; + + /* Empty target list, no way to merge with last entry */ + if (list_empty(target_list)) + goto next; + last = list_entry(target_list->prev, + struct defrag_target_range, list); + /* Not mergeable with last entry */ + if (last->start + last->len != cur) + goto next; + + /* Mergeable, fall through to add it to @target_list. */ + } + +add: + last_is_target = true; + range_len = min(extent_map_end(em), start + len) - cur; + /* + * This one is a good target, check if it can be merged into + * last range of the target list. + */ + if (!list_empty(target_list)) { + struct defrag_target_range *last; + + last = list_entry(target_list->prev, + struct defrag_target_range, list); + ASSERT(last->start + last->len <= cur); + if (last->start + last->len == cur) { + /* Mergeable, enlarge the last entry */ + last->len += range_len; + goto next; + } + /* Fall through to allocate a new entry */ + } + + /* Allocate new defrag_target_range */ + new = kmalloc(sizeof(*new), GFP_NOFS); + if (!new) { + free_extent_map(em); + ret = -ENOMEM; + break; + } + new->start = cur; + new->len = range_len; + list_add_tail(&new->list, target_list); + +next: + cur = extent_map_end(em); + free_extent_map(em); + } + if (ret < 0) { + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + + list_for_each_entry_safe(entry, tmp, target_list, list) { + list_del_init(&entry->list); + kfree(entry); + } + } + if (!ret && last_scanned_ret) { + /* + * If the last extent is not a target, the caller can skip to + * the end of that extent. + * Otherwise, we can only go the end of the specified range. + */ + if (!last_is_target) + *last_scanned_ret = max(cur, *last_scanned_ret); + else + *last_scanned_ret = max(start + len, *last_scanned_ret); + } + return ret; +} + +#define CLUSTER_SIZE (SZ_256K) +static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); + +/* + * Defrag one contiguous target range. + * + * @inode: target inode + * @target: target range to defrag + * @pages: locked pages covering the defrag range + * @nr_pages: number of locked pages + * + * Caller should ensure: + * + * - Pages are prepared + * Pages should be locked, no ordered extent in the pages range, + * no writeback. 
+ * + * - Extent bits are locked + */ +static int defrag_one_locked_target(struct btrfs_inode *inode, + struct defrag_target_range *target, + struct page **pages, int nr_pages, + struct extent_state **cached_state) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_changeset *data_reserved = NULL; + const u64 start = target->start; + const u64 len = target->len; + unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; + unsigned long start_index = start >> PAGE_SHIFT; + unsigned long first_index = page_index(pages[0]); + int ret = 0; + int i; + + ASSERT(last_index - first_index + 1 <= nr_pages); + + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); + if (ret < 0) + return ret; + clear_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, cached_state); + set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); + + /* Update the page status */ + for (i = start_index - first_index; i <= last_index - first_index; i++) { + ClearPageChecked(pages[i]); + btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); + } + btrfs_delalloc_release_extents(inode, len); + extent_changeset_free(data_reserved); + + return ret; +} + +static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, + u32 extent_thresh, u64 newer_than, bool do_compress, + u64 *last_scanned_ret) +{ + struct extent_state *cached_state = NULL; + struct defrag_target_range *entry; + struct defrag_target_range *tmp; + LIST_HEAD(target_list); + struct page **pages; + const u32 sectorsize = inode->root->fs_info->sectorsize; + u64 last_index = (start + len - 1) >> PAGE_SHIFT; + u64 start_index = start >> PAGE_SHIFT; + unsigned int nr_pages = last_index - start_index + 1; + int ret = 0; + int i; + + ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); + ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); + + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) + return -ENOMEM; + + /* Prepare all pages */ + for (i = 0; i < nr_pages; i++) { + pages[i] = defrag_prepare_one_page(inode, start_index + i); + if (IS_ERR(pages[i])) { + ret = PTR_ERR(pages[i]); + pages[i] = NULL; + goto free_pages; + } + } + for (i = 0; i < nr_pages; i++) + wait_on_page_writeback(pages[i]); + + /* Lock the pages range */ + lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, + (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, + &cached_state); + /* + * Now we have a consistent view about the extent map, re-check + * which range really needs to be defragged. + * + * And this time we have extent locked already, pass @locked = true + * so that we won't relock the extent range and cause deadlock. 
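+	 *
+	 * The first, unlocked pass ran in defrag_one_cluster(); between that
+	 * pass and this point another task may have punched a hole in the
+	 * range or started writeback on it, so the target list is rebuilt
+	 * here under the lock.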
+	 */
+	ret = defrag_collect_targets(inode, start, len, extent_thresh,
+				     newer_than, do_compress, true,
+				     &target_list, last_scanned_ret);
+	if (ret < 0)
+		goto unlock_extent;
+
+	list_for_each_entry(entry, &target_list, list) {
+		ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
+					       &cached_state);
+		if (ret < 0)
+			break;
+	}
+
+	list_for_each_entry_safe(entry, tmp, &target_list, list) {
+		list_del_init(&entry->list);
+		kfree(entry);
+	}
+unlock_extent:
+	unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
+		      (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
+		      &cached_state);
+free_pages:
+	for (i = 0; i < nr_pages; i++) {
+		if (pages[i]) {
+			unlock_page(pages[i]);
+			put_page(pages[i]);
+		}
+	}
+	kfree(pages);
+	return ret;
+}
+
+static int defrag_one_cluster(struct btrfs_inode *inode,
+			      struct file_ra_state *ra,
+			      u64 start, u32 len, u32 extent_thresh,
+			      u64 newer_than, bool do_compress,
+			      unsigned long *sectors_defragged,
+			      unsigned long max_sectors,
+			      u64 *last_scanned_ret)
+{
+	const u32 sectorsize = inode->root->fs_info->sectorsize;
+	struct defrag_target_range *entry;
+	struct defrag_target_range *tmp;
+	LIST_HEAD(target_list);
+	int ret;
+
+	ret = defrag_collect_targets(inode, start, len, extent_thresh,
+				     newer_than, do_compress, false,
+				     &target_list, NULL);
+	if (ret < 0)
+		goto out;
+
+	list_for_each_entry(entry, &target_list, list) {
+		u32 range_len = entry->len;
+
+		/* Reached or beyond the limit */
+		if (max_sectors && *sectors_defragged >= max_sectors) {
+			ret = 1;
+			break;
+		}
+
+		if (max_sectors)
+			range_len = min_t(u32, range_len,
+					  (max_sectors - *sectors_defragged) * sectorsize);
+
+		/*
+		 * If defrag_one_range() has updated last_scanned_ret, our
+		 * range may already be invalid (e.g. hole punched). Skip if
+		 * our range is before last_scanned_ret, as there is no need
+		 * to defrag the range anymore.
+		 */
+		if (entry->start + range_len <= *last_scanned_ret)
+			continue;
+
+		if (ra)
+			page_cache_sync_readahead(inode->vfs_inode.i_mapping,
+				ra, NULL, entry->start >> PAGE_SHIFT,
+				((entry->start + range_len - 1) >> PAGE_SHIFT) -
+				(entry->start >> PAGE_SHIFT) + 1);
+		/*
+		 * Here we may not defrag any range if holes are punched
+		 * before we locked the pages. But that's fine, it only
+		 * affects the @sectors_defragged accounting.
+		 */
+		ret = defrag_one_range(inode, entry->start, range_len,
+				       extent_thresh, newer_than, do_compress,
+				       last_scanned_ret);
+		if (ret < 0)
+			break;
+		*sectors_defragged += range_len >>
+				      inode->root->fs_info->sectorsize_bits;
+	}
+out:
+	list_for_each_entry_safe(entry, tmp, &target_list, list) {
+		list_del_init(&entry->list);
+		kfree(entry);
+	}
+	if (ret >= 0)
+		*last_scanned_ret = max(*last_scanned_ret, start + len);
+	return ret;
+}
+
+/*
+ * Entry point to file defragmentation.
+ *
+ * @inode:	   inode to be defragged
+ * @ra:		   readahead state (can be NULL)
+ * @range:	   defrag options including range and flags
+ * @newer_than:	   minimum transid to defrag
+ * @max_to_defrag: max number of sectors to be defragged; if 0, the whole
+ *		   inode will be defragged.
+ *
+ * Return <0 for error.
+ * Return >=0 for the number of sectors defragged, and range->start will be
+ * updated to indicate the file offset where the next defrag should start.
+ * (Mostly for autodefrag, which sets @max_to_defrag, thus we may exit early
+ * without defragging the whole range.)
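+ *
+ * A sketch of a manual whole-file defrag; the 256K threshold is only an
+ * example (it is also the default applied when extent_thresh is 0), the
+ * rest mirrors how the autodefrag worker drives this function:
+ *
+ *	struct btrfs_ioctl_defrag_range_args range = { 0 };
+ *
+ *	range.len = (u64)-1;
+ *	range.extent_thresh = SZ_256K;
+ *	ret = btrfs_defrag_file(inode, NULL, &range, 0, 0);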
+ */ +int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_to_defrag) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + unsigned long sectors_defragged = 0; + u64 isize = i_size_read(inode); + u64 cur; + u64 last_byte; + bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); + bool ra_allocated = false; + int compress_type = BTRFS_COMPRESS_ZLIB; + int ret = 0; + u32 extent_thresh = range->extent_thresh; + pgoff_t start_index; + + if (isize == 0) + return 0; + + if (range->start >= isize) + return -EINVAL; + + if (do_compress) { + if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } + + if (extent_thresh == 0) + extent_thresh = SZ_256K; + + if (range->start + range->len > range->start) { + /* Got a specific range */ + last_byte = min(isize, range->start + range->len); + } else { + /* Defrag until file end */ + last_byte = isize; + } + + /* Align the range */ + cur = round_down(range->start, fs_info->sectorsize); + last_byte = round_up(last_byte, fs_info->sectorsize) - 1; + + /* + * If we were not given a ra, allocate a readahead context. As + * readahead is just an optimization, defrag will work without it so + * we don't error out. + */ + if (!ra) { + ra_allocated = true; + ra = kzalloc(sizeof(*ra), GFP_KERNEL); + if (ra) + file_ra_state_init(ra, inode->i_mapping); + } + + /* + * Make writeback start from the beginning of the range, so that the + * defrag range can be written sequentially. + */ + start_index = cur >> PAGE_SHIFT; + if (start_index < inode->i_mapping->writeback_index) + inode->i_mapping->writeback_index = start_index; + + while (cur < last_byte) { + const unsigned long prev_sectors_defragged = sectors_defragged; + u64 last_scanned = cur; + u64 cluster_end; + + if (btrfs_defrag_cancelled(fs_info)) { + ret = -EAGAIN; + break; + } + + /* We want the cluster end at page boundary when possible */ + cluster_end = (((cur >> PAGE_SHIFT) + + (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; + cluster_end = min(cluster_end, last_byte); + + btrfs_inode_lock(inode, 0); + if (IS_SWAPFILE(inode)) { + ret = -ETXTBSY; + btrfs_inode_unlock(inode, 0); + break; + } + if (!(inode->i_sb->s_flags & SB_ACTIVE)) { + btrfs_inode_unlock(inode, 0); + break; + } + if (do_compress) + BTRFS_I(inode)->defrag_compress = compress_type; + ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, + cluster_end + 1 - cur, extent_thresh, + newer_than, do_compress, §ors_defragged, + max_to_defrag, &last_scanned); + + if (sectors_defragged > prev_sectors_defragged) + balance_dirty_pages_ratelimited(inode->i_mapping); + + btrfs_inode_unlock(inode, 0); + if (ret < 0) + break; + cur = max(cluster_end + 1, last_scanned); + if (ret > 0) { + ret = 0; + break; + } + cond_resched(); + } + + if (ra_allocated) + kfree(ra); + /* + * Update range.start for autodefrag, this will indicate where to start + * in next run. + */ + range->start = cur; + if (sectors_defragged) { + /* + * We have defragged some sectors, for compression case they + * need to be written back immediately. 
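+		 * Note the possible second flush: compressed writeback is
+		 * started asynchronously, so when async extents were queued
+		 * one more filemap_flush() is issued to make sure they are
+		 * actually submitted.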
+ */ + if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + filemap_flush(inode->i_mapping); + } + if (range->compress_type == BTRFS_COMPRESS_LZO) + btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); + else if (range->compress_type == BTRFS_COMPRESS_ZSTD) + btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); + ret = sectors_defragged; + } + if (do_compress) { + btrfs_inode_lock(inode, 0); + BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; + btrfs_inode_unlock(inode, 0); + } + return ret; +} + void __cold btrfs_auto_defrag_exit(void) { kmem_cache_destroy(btrfs_inode_defrag_cachep); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 96858240588d..42453dd2ed31 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1039,908 +1039,6 @@ out: return ret; } -/* - * Defrag specific helper to get an extent map. - * - * Differences between this and btrfs_get_extent() are: - * - * - No extent_map will be added to inode->extent_tree - * To reduce memory usage in the long run. - * - * - Extra optimization to skip file extents older than @newer_than - * By using btrfs_search_forward() we can skip entire file ranges that - * have extents created in past transactions, because btrfs_search_forward() - * will not visit leaves and nodes with a generation smaller than given - * minimal generation threshold (@newer_than). - * - * Return valid em if we find a file extent matching the requirement. - * Return NULL if we can not find a file extent matching the requirement. - * - * Return ERR_PTR() for error. - */ -static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, - u64 start, u64 newer_than) -{ - struct btrfs_root *root = inode->root; - struct btrfs_file_extent_item *fi; - struct btrfs_path path = { 0 }; - struct extent_map *em; - struct btrfs_key key; - u64 ino = btrfs_ino(inode); - int ret; - - em = alloc_extent_map(); - if (!em) { - ret = -ENOMEM; - goto err; - } - - key.objectid = ino; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = start; - - if (newer_than) { - ret = btrfs_search_forward(root, &key, &path, newer_than); - if (ret < 0) - goto err; - /* Can't find anything newer */ - if (ret > 0) - goto not_found; - } else { - ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0); - if (ret < 0) - goto err; - } - if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) { - /* - * If btrfs_search_slot() makes path to point beyond nritems, - * we should not have an empty leaf, as this inode must at - * least have its INODE_ITEM. - */ - ASSERT(btrfs_header_nritems(path.nodes[0])); - path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1; - } - btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); - /* Perfect match, no need to go one slot back */ - if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY && - key.offset == start) - goto iterate; - - /* We didn't find a perfect match, needs to go one slot back */ - if (path.slots[0] > 0) { - btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); - if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY) - path.slots[0]--; - } - -iterate: - /* Iterate through the path to find a file extent covering @start */ - while (true) { - u64 extent_end; - - if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) - goto next; - - btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]); - - /* - * We may go one slot back to INODE_REF/XATTR item, then - * need to go forward until we reach an EXTENT_DATA. 
- * But we should still has the correct ino as key.objectid. - */ - if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY) - goto next; - - /* It's beyond our target range, definitely not extent found */ - if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY) - goto not_found; - - /* - * | |<- File extent ->| - * \- start - * - * This means there is a hole between start and key.offset. - */ - if (key.offset > start) { - em->start = start; - em->orig_start = start; - em->block_start = EXTENT_MAP_HOLE; - em->len = key.offset - start; - break; - } - - fi = btrfs_item_ptr(path.nodes[0], path.slots[0], - struct btrfs_file_extent_item); - extent_end = btrfs_file_extent_end(&path); - - /* - * |<- file extent ->| | - * \- start - * - * We haven't reached start, search next slot. - */ - if (extent_end <= start) - goto next; - - /* Now this extent covers @start, convert it to em */ - btrfs_extent_item_to_extent_map(inode, &path, fi, false, em); - break; -next: - ret = btrfs_next_item(root, &path); - if (ret < 0) - goto err; - if (ret > 0) - goto not_found; - } - btrfs_release_path(&path); - return em; - -not_found: - btrfs_release_path(&path); - free_extent_map(em); - return NULL; - -err: - btrfs_release_path(&path); - free_extent_map(em); - return ERR_PTR(ret); -} - -static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, - u64 newer_than, bool locked) -{ - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct extent_map *em; - const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize; - - /* - * hopefully we have this extent in the tree already, try without - * the full extent lock - */ - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, sectorsize); - read_unlock(&em_tree->lock); - - /* - * We can get a merged extent, in that case, we need to re-search - * tree to get the original em for defrag. - * - * If @newer_than is 0 or em::generation < newer_than, we can trust - * this em, as either we don't care about the generation, or the - * merged extent map will be rejected anyway. - */ - if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) && - newer_than && em->generation >= newer_than) { - free_extent_map(em); - em = NULL; - } - - if (!em) { - struct extent_state *cached = NULL; - u64 end = start + sectorsize - 1; - - /* get the big lock and read metadata off disk */ - if (!locked) - lock_extent(io_tree, start, end, &cached); - em = defrag_get_extent(BTRFS_I(inode), start, newer_than); - if (!locked) - unlock_extent(io_tree, start, end, &cached); - - if (IS_ERR(em)) - return NULL; - } - - return em; -} - -static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, - const struct extent_map *em) -{ - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) - return BTRFS_MAX_COMPRESSED; - return fs_info->max_extent_size; -} - -static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, - u32 extent_thresh, u64 newer_than, bool locked) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_map *next; - bool ret = false; - - /* this is the last extent */ - if (em->start + em->len >= i_size_read(inode)) - return false; - - /* - * Here we need to pass @newer_then when checking the next extent, or - * we will hit a case we mark current extent for defrag, but the next - * one will not be a target. - * This will just cause extra IO without really reducing the fragments. 
- */ - next = defrag_lookup_extent(inode, em->start + em->len, newer_than, locked); - /* No more em or hole */ - if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) - goto out; - if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags)) - goto out; - /* - * If the next extent is at its max capacity, defragging current extent - * makes no sense, as the total number of extents won't change. - */ - if (next->len >= get_extent_max_capacity(fs_info, em)) - goto out; - /* Skip older extent */ - if (next->generation < newer_than) - goto out; - /* Also check extent size */ - if (next->len >= extent_thresh) - goto out; - - ret = true; -out: - free_extent_map(next); - return ret; -} - -/* - * Prepare one page to be defragged. - * - * This will ensure: - * - * - Returned page is locked and has been set up properly. - * - No ordered extent exists in the page. - * - The page is uptodate. - * - * NOTE: Caller should also wait for page writeback after the cluster is - * prepared, here we don't do writeback wait for each page. - */ -static struct page *defrag_prepare_one_page(struct btrfs_inode *inode, - pgoff_t index) -{ - struct address_space *mapping = inode->vfs_inode.i_mapping; - gfp_t mask = btrfs_alloc_write_mask(mapping); - u64 page_start = (u64)index << PAGE_SHIFT; - u64 page_end = page_start + PAGE_SIZE - 1; - struct extent_state *cached_state = NULL; - struct page *page; - int ret; - -again: - page = find_or_create_page(mapping, index, mask); - if (!page) - return ERR_PTR(-ENOMEM); - - /* - * Since we can defragment files opened read-only, we can encounter - * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We - * can't do I/O using huge pages yet, so return an error for now. - * Filesystem transparent huge pages are typically only used for - * executables that explicitly enable them, so this isn't very - * restrictive. - */ - if (PageCompound(page)) { - unlock_page(page); - put_page(page); - return ERR_PTR(-ETXTBSY); - } - - ret = set_page_extent_mapped(page); - if (ret < 0) { - unlock_page(page); - put_page(page); - return ERR_PTR(ret); - } - - /* Wait for any existing ordered extent in the range */ - while (1) { - struct btrfs_ordered_extent *ordered; - - lock_extent(&inode->io_tree, page_start, page_end, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); - unlock_extent(&inode->io_tree, page_start, page_end, - &cached_state); - if (!ordered) - break; - - unlock_page(page); - btrfs_start_ordered_extent(ordered, 1); - btrfs_put_ordered_extent(ordered); - lock_page(page); - /* - * We unlocked the page above, so we need check if it was - * released or not. - */ - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); - goto again; - } - } - - /* - * Now the page range has no ordered extent any more. Read the page to - * make it uptodate. - */ - if (!PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != mapping || !PagePrivate(page)) { - unlock_page(page); - put_page(page); - goto again; - } - if (!PageUptodate(page)) { - unlock_page(page); - put_page(page); - return ERR_PTR(-EIO); - } - } - return page; -} - -struct defrag_target_range { - struct list_head list; - u64 start; - u64 len; -}; - -/* - * Collect all valid target extents. 
- * - * @start: file offset to lookup - * @len: length to lookup - * @extent_thresh: file extent size threshold, any extent size >= this value - * will be ignored - * @newer_than: only defrag extents newer than this value - * @do_compress: whether the defrag is doing compression - * if true, @extent_thresh will be ignored and all regular - * file extents meeting @newer_than will be targets. - * @locked: if the range has already held extent lock - * @target_list: list of targets file extents - */ -static int defrag_collect_targets(struct btrfs_inode *inode, - u64 start, u64 len, u32 extent_thresh, - u64 newer_than, bool do_compress, - bool locked, struct list_head *target_list, - u64 *last_scanned_ret) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - bool last_is_target = false; - u64 cur = start; - int ret = 0; - - while (cur < start + len) { - struct extent_map *em; - struct defrag_target_range *new; - bool next_mergeable = true; - u64 range_len; - - last_is_target = false; - em = defrag_lookup_extent(&inode->vfs_inode, cur, - newer_than, locked); - if (!em) - break; - - /* - * If the file extent is an inlined one, we may still want to - * defrag it (fallthrough) if it will cause a regular extent. - * This is for users who want to convert inline extents to - * regular ones through max_inline= mount option. - */ - if (em->block_start == EXTENT_MAP_INLINE && - em->len <= inode->root->fs_info->max_inline) - goto next; - - /* Skip hole/delalloc/preallocated extents */ - if (em->block_start == EXTENT_MAP_HOLE || - em->block_start == EXTENT_MAP_DELALLOC || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - goto next; - - /* Skip older extent */ - if (em->generation < newer_than) - goto next; - - /* This em is under writeback, no need to defrag */ - if (em->generation == (u64)-1) - goto next; - - /* - * Our start offset might be in the middle of an existing extent - * map, so take that into account. - */ - range_len = em->len - (cur - em->start); - /* - * If this range of the extent map is already flagged for delalloc, - * skip it, because: - * - * 1) We could deadlock later, when trying to reserve space for - * delalloc, because in case we can't immediately reserve space - * the flusher can start delalloc and wait for the respective - * ordered extents to complete. The deadlock would happen - * because we do the space reservation while holding the range - * locked, and starting writeback, or finishing an ordered - * extent, requires locking the range; - * - * 2) If there's delalloc there, it means there's dirty pages for - * which writeback has not started yet (we clean the delalloc - * flag when starting writeback and after creating an ordered - * extent). If we mark pages in an adjacent range for defrag, - * then we will have a larger contiguous range for delalloc, - * very likely resulting in a larger extent after writeback is - * triggered (except in a case of free space fragmentation). - */ - if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1, - EXTENT_DELALLOC, 0, NULL)) - goto next; - - /* - * For do_compress case, we want to compress all valid file - * extents, thus no @extent_thresh or mergeable check. - */ - if (do_compress) - goto add; - - /* Skip too large extent */ - if (range_len >= extent_thresh) - goto next; - - /* - * Skip extents already at its max capacity, this is mostly for - * compressed extents, which max cap is only 128K. 
- */ - if (em->len >= get_extent_max_capacity(fs_info, em)) - goto next; - - /* - * Normally there are no more extents after an inline one, thus - * @next_mergeable will normally be false and not defragged. - * So if an inline extent passed all above checks, just add it - * for defrag, and be converted to regular extents. - */ - if (em->block_start == EXTENT_MAP_INLINE) - goto add; - - next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, - extent_thresh, newer_than, locked); - if (!next_mergeable) { - struct defrag_target_range *last; - - /* Empty target list, no way to merge with last entry */ - if (list_empty(target_list)) - goto next; - last = list_entry(target_list->prev, - struct defrag_target_range, list); - /* Not mergeable with last entry */ - if (last->start + last->len != cur) - goto next; - - /* Mergeable, fall through to add it to @target_list. */ - } - -add: - last_is_target = true; - range_len = min(extent_map_end(em), start + len) - cur; - /* - * This one is a good target, check if it can be merged into - * last range of the target list. - */ - if (!list_empty(target_list)) { - struct defrag_target_range *last; - - last = list_entry(target_list->prev, - struct defrag_target_range, list); - ASSERT(last->start + last->len <= cur); - if (last->start + last->len == cur) { - /* Mergeable, enlarge the last entry */ - last->len += range_len; - goto next; - } - /* Fall through to allocate a new entry */ - } - - /* Allocate new defrag_target_range */ - new = kmalloc(sizeof(*new), GFP_NOFS); - if (!new) { - free_extent_map(em); - ret = -ENOMEM; - break; - } - new->start = cur; - new->len = range_len; - list_add_tail(&new->list, target_list); - -next: - cur = extent_map_end(em); - free_extent_map(em); - } - if (ret < 0) { - struct defrag_target_range *entry; - struct defrag_target_range *tmp; - - list_for_each_entry_safe(entry, tmp, target_list, list) { - list_del_init(&entry->list); - kfree(entry); - } - } - if (!ret && last_scanned_ret) { - /* - * If the last extent is not a target, the caller can skip to - * the end of that extent. - * Otherwise, we can only go the end of the specified range. - */ - if (!last_is_target) - *last_scanned_ret = max(cur, *last_scanned_ret); - else - *last_scanned_ret = max(start + len, *last_scanned_ret); - } - return ret; -} - -#define CLUSTER_SIZE (SZ_256K) -static_assert(IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE)); - -/* - * Defrag one contiguous target range. - * - * @inode: target inode - * @target: target range to defrag - * @pages: locked pages covering the defrag range - * @nr_pages: number of locked pages - * - * Caller should ensure: - * - * - Pages are prepared - * Pages should be locked, no ordered extent in the pages range, - * no writeback. 
- * - * - Extent bits are locked - */ -static int defrag_one_locked_target(struct btrfs_inode *inode, - struct defrag_target_range *target, - struct page **pages, int nr_pages, - struct extent_state **cached_state) -{ - struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct extent_changeset *data_reserved = NULL; - const u64 start = target->start; - const u64 len = target->len; - unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; - unsigned long start_index = start >> PAGE_SHIFT; - unsigned long first_index = page_index(pages[0]); - int ret = 0; - int i; - - ASSERT(last_index - first_index + 1 <= nr_pages); - - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); - if (ret < 0) - return ret; - clear_extent_bit(&inode->io_tree, start, start + len - 1, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, cached_state); - set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state); - - /* Update the page status */ - for (i = start_index - first_index; i <= last_index - first_index; i++) { - ClearPageChecked(pages[i]); - btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len); - } - btrfs_delalloc_release_extents(inode, len); - extent_changeset_free(data_reserved); - - return ret; -} - -static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, - u32 extent_thresh, u64 newer_than, bool do_compress, - u64 *last_scanned_ret) -{ - struct extent_state *cached_state = NULL; - struct defrag_target_range *entry; - struct defrag_target_range *tmp; - LIST_HEAD(target_list); - struct page **pages; - const u32 sectorsize = inode->root->fs_info->sectorsize; - u64 last_index = (start + len - 1) >> PAGE_SHIFT; - u64 start_index = start >> PAGE_SHIFT; - unsigned int nr_pages = last_index - start_index + 1; - int ret = 0; - int i; - - ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); - ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); - - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); - if (!pages) - return -ENOMEM; - - /* Prepare all pages */ - for (i = 0; i < nr_pages; i++) { - pages[i] = defrag_prepare_one_page(inode, start_index + i); - if (IS_ERR(pages[i])) { - ret = PTR_ERR(pages[i]); - pages[i] = NULL; - goto free_pages; - } - } - for (i = 0; i < nr_pages; i++) - wait_on_page_writeback(pages[i]); - - /* Lock the pages range */ - lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); - /* - * Now we have a consistent view about the extent map, re-check - * which range really needs to be defragged. - * - * And this time we have extent locked already, pass @locked = true - * so that we won't relock the extent range and cause deadlock. 
- */ - ret = defrag_collect_targets(inode, start, len, extent_thresh, - newer_than, do_compress, true, - &target_list, last_scanned_ret); - if (ret < 0) - goto unlock_extent; - - list_for_each_entry(entry, &target_list, list) { - ret = defrag_one_locked_target(inode, entry, pages, nr_pages, - &cached_state); - if (ret < 0) - break; - } - - list_for_each_entry_safe(entry, tmp, &target_list, list) { - list_del_init(&entry->list); - kfree(entry); - } -unlock_extent: - unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); -free_pages: - for (i = 0; i < nr_pages; i++) { - if (pages[i]) { - unlock_page(pages[i]); - put_page(pages[i]); - } - } - kfree(pages); - return ret; -} - -static int defrag_one_cluster(struct btrfs_inode *inode, - struct file_ra_state *ra, - u64 start, u32 len, u32 extent_thresh, - u64 newer_than, bool do_compress, - unsigned long *sectors_defragged, - unsigned long max_sectors, - u64 *last_scanned_ret) -{ - const u32 sectorsize = inode->root->fs_info->sectorsize; - struct defrag_target_range *entry; - struct defrag_target_range *tmp; - LIST_HEAD(target_list); - int ret; - - ret = defrag_collect_targets(inode, start, len, extent_thresh, - newer_than, do_compress, false, - &target_list, NULL); - if (ret < 0) - goto out; - - list_for_each_entry(entry, &target_list, list) { - u32 range_len = entry->len; - - /* Reached or beyond the limit */ - if (max_sectors && *sectors_defragged >= max_sectors) { - ret = 1; - break; - } - - if (max_sectors) - range_len = min_t(u32, range_len, - (max_sectors - *sectors_defragged) * sectorsize); - - /* - * If defrag_one_range() has updated last_scanned_ret, - * our range may already be invalid (e.g. hole punched). - * Skip if our range is before last_scanned_ret, as there is - * no need to defrag the range anymore. - */ - if (entry->start + range_len <= *last_scanned_ret) - continue; - - if (ra) - page_cache_sync_readahead(inode->vfs_inode.i_mapping, - ra, NULL, entry->start >> PAGE_SHIFT, - ((entry->start + range_len - 1) >> PAGE_SHIFT) - - (entry->start >> PAGE_SHIFT) + 1); - /* - * Here we may not defrag any range if holes are punched before - * we locked the pages. - * But that's fine, it only affects the @sectors_defragged - * accounting. - */ - ret = defrag_one_range(inode, entry->start, range_len, - extent_thresh, newer_than, do_compress, - last_scanned_ret); - if (ret < 0) - break; - *sectors_defragged += range_len >> - inode->root->fs_info->sectorsize_bits; - } -out: - list_for_each_entry_safe(entry, tmp, &target_list, list) { - list_del_init(&entry->list); - kfree(entry); - } - if (ret >= 0) - *last_scanned_ret = max(*last_scanned_ret, start + len); - return ret; -} - -/* - * Entry point to file defragmentation. - * - * @inode: inode to be defragged - * @ra: readahead state (can be NULL) - * @range: defrag options including range and flags - * @newer_than: minimum transid to defrag - * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode - * will be defragged. - * - * Return <0 for error. - * Return >=0 for the number of sectors defragged, and range->start will be updated - * to indicate the file offset where next defrag should be started at. - * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without - * defragging all the range).
- */ -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, - struct btrfs_ioctl_defrag_range_args *range, - u64 newer_than, unsigned long max_to_defrag) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - unsigned long sectors_defragged = 0; - u64 isize = i_size_read(inode); - u64 cur; - u64 last_byte; - bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS; - bool ra_allocated = false; - int compress_type = BTRFS_COMPRESS_ZLIB; - int ret = 0; - u32 extent_thresh = range->extent_thresh; - pgoff_t start_index; - - if (isize == 0) - return 0; - - if (range->start >= isize) - return -EINVAL; - - if (do_compress) { - if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) - return -EINVAL; - if (range->compress_type) - compress_type = range->compress_type; - } - - if (extent_thresh == 0) - extent_thresh = SZ_256K; - - if (range->start + range->len > range->start) { - /* Got a specific range */ - last_byte = min(isize, range->start + range->len); - } else { - /* Defrag until file end */ - last_byte = isize; - } - - /* Align the range */ - cur = round_down(range->start, fs_info->sectorsize); - last_byte = round_up(last_byte, fs_info->sectorsize) - 1; - - /* - * If we were not given a ra, allocate a readahead context. As - * readahead is just an optimization, defrag will work without it so - * we don't error out. - */ - if (!ra) { - ra_allocated = true; - ra = kzalloc(sizeof(*ra), GFP_KERNEL); - if (ra) - file_ra_state_init(ra, inode->i_mapping); - } - - /* - * Make writeback start from the beginning of the range, so that the - * defrag range can be written sequentially. - */ - start_index = cur >> PAGE_SHIFT; - if (start_index < inode->i_mapping->writeback_index) - inode->i_mapping->writeback_index = start_index; - - while (cur < last_byte) { - const unsigned long prev_sectors_defragged = sectors_defragged; - u64 last_scanned = cur; - u64 cluster_end; - - if (btrfs_defrag_cancelled(fs_info)) { - ret = -EAGAIN; - break; - } - - /* We want the cluster end at page boundary when possible */ - cluster_end = (((cur >> PAGE_SHIFT) + - (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; - cluster_end = min(cluster_end, last_byte); - - btrfs_inode_lock(inode, 0); - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - btrfs_inode_unlock(inode, 0); - break; - } - if (!(inode->i_sb->s_flags & SB_ACTIVE)) { - btrfs_inode_unlock(inode, 0); - break; - } - if (do_compress) - BTRFS_I(inode)->defrag_compress = compress_type; - ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, - cluster_end + 1 - cur, extent_thresh, - newer_than, do_compress, &sectors_defragged, - max_to_defrag, &last_scanned); - - if (sectors_defragged > prev_sectors_defragged) - balance_dirty_pages_ratelimited(inode->i_mapping); - - btrfs_inode_unlock(inode, 0); - if (ret < 0) - break; - cur = max(cluster_end + 1, last_scanned); - if (ret > 0) { - ret = 0; - break; - } - cond_resched(); - } - - if (ra_allocated) - kfree(ra); - /* - * Update range.start for autodefrag, this will indicate where to start - * in next run. - */ - range->start = cur; - if (sectors_defragged) { - /* - * We have defragged some sectors, for compression case they - * need to be written back immediately.
- */ - if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { - filemap_flush(inode->i_mapping); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_flush(inode->i_mapping); - } - if (range->compress_type == BTRFS_COMPRESS_LZO) - btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); - else if (range->compress_type == BTRFS_COMPRESS_ZSTD) - btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD); - ret = sectors_defragged; - } - if (do_compress) { - btrfs_inode_lock(inode, 0); - BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - btrfs_inode_unlock(inode, 0); - } - return ret; -} - /* * Try to start exclusive operation @type or cancel it if it's running. * From 59b818e064ab9051cd344b82420307e772d6bca7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:25 -0400 Subject: [PATCH 107/249] btrfs: move defrag related prototypes to their own header Now that the defrag code is all in one file, create a defrag.h and move all the defrag related prototypes and helper out of ctree.h and into defrag.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 18 ------------------ fs/btrfs/defrag.c | 1 + fs/btrfs/defrag.h | 22 ++++++++++++++++++++++ fs/btrfs/disk-io.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/transaction.c | 1 + 8 files changed, 28 insertions(+), 18 deletions(-) create mode 100644 fs/btrfs/defrag.h diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a9267b117860..689220637d00 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -773,19 +773,10 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns, int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int __pure btrfs_is_empty_uuid(u8 *uuid); -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, - struct btrfs_ioctl_defrag_range_args *range, - u64 newer_than, unsigned long max_to_defrag); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); /* file.c */ -int __init btrfs_auto_defrag_init(void); -void __cold btrfs_auto_defrag_exit(void); -int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u32 extent_thresh); -int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); -void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); extern const struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, @@ -811,10 +802,6 @@ void btrfs_check_nocow_unlock(struct btrfs_inode *inode); bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, u64 *delalloc_start_ret, u64 *delalloc_end_ret); -/* tree-defrag.c */ -int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, - struct btrfs_root *root); - /* super.c */ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); @@ -945,11 +932,6 @@ static inline int is_fstree(u64 rootid) return 0; } -static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) -{ - return signal_pending(current); -} - /* verity.c */ #ifdef CONFIG_FS_VERITY diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 2c260eef40d4..291263f93e47 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -13,6 +13,7 @@ #include "messages.h" #include "delalloc-space.h" #include "subpage.h" 
+#include "defrag.h" static struct kmem_cache *btrfs_inode_defrag_cachep; diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h new file mode 100644 index 000000000000..5305f2283b5e --- /dev/null +++ b/fs/btrfs/defrag.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DEFRAG_H +#define BTRFS_DEFRAG_H + +int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_to_defrag); +int __init btrfs_auto_defrag_init(void); +void __cold btrfs_auto_defrag_exit(void); +int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u32 extent_thresh); +int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); +void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root); + +static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) +{ + return signal_pending(current); +} + +#endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ebc9baab0e47..2f7dd79a5a35 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -47,6 +47,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "defrag.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5b9857e30a08..97f28e37ec7c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -59,6 +59,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "defrag.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 42453dd2ed31..f91dbd1dc6d1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -54,6 +54,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "defrag.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index a4030dfeb2f2..f105d360d6c9 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -52,6 +52,7 @@ #include "raid56.h" #include "fs.h" #include "accessors.h" +#include "defrag.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 82b2e2ec90cf..99d3f2a66227 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -28,6 +28,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "defrag.h" static struct kmem_cache *btrfs_trans_handle_cachep; From f2b39277b87dbb26f15e2e3b667df25dad3dd2ea Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:26 -0400 Subject: [PATCH 108/249] btrfs: move dir-item prototypes into dir-item.h Move these prototypes out of ctree.h and into their own header file.
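Like defrag.h before it, the new header deliberately carries no includes of its own; it assumes the shared types (struct btrfs_root, struct btrfs_path, struct fscrypt_str and friends) are already declared by the time it is pulled in. A minimal sketch of the include order a consumer is expected to follow (illustrative only, not part of the patch):

#include "ctree.h"	/* declares struct btrfs_root, struct btrfs_path, ... */
#include "dir-item.h"	/* the prototypes moved out of ctree.h */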
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 38 -------------------------------------- fs/btrfs/dir-item.c | 1 + fs/btrfs/dir-item.h | 42 ++++++++++++++++++++++++++++++++++++++++++ fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/send.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/tree-log.c | 1 + fs/btrfs/xattr.c | 1 + 10 files changed, 50 insertions(+), 38 deletions(-) create mode 100644 fs/btrfs/dir-item.h diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 689220637d00..a9666dc95105 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -689,44 +689,6 @@ int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, u64 subid); int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); -/* dir-item.c */ -int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, - const struct fscrypt_str *name); -int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, - const struct fscrypt_str *name, struct btrfs_inode *dir, - struct btrfs_key *location, u8 type, u64 index); -struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - const struct fscrypt_str *name, int mod); -struct btrfs_dir_item * -btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - u64 index, const struct fscrypt_str *name, int mod); -struct btrfs_dir_item * -btrfs_search_dir_index_item(struct btrfs_root *root, - struct btrfs_path *path, u64 dirid, - const struct fscrypt_str *name); -int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_dir_item *di); -int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid, - const char *name, u16 name_len, - const void *data, u16 data_len); -struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 dir, - const char *name, u16 name_len, - int mod); -struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, - const char *name, - int name_len); - /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index ca69fb35a2cc..082eb0e19598 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -8,6 +8,7 @@ #include "disk-io.h" #include "transaction.h" #include "accessors.h" +#include "dir-item.h" /* * insert a name into a directory, doing overflow properly if there is a hash diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h new file mode 100644 index 000000000000..aab4b7cc7fa0 --- /dev/null +++ b/fs/btrfs/dir-item.h @@ -0,0 +1,42 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_DIR_ITEM_H +#define BTRFS_DIR_ITEM_H + +int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, + const struct fscrypt_str *name); +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + const struct fscrypt_str *name, struct btrfs_inode *dir, + struct btrfs_key *location, u8 type, u64 index); +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const struct fscrypt_str *name, int mod); +struct 
btrfs_dir_item *btrfs_lookup_dir_index_item( + struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 index, const struct fscrypt_str *name, int mod); +struct btrfs_dir_item *btrfs_search_dir_index_item(struct btrfs_root *root, + struct btrfs_path *path, u64 dirid, + const struct fscrypt_str *name); +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di); +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len); +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + const char *name, + int name_len); + +#endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 97f28e37ec7c..a3130e7c803d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -60,6 +60,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" +#include "dir-item.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f91dbd1dc6d1..41599b068026 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -55,6 +55,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" +#include "dir-item.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 2ece3a030a66..0d3f25d4f147 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -28,6 +28,7 @@ #include "xattr.h" #include "print-tree.h" #include "accessors.h" +#include "dir-item.h" /* * Maximum number of references an extent can have in order for us to attempt to diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f105d360d6c9..9756b0cda626 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -53,6 +53,7 @@ #include "fs.h" #include "accessors.h" #include "defrag.h" +#include "dir-item.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 99d3f2a66227..a0b7702d980f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -29,6 +29,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" +#include "dir-item.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e358df3668e7..c168db7519e7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -25,6 +25,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "dir-item.h" #define MAX_CONFLICT_INODES 10 diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index fcf2d5f7e198..0ed4b119a7ca 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -22,6 +22,7 @@ #include "props.h" #include "locking.h" #include "accessors.h" +#include "dir-item.h" int btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size) From 7c8ede16280586c72c36af6604985d714b84a32c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:27 -0400 Subject: [PATCH 109/249] btrfs: move file-item prototypes into their own header Move these prototypes out of ctree.h and into file-item.h.
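Of the moved prototypes, btrfs_lookup_csums_range() is a typical consumer-facing one: it fills a caller-supplied list with struct btrfs_ordered_sum entries that the caller owns and must free. A hedged sketch of the calling pattern (the csum_root variable and the cleanup loop are assumptions for illustration, not taken from this patch):

LIST_HEAD(csums);
struct btrfs_ordered_sum *sums, *tmp;
int ret;

/* Collect checksums covering [start, end], searching the commit root. */
ret = btrfs_lookup_csums_range(csum_root, start, end, &csums, 1, false);

list_for_each_entry_safe(sums, tmp, &csums, list) {
	/* ... consume sums->sums, then free; the caller owns the entries ... */
	list_del(&sums->list);
	kfree(sums);
}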
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 1 + fs/btrfs/ctree.h | 31 ------------------------------- fs/btrfs/defrag.c | 1 + fs/btrfs/delayed-inode.c | 1 + fs/btrfs/extent-tree.c | 1 + fs/btrfs/extent_io.c | 1 + fs/btrfs/file-item.c | 1 + fs/btrfs/file-item.h | 35 +++++++++++++++++++++++++++++++++++ fs/btrfs/file.c | 1 + fs/btrfs/free-space-cache.c | 1 + fs/btrfs/inode-item.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/reflink.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/scrub.c | 1 + fs/btrfs/send.c | 1 + fs/btrfs/tree-log.c | 1 + 17 files changed, 50 insertions(+), 31 deletions(-) create mode 100644 fs/btrfs/file-item.h diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c0615af0434f..61828f8375e6 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -34,6 +34,7 @@ #include "extent_map.h" #include "subpage.h" #include "zoned.h" +#include "file-item.h" static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index a9666dc95105..6bfea55c82a0 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -695,37 +695,6 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); -/* file-item.c */ -int btrfs_del_csums(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); -int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 objectid, u64 pos, - u64 num_bytes); -int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, u64 objectid, - u64 bytenr, int mod); -int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_ordered_sum *sums); -blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 offset, bool one_ordered); -int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait); -void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, - const struct btrfs_path *path, - struct btrfs_file_extent_item *fi, - const bool new_inline, - struct extent_map *em); -int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, - u64 len); -int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, - u64 len); -void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); -u64 btrfs_file_extent_end(const struct btrfs_path *path); - /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 291263f93e47..ed82085acea3 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -14,6 +14,7 @@ #include "delalloc-space.h" #include "subpage.h" #include "defrag.h" +#include "file-item.h" static struct kmem_cache *btrfs_inode_defrag_cachep; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index f93d2695e423..c024f97de9e0 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -18,6 +18,7 @@ #include "inode-item.h" #include "space-info.h" #include "accessors.h" +#include "file-item.h" #define BTRFS_DELAYED_WRITEBACK 512 
#define BTRFS_DELAYED_BACKGROUND 128 diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 510be00da406..940d4fe23cfb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -40,6 +40,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "file-item.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 545d9e1f0f83..ea31a326ae93 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -32,6 +32,7 @@ #include "compression.h" #include "fs.h" #include "accessors.h" +#include "file-item.h" static struct kmem_cache *extent_buffer_cache; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 403a857d230e..20d88cd0b602 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -19,6 +19,7 @@ #include "compression.h" #include "fs.h" #include "accessors.h" +#include "file-item.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h new file mode 100644 index 000000000000..51cd3ab5948c --- /dev/null +++ b/fs/btrfs/file-item.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FILE_ITEM_H +#define BTRFS_FILE_ITEM_H + +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len); +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); +int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 pos, + u64 num_bytes); +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 bytenr, int mod); +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums); +blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, + u64 offset, bool one_ordered); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); +void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, + const struct btrfs_path *path, + struct btrfs_file_extent_item *fi, + const bool new_inline, + struct extent_map *em); +int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, + u64 len); +int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len); +void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size); +u64 btrfs_file_extent_end(const struct btrfs_path *path); + +#endif diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8500472aa6ef..9c100198dd26 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -33,6 +33,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "file-item.h" /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. 
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 599b41523479..bc1b9aa164ec 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -27,6 +27,7 @@ #include "subpage.h" #include "inode-item.h" #include "accessors.h" +#include "file-item.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 724507ce7a7d..b65c45b5d681 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -13,6 +13,7 @@ #include "space-info.h" #include "accessors.h" #include "extent-tree.h" +#include "file-item.h" struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf, int slot, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a3130e7c803d..7231f6b69096 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -61,6 +61,7 @@ #include "root-tree.h" #include "defrag.h" #include "dir-item.h" +#include "file-item.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 9ba7914c7169..204bb884ab91 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -12,6 +12,7 @@ #include "transaction.h" #include "subpage.h" #include "accessors.h" +#include "file-item.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e345cc71da15..e86364bdac8e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -32,6 +32,7 @@ #include "accessors.h" #include "extent-tree.h" #include "root-tree.h" +#include "file-item.h" /* * Relocation overview diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3aed760e1a80..e46e6c4d4bf9 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -23,6 +23,7 @@ #include "zoned.h" #include "fs.h" #include "accessors.h" +#include "file-item.h" /* * This is only the first step towards a full-features scrub. It reads all diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 0d3f25d4f147..4cc9e855a769 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -29,6 +29,7 @@ #include "print-tree.h" #include "accessors.h" #include "dir-item.h" +#include "file-item.h" /* * Maximum number of references an extent can have in order for us to attempt to diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c168db7519e7..7be540fb5c4a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -26,6 +26,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "dir-item.h" +#include "file-item.h" #define MAX_CONFLICT_INODES 10 From c7a03b524d30382ed45883cd27906678e6a0dada Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:28 -0400 Subject: [PATCH 110/249] btrfs: move uuid tree prototypes to uuid-tree.h Move these out of ctree.h into uuid-tree.h to cut down on the code in ctree.h. 
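These three prototypes are the entire uuid tree API. A hedged sketch of the add side (the key type and error handling here are illustrative assumptions, not part of this patch):

/* Record a subvolume UUID -> subvolume id mapping in the uuid tree. */
ret = btrfs_uuid_tree_add(trans, root_item->uuid, BTRFS_UUID_KEY_SUBVOL,
			  root->root_key.objectid);
if (ret)
	btrfs_abort_transaction(trans, ret);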
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 7 ------- fs/btrfs/disk-io.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/uuid-tree.c | 1 + fs/btrfs/uuid-tree.h | 12 ++++++++++++ fs/btrfs/volumes.c | 1 + 8 files changed, 18 insertions(+), 7 deletions(-) create mode 100644 fs/btrfs/uuid-tree.h diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6bfea55c82a0..910a23e7cd8f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -682,13 +682,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *node, struct extent_buffer *parent); -/* uuid-tree.c */ -int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, - u64 subid); -int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, - u64 subid); -int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); - /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2f7dd79a5a35..e8082e3c5bdc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -48,6 +48,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "defrag.h" +#include "uuid-tree.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7231f6b69096..061fa0923772 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -62,6 +62,7 @@ #include "defrag.h" #include "dir-item.h" #include "file-item.h" +#include "uuid-tree.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 41599b068026..c2439aebebd4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -56,6 +56,7 @@ #include "root-tree.h" #include "defrag.h" #include "dir-item.h" +#include "uuid-tree.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a0b7702d980f..c6fe2ab8716b 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -30,6 +30,7 @@ #include "root-tree.h" #include "defrag.h" #include "dir-item.h" +#include "uuid-tree.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 70304b89f31f..7c7001f42b14 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -12,6 +12,7 @@ #include "print-tree.h" #include "fs.h" #include "accessors.h" +#include "uuid-tree.h" static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key) { diff --git a/fs/btrfs/uuid-tree.h b/fs/btrfs/uuid-tree.h new file mode 100644 index 000000000000..5350c87fe2ca --- /dev/null +++ b/fs/btrfs/uuid-tree.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_UUID_TREE_H +#define BTRFS_UUID_TREE_H + +int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_remove(struct btrfs_trans_handle *trans, u8 *uuid, u8 type, + u64 subid); +int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info); + +#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index f61bb79d4a7e..55164b2dba2a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -35,6 +35,7 @@ #include "zoned.h" #include "fs.h" #include "accessors.h" +#include "uuid-tree.h" static struct bio_set btrfs_bioset; From 7572dec8f5223186ed0fa7f6da47dba98651cbf4 Mon Sep 17 00:00:00 2001 
From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:29 -0400 Subject: [PATCH 111/249] btrfs: move ioctl prototypes into ioctl.h Move these out of ctree.h into ioctl.h to cut down on code in ctree.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 12 ------------ fs/btrfs/file.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/ioctl.h | 17 +++++++++++++++++ fs/btrfs/send.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/verity.c | 1 + fs/btrfs/volumes.c | 1 + 10 files changed, 25 insertions(+), 12 deletions(-) create mode 100644 fs/btrfs/ioctl.h diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 910a23e7cd8f..87930337b301 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -688,18 +688,6 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); -/* ioctl.c */ -long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int btrfs_fileattr_set(struct user_namespace *mnt_userns, - struct dentry *dentry, struct fileattr *fa); -int btrfs_ioctl_get_supported_features(void __user *arg); -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); -int __pure btrfs_is_empty_uuid(u8 *uuid); -void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, - struct btrfs_ioctl_balance_args *bargs); - /* file.c */ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); extern const struct file_operations btrfs_file_operations; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 9c100198dd26..22063340a4c2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -34,6 +34,7 @@ #include "accessors.h" #include "extent-tree.h" #include "file-item.h" +#include "ioctl.h" /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 061fa0923772..7111eec812dc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -63,6 +63,7 @@ #include "dir-item.h" #include "file-item.h" #include "uuid-tree.h" +#include "ioctl.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c2439aebebd4..4c5c2f890adc 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -57,6 +57,7 @@ #include "defrag.h" #include "dir-item.h" #include "uuid-tree.h" +#include "ioctl.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h new file mode 100644 index 000000000000..8a855d5ac2fa --- /dev/null +++ b/fs/btrfs/ioctl.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_IOCTL_H +#define BTRFS_IOCTL_H + +long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int btrfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); +int btrfs_ioctl_get_supported_features(void __user *arg); +void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); +int __pure btrfs_is_empty_uuid(u8 *uuid); +void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, + struct btrfs_ioctl_balance_args *bargs); + +#endif diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 4cc9e855a769..e9b7deccc5fe 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -30,6 +30,7 @@ #include "accessors.h" #include "dir-item.h" #include "file-item.h" +#include "ioctl.h" /* * Maximum number of references an extent can have in order for us to attempt to diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9756b0cda626..2cd05246bcd3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -54,6 +54,7 @@ #include "accessors.h" #include "defrag.h" #include "dir-item.h" +#include "ioctl.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c6fe2ab8716b..bbc3ad3e297c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -31,6 +31,7 @@ #include "defrag.h" #include "dir-item.h" #include "uuid-tree.h" +#include "ioctl.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index d02fa354fc2b..00ba5143a17d 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -18,6 +18,7 @@ #include "locking.h" #include "fs.h" #include "accessors.h" +#include "ioctl.h" /* * Implementation of the interface defined in struct fsverity_operations. diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 55164b2dba2a..823de2f29289 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -36,6 +36,7 @@ #include "fs.h" #include "accessors.h" #include "uuid-tree.h" +#include "ioctl.h" static struct bio_set btrfs_bioset; From af142b6f44d36c4d0e1e53acbedbc30a588c58de Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:30 -0400 Subject: [PATCH 112/249] btrfs: move file prototypes to file.h Move these out of ctree.h into file.h to cut down on code in ctree.h.
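Among the moved declarations, btrfs_check_nocow_lock() and btrfs_check_nocow_unlock() form a pair around NOCOW writes; a hedged sketch of the pattern the prototypes imply (the surrounding write path is elided and assumed):

size_t write_bytes = count;

/* A return > 0 means the range is locked and write_bytes may be clamped. */
if (btrfs_check_nocow_lock(inode, pos, &write_bytes, nowait) > 0) {
	/* ... do the NOCOW write of up to write_bytes bytes ... */
	btrfs_check_nocow_unlock(inode);
}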
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 26 -------------------------- fs/btrfs/extent_io.c | 1 + fs/btrfs/file.c | 1 + fs/btrfs/file.h | 32 ++++++++++++++++++++++++++++++++ fs/btrfs/free-space-cache.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/ordered-data.c | 1 + fs/btrfs/reflink.c | 1 + fs/btrfs/tree-log.c | 1 + 10 files changed, 40 insertions(+), 26 deletions(-) create mode 100644 fs/btrfs/file.h diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 87930337b301..3a46b5b688e3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -688,32 +688,6 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); -/* file.c */ -int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); -extern const struct file_operations btrfs_file_operations; -int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct btrfs_inode *inode, - struct btrfs_drop_extents_args *args); -int btrfs_replace_file_extents(struct btrfs_inode *inode, - struct btrfs_path *path, const u64 start, - const u64 end, - struct btrfs_replace_extent_info *extent_info, - struct btrfs_trans_handle **trans_out); -int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode, u64 start, u64 end); -ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, - const struct btrfs_ioctl_encoded_io_args *encoded); -int btrfs_release_file(struct inode *inode, struct file *file); -int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, - size_t num_pages, loff_t pos, size_t write_bytes, - struct extent_state **cached, bool noreserve); -int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); -int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes, bool nowait); -void btrfs_check_nocow_unlock(struct btrfs_inode *inode); -bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, - u64 *delalloc_start_ret, u64 *delalloc_end_ret); - /* super.c */ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, unsigned long new_flags); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ea31a326ae93..7fe637408e98 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -33,6 +33,7 @@ #include "fs.h" #include "accessors.h" #include "file-item.h" +#include "file.h" static struct kmem_cache *extent_buffer_cache; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 22063340a4c2..b94dc4b2c486 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -35,6 +35,7 @@ #include "extent-tree.h" #include "file-item.h" #include "ioctl.h" +#include "file.h" /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. 
diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h new file mode 100644 index 000000000000..f3d794e33d46 --- /dev/null +++ b/fs/btrfs/file.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_FILE_H +#define BTRFS_FILE_H + +extern const struct file_operations btrfs_file_operations; + +int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_inode *inode, + struct btrfs_drop_extents_args *args); +int btrfs_replace_file_extents(struct btrfs_inode *inode, + struct btrfs_path *path, const u64 start, + const u64 end, + struct btrfs_replace_extent_info *extent_info, + struct btrfs_trans_handle **trans_out); +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_inode *inode, u64 start, u64 end); +ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); +int btrfs_release_file(struct inode *inode, struct file *file); +int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, + size_t num_pages, loff_t pos, size_t write_bytes, + struct extent_state **cached, bool noreserve); +int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); +int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes, bool nowait); +void btrfs_check_nocow_unlock(struct btrfs_inode *inode); +bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + u64 *delalloc_start_ret, u64 *delalloc_end_ret); + +#endif diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index bc1b9aa164ec..aef075b63188 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -28,6 +28,7 @@ #include "inode-item.h" #include "accessors.h" #include "file-item.h" +#include "file.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7111eec812dc..3fe3301b88b3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -64,6 +64,7 @@ #include "file-item.h" #include "uuid-tree.h" #include "ioctl.h" +#include "file.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 4c5c2f890adc..dba1f07b4194 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -58,6 +58,7 @@ #include "dir-item.h" #include "uuid-tree.h" #include "ioctl.h" +#include "file.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 1c36407803ca..2aea2a17ed95 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -18,6 +18,7 @@ #include "delalloc-space.h" #include "qgroup.h" #include "subpage.h" +#include "file.h" static struct kmem_cache *btrfs_ordered_extent_cache; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 204bb884ab91..3c962b5d8dbd 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -13,6 +13,7 @@ #include "subpage.h" #include "accessors.h" #include "file-item.h" +#include "file.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7be540fb5c4a..1c505713511c 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -27,6 +27,7 @@ #include "root-tree.h" #include "dir-item.h" #include "file-item.h" +#include "file.h" #define MAX_CONFLICT_INODES 10 From b538a271ae9bd07e926b1e3bbfcd1aebf63e5860 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: 
Wed, 26 Oct 2022 15:08:31 -0400 Subject: [PATCH 113/249] btrfs: move the 32bit warn defines into messages.h The code for these functions are in messages.c, move the defines and prototypes to messages.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 13 ------------- fs/btrfs/messages.h | 13 +++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3a46b5b688e3..ae2af0aeebc4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -695,19 +695,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); -#if BITS_PER_LONG == 32 -#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) -/* - * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical - * addresses of extents. - * - * For 4K page size it's about 10T, for 64K it's 160T. - */ -#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8) -void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info); -void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info); -#endif - /* * Get the correct offset inside the page of extent buffer. * diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 3772358f8d30..5ac410e313e4 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -229,4 +229,17 @@ do { \ BUG(); \ } while (0) +#if BITS_PER_LONG == 32 +#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) +/* + * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical + * addresses of extents. + * + * For 4K page size it's about 10T, for 64K it's 160T. + */ +#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8) +void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info); +void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info); +#endif + #endif From cc68414c6123fcc436c96d8920f48120b0ca2613 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:32 -0400 Subject: [PATCH 114/249] btrfs: move the snapshot drop related prototypes to extent-tree.h These belong in extent-tree.h, they were missed because they were not grouped with the other extent-tree.c prototypes. 
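btrfs_drop_snapshot() is the entry point for deleting a dead root; a hedged sketch of a caller (the flag values are assumptions chosen for illustration, not part of this patch):

/*
 * update_ref=1: drop shared references on the way down;
 * for_reloc=0: an ordinary subvolume deletion, which may exit
 * early with -EAGAIN.
 */
ret = btrfs_drop_snapshot(root, 1, 0);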
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ------ fs/btrfs/extent-tree.h | 6 ++++++ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index ae2af0aeebc4..dcbfb1b9d269 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -675,12 +675,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) return btrfs_next_old_item(root, p, 0); } int btrfs_leaf_free_space(struct extent_buffer *leaf); -int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, - int for_reloc); -int btrfs_drop_subtree(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct extent_buffer *node, - struct extent_buffer *parent); /* orphan.c */ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index b3674b008d58..ae5425253603 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -68,5 +68,11 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); +int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, + int for_reloc); +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent); #endif From 33cf97a7b6585efc57c1277b9e346afa7c542999 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:33 -0400 Subject: [PATCH 115/249] btrfs: move acl prototypes into acl.h Move these out of ctree.h into acl.h to cut down on code in ctree.h. 
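The header keeps the config-stub pattern so callers stay free of #ifdefs: with CONFIG_BTRFS_FS_POSIX_ACL disabled the inode operation table entries collapse to NULL and __btrfs_set_acl() degrades to -EOPNOTSUPP. The same idiom in miniature (a generic illustration, not code from this patch):

#ifdef CONFIG_FOO
int foo_get(struct inode *inode);
#else
#define foo_get NULL	/* the VFS skips a NULL operation */
#endif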
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/acl.c | 2 +- fs/btrfs/acl.h | 27 +++++++++++++++++++++++++++ fs/btrfs/ctree.h | 18 ------------------ fs/btrfs/inode.c | 1 + 4 files changed, 29 insertions(+), 19 deletions(-) create mode 100644 fs/btrfs/acl.h diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 548d6a5477b4..100bae33c677 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -11,10 +11,10 @@ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/slab.h> - #include "ctree.h" #include "btrfs_inode.h" #include "xattr.h" +#include "acl.h" struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) { diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h new file mode 100644 index 000000000000..45197b4f73bf --- /dev/null +++ b/fs/btrfs/acl.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_ACL_H +#define BTRFS_ACL_H + +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + +struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); +int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, + struct posix_acl *acl, int type); +int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, + struct posix_acl *acl, int type); + +#else + +#define btrfs_get_acl NULL +#define btrfs_set_acl NULL +static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, + int type) +{ + return -EOPNOTSUPP; +} + +#endif + +#endif diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index dcbfb1b9d269..040b640b0222 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -732,24 +732,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -/* acl.c */ -#ifdef CONFIG_BTRFS_FS_POSIX_ACL -struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); -int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, - struct posix_acl *acl, int type); -int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, - struct posix_acl *acl, int type); -#else -#define btrfs_get_acl NULL -#define btrfs_set_acl NULL -static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct posix_acl *acl, - int type) -{ - return -EOPNOTSUPP; -} -#endif - /* relocation.c */ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3fe3301b88b3..cb038dbbca7d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -65,6 +65,7 @@ #include "uuid-tree.h" #include "ioctl.h" #include "file.h" +#include "acl.h" struct btrfs_iget_args { u64 ino; From 677074792a1d533232ec5517f23f78d64e6dffac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:34 -0400 Subject: [PATCH 116/249] btrfs: move relocation prototypes into relocation.h Move these out of ctree.h into relocation.h to cut down on code in ctree.h Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/backref.c | 1 + fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 20 -------------------- fs/btrfs/disk-io.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/relocation.h | 23 +++++++++++++++++++++++ fs/btrfs/transaction.c | 1 + fs/btrfs/volumes.c | 1 + 9 files changed, 30 insertions(+), 20 deletions(-) create mode 100644 fs/btrfs/relocation.h diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 
173df40da984..04cb608e7cfb 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -18,6 +18,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "relocation.h" /* Just arbitrary numbers so we can be sure one of these happened. */ #define BACKREF_FOUND_SHARED 6 diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4b47d380da1e..0acd85111cdf 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -21,6 +21,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "relocation.h" static struct kmem_cache *btrfs_path_cachep; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 040b640b0222..b1b6de508e20 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -732,26 +732,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -/* relocation.c */ -int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); -int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, - struct btrfs_root *root); -int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); -int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); -int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct extent_buffer *buf, - struct extent_buffer *cow); -void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, - u64 *bytes_to_reserve); -int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, - struct btrfs_pending_snapshot *pending); -int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); -struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, - u64 bytenr); -int btrfs_should_ignore_reloc_root(struct btrfs_root *root); - /* scrub.c */ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, u64 end, struct btrfs_scrub_progress *progress, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e8082e3c5bdc..a613a9a7caae 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -49,6 +49,7 @@ #include "root-tree.h" #include "defrag.h" #include "uuid-tree.h" +#include "relocation.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cb038dbbca7d..d8856e621e01 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -66,6 +66,7 @@ #include "ioctl.h" #include "file.h" #include "acl.h" +#include "relocation.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e86364bdac8e..f31a97d4f9ad 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -33,6 +33,7 @@ #include "extent-tree.h" #include "root-tree.h" #include "file-item.h" +#include "relocation.h" /* * Relocation overview diff --git a/fs/btrfs/relocation.h b/fs/btrfs/relocation.h new file mode 100644 index 000000000000..2041a86186de --- /dev/null +++ b/fs/btrfs/relocation.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_RELOCATION_H +#define BTRFS_RELOCATION_H + +int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start); +int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); +int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); +int 
btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *cow); +void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending, + u64 *bytes_to_reserve); +int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_pending_snapshot *pending); +int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info); +struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr); +int btrfs_should_ignore_reloc_root(struct btrfs_root *root); + +#endif diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index bbc3ad3e297c..a262fd188abe 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -32,6 +32,7 @@ #include "dir-item.h" #include "uuid-tree.h" #include "ioctl.h" +#include "relocation.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 823de2f29289..abfa6a9e20e6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -37,6 +37,7 @@ #include "accessors.h" #include "uuid-tree.h" #include "ioctl.h" +#include "relocation.h" static struct bio_set btrfs_bioset; From 2fc6822c99d7e902b7cef146efa37420d41c0c59 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:35 -0400 Subject: [PATCH 117/249] btrfs: move scrub prototypes into scrub.h Move these out of ctree.h into scrub.h to cut down on code in ctree.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 11 ----------- fs/btrfs/dev-replace.c | 1 + fs/btrfs/disk-io.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/scrub.c | 1 + fs/btrfs/scrub.h | 16 ++++++++++++++++ fs/btrfs/super.c | 1 + fs/btrfs/transaction.c | 1 + fs/btrfs/volumes.c | 1 + 9 files changed, 23 insertions(+), 11 deletions(-) create mode 100644 fs/btrfs/scrub.h diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b1b6de508e20..f3facc10646c 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -732,17 +732,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -/* scrub.c */ -int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, - u64 end, struct btrfs_scrub_progress *progress, - int readonly, int is_dev_replace); -void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); -void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); -int btrfs_scrub_cancel(struct btrfs_fs_info *info); -int btrfs_scrub_cancel_dev(struct btrfs_device *dev); -int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, - struct btrfs_scrub_progress *progress); - /* dev-replace.c */ void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 94f8975034ce..84af2010fae2 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -25,6 +25,7 @@ #include "block-group.h" #include "fs.h" #include "accessors.h" +#include "scrub.h" /* * Device replace overview diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a613a9a7caae..880bd39680bc 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,6 +50,7 @@ #include "defrag.h" #include "uuid-tree.h" #include "relocation.h" +#include "scrub.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dba1f07b4194..98cf16f118b5 100644 --- a/fs/btrfs/ioctl.c +++ 
b/fs/btrfs/ioctl.c @@ -59,6 +59,7 @@ #include "uuid-tree.h" #include "ioctl.h" #include "file.h" +#include "scrub.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index e46e6c4d4bf9..8c89e63ef2e8 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -24,6 +24,7 @@ #include "fs.h" #include "accessors.h" #include "file-item.h" +#include "scrub.h" /* * This is only the first step towards a full-features scrub. It reads all diff --git a/fs/btrfs/scrub.h b/fs/btrfs/scrub.h new file mode 100644 index 000000000000..7639103ebf9d --- /dev/null +++ b/fs/btrfs/scrub.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SCRUB_H +#define BTRFS_SCRUB_H + +int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, + u64 end, struct btrfs_scrub_progress *progress, + int readonly, int is_dev_replace); +void btrfs_scrub_pause(struct btrfs_fs_info *fs_info); +void btrfs_scrub_continue(struct btrfs_fs_info *fs_info); +int btrfs_scrub_cancel(struct btrfs_fs_info *info); +int btrfs_scrub_cancel_dev(struct btrfs_device *dev); +int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, + struct btrfs_scrub_progress *progress); + +#endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 2cd05246bcd3..4bfda9be4556 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -55,6 +55,7 @@ #include "defrag.h" #include "dir-item.h" #include "ioctl.h" +#include "scrub.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index a262fd188abe..593b2414e2be 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -33,6 +33,7 @@ #include "uuid-tree.h" #include "ioctl.h" #include "relocation.h" +#include "scrub.h" static struct kmem_cache *btrfs_trans_handle_cachep; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index abfa6a9e20e6..5606060770eb 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -38,6 +38,7 @@ #include "uuid-tree.h" #include "ioctl.h" #include "relocation.h" +#include "scrub.h" static struct bio_set btrfs_bioset; From 77407dc032e29f205a1e3922564c0abfd2e23d15 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:36 -0400 Subject: [PATCH 118/249] btrfs: move dev-replace prototypes into dev-replace.h We already have a dev-replace.h, simply move these prototypes and helpers into dev-replace.h where they belong. 
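For orientation, the moved bio counter is what lets device replace wait out in-flight I/O. A minimal sketch of the intended pairing (the two wrapper functions below are made up for illustration):

static void start_counted_io(struct btrfs_fs_info *fs_info)
{
	/* May block while a replace operation holds the counter locked. */
	btrfs_bio_counter_inc_blocked(fs_info);
}

static void finish_counted_io(struct btrfs_fs_info *fs_info)
{
	/* Equivalent to btrfs_bio_counter_sub(fs_info, 1). */
	btrfs_bio_counter_dec(fs_info);
}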
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 9 --------- fs/btrfs/dev-replace.h | 8 ++++++++ fs/btrfs/extent_io.c | 1 + 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f3facc10646c..84bc33ff003f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -732,15 +732,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) #define EXPORT_FOR_TESTS #endif -/* dev-replace.c */ -void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); -void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); - -static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) -{ - btrfs_bio_counter_sub(fs_info, 1); -} - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 6084b313056a..675082ccec89 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -25,5 +25,13 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, struct btrfs_block_group *cache, u64 physical); +void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); +void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount); + +static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info) +{ + btrfs_bio_counter_sub(fs_info, 1); +} + #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 7fe637408e98..6373b1565250 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -34,6 +34,7 @@ #include "accessors.h" #include "file-item.h" #include "file.h" +#include "dev-replace.h" static struct kmem_cache *extent_buffer_cache; From 5c11adcc383a94cacd652461c9435bf7a5c53c9c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:37 -0400 Subject: [PATCH 119/249] btrfs: move verity prototypes into verity.h Move these out of ctree.h into verity.h to cut down on code in ctree.h. 
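The new header keeps the existing CONFIG_FS_VERITY stub pattern, so callers build in both configurations. A hedged sketch with a hypothetical caller:

/*
 * Hypothetical caller: compiles with or without CONFIG_FS_VERITY. In the
 * latter case btrfs_drop_verity_items() resolves to the inline stub that
 * returns 0.
 */
static int maybe_drop_verity(struct btrfs_inode *inode)
{
	return btrfs_drop_verity_items(inode);
}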
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 22 ---------------------- fs/btrfs/inode.c | 1 + fs/btrfs/send.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/verity.c | 1 + fs/btrfs/verity.h | 28 ++++++++++++++++++++++++++++ 6 files changed, 32 insertions(+), 22 deletions(-) create mode 100644 fs/btrfs/verity.h diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 84bc33ff003f..15bb90536460 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -741,28 +741,6 @@ static inline int is_fstree(u64 rootid) return 0; } -/* verity.c */ -#ifdef CONFIG_FS_VERITY - -extern const struct fsverity_operations btrfs_verityops; -int btrfs_drop_verity_items(struct btrfs_inode *inode); -int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size); - -#else - -static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) -{ - return 0; -} - -static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf, - size_t buf_size) -{ - return -EPERM; -} - -#endif - /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d8856e621e01..0fda1c5ba28e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -67,6 +67,7 @@ #include "file.h" #include "acl.h" #include "relocation.h" +#include "verity.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index e9b7deccc5fe..3befc0d2d866 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -31,6 +31,7 @@ #include "dir-item.h" #include "file-item.h" #include "ioctl.h" +#include "verity.h" /* * Maximum number of references an extent can have in order for us to attempt to diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4bfda9be4556..ae49bdf71d32 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -56,6 +56,7 @@ #include "dir-item.h" #include "ioctl.h" #include "scrub.h" +#include "verity.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 00ba5143a17d..b31d6c7627ff 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -19,6 +19,7 @@ #include "fs.h" #include "accessors.h" #include "ioctl.h" +#include "verity.h" /* * Implementation of the interface defined in struct fsverity_operations. diff --git a/fs/btrfs/verity.h b/fs/btrfs/verity.h new file mode 100644 index 000000000000..91c10f7d0a46 --- /dev/null +++ b/fs/btrfs/verity.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_VERITY_H +#define BTRFS_VERITY_H + +#ifdef CONFIG_FS_VERITY + +extern const struct fsverity_operations btrfs_verityops; + +int btrfs_drop_verity_items(struct btrfs_inode *inode); +int btrfs_get_verity_descriptor(struct inode *inode, void *buf, size_t buf_size); + +#else + +static inline int btrfs_drop_verity_items(struct btrfs_inode *inode) +{ + return 0; +} + +static inline int btrfs_get_verity_descriptor(struct inode *inode, void *buf, + size_t buf_size) +{ + return -EPERM; +} + +#endif + +#endif From 6a6b4daf92fd8393eeac9631318d48d5b25ee4df Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:38 -0400 Subject: [PATCH 120/249] btrfs: move CONFIG_BTRFS_FS_RUN_SANITY_TESTS checks to fs.h We already have a few of these in fs.h, move the remaining checks out of ctree.h into fs.h. 
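To make the macro's effect concrete, a small made-up example: a function annotated with EXPORT_FOR_TESTS is static in normal builds, while a sanity-test build expands the macro to nothing, giving the function external linkage so the test code can call it.

/* Made-up example in some .c file: */
EXPORT_FOR_TESTS
int demo_helper(struct btrfs_fs_info *fs_info)
{
	/* Static normally, visible to the self-tests otherwise. */
	return btrfs_is_testing(fs_info);
}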
Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 15 --------------- fs/btrfs/fs.h | 9 +++++++++ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 15bb90536460..27bfedf3a9fb 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -722,16 +722,6 @@ static inline unsigned long get_eb_page_index(unsigned long offset) return offset >> PAGE_SHIFT; } -/* - * Use that for functions that are conditionally exported for sanity tests but - * otherwise static - */ -#ifndef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -#define EXPORT_FOR_TESTS static -#else -#define EXPORT_FOR_TESTS -#endif - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || @@ -741,11 +731,6 @@ static inline int is_fstree(u64 rootid) return 0; } -/* Sanity test specific functions */ -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_destroy_inode(struct inode *inode); -#endif - static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) { return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index a49d11127bf7..7d0da8509567 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -971,11 +971,20 @@ static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) &(fs_info)->fs_state))) #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + +#define EXPORT_FOR_TESTS + static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); } + +void btrfs_test_destroy_inode(struct inode *inode); + #else + +#define EXPORT_FOR_TESTS static + static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { return 0; From c03b22076bd2c9fe88c055a35637a836d986db76 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:39 -0400 Subject: [PATCH 121/249] btrfs: move super prototypes into super.h Move these out of ctree.h into super.h to cut down on code in ctree.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 7 ------- fs/btrfs/disk-io.c | 1 + fs/btrfs/ioctl.c | 1 + fs/btrfs/super.c | 1 + fs/btrfs/super.h | 12 ++++++++++++ 5 files changed, 15 insertions(+), 7 deletions(-) create mode 100644 fs/btrfs/super.h diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 27bfedf3a9fb..c32f6b6ae972 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -682,13 +682,6 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 offset); -/* super.c */ -int btrfs_parse_options(struct btrfs_fs_info *info, char *options, - unsigned long new_flags); -int btrfs_sync_fs(struct super_block *sb, int wait); -char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, - u64 subvol_objectid); - /* * Get the correct offset inside the page of extent buffer. 
* diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 880bd39680bc..f5f793af12a0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -51,6 +51,7 @@ #include "uuid-tree.h" #include "relocation.h" #include "scrub.h" +#include "super.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 98cf16f118b5..6f9f8e8dfea6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -60,6 +60,7 @@ #include "ioctl.h" #include "file.h" #include "scrub.h" +#include "super.h" #ifdef CONFIG_64BIT /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ae49bdf71d32..d54bfec8e506 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -57,6 +57,7 @@ #include "ioctl.h" #include "scrub.h" #include "verity.h" +#include "super.h" #define CREATE_TRACE_POINTS #include diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h new file mode 100644 index 000000000000..c8875653e628 --- /dev/null +++ b/fs/btrfs/super.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_SUPER_H +#define BTRFS_SUPER_H + +int btrfs_parse_options(struct btrfs_fs_info *info, char *options, + unsigned long new_flags); +int btrfs_sync_fs(struct super_block *sb, int wait); +char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid); + +#endif From 7f0add250f829248281e1745d92648f72192a8f4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:40 -0400 Subject: [PATCH 122/249] btrfs: move super_block specific helpers into super.h This will make syncing fs.h to user space a little easier if we can pull the super block specific helpers out of fs.h and put them in super.h. Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/compression.c | 1 + fs/btrfs/defrag.c | 1 + fs/btrfs/export.c | 1 + fs/btrfs/extent_io.c | 1 + fs/btrfs/file-item.c | 1 + fs/btrfs/file.c | 1 + fs/btrfs/free-space-cache.c | 1 + fs/btrfs/fs.h | 17 ----------------- fs/btrfs/inode.c | 1 + fs/btrfs/lzo.c | 1 + fs/btrfs/messages.c | 1 + fs/btrfs/ordered-data.c | 1 + fs/btrfs/props.c | 1 + fs/btrfs/reflink.c | 1 + fs/btrfs/relocation.c | 1 + fs/btrfs/super.h | 17 +++++++++++++++++ fs/btrfs/volumes.c | 1 + 17 files changed, 32 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 61828f8375e6..e66fa18dbcf7 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -35,6 +35,7 @@ #include "subpage.h" #include "zoned.h" #include "file-item.h" +#include "super.h" static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index ed82085acea3..48a9460d49c7 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -15,6 +15,7 @@ #include "subpage.h" #include "defrag.h" #include "file-item.h" +#include "super.h" static struct kmem_cache *btrfs_inode_defrag_cachep; diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index b6bc9684648f..744a02b7fd67 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -8,6 +8,7 @@ #include "print-tree.h" #include "export.h" #include "accessors.h" +#include "super.h" #define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ parent_objectid) / 4) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6373b1565250..bda420d697f4 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -35,6 +35,7 @@ #include 
"file-item.h" #include "file.h" #include "dev-replace.h" +#include "super.h" static struct kmem_cache *extent_buffer_cache; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 20d88cd0b602..036d50af5666 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -20,6 +20,7 @@ #include "fs.h" #include "accessors.h" #include "file-item.h" +#include "super.h" #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) * 2) / \ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b94dc4b2c486..f9bea57abbde 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -36,6 +36,7 @@ #include "file-item.h" #include "ioctl.h" #include "file.h" +#include "super.h" /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index aef075b63188..797edb41d0aa 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -29,6 +29,7 @@ #include "accessors.h" #include "file-item.h" #include "file.h" +#include "super.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) #define MAX_CACHE_BYTES_PER_GIG SZ_64K diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 7d0da8509567..c7f2a512fba2 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -801,11 +801,6 @@ static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_in return READ_ONCE(fs_info->last_root_drop_gen); } -static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) -{ - return sb->s_fs_info; -} - /* * Take the number of bytes to be checksummed and figure out how many leaves * it would require to store the csums for that many bytes. @@ -947,18 +942,6 @@ static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info) btrfs_fs_closing(fs_info); } -static inline void btrfs_set_sb_rdonly(struct super_block *sb) -{ - sb->s_flags |= SB_RDONLY; - set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); -} - -static inline void btrfs_clear_sb_rdonly(struct super_block *sb) -{ - sb->s_flags &= ~SB_RDONLY; - clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); -} - static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) { clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0fda1c5ba28e..83e5ae6b74ef 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -68,6 +68,7 @@ #include "acl.h" #include "relocation.h" #include "verity.h" +#include "super.h" struct btrfs_iget_args { u64 ino; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 6751874a3e69..e7b1ceffcd33 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -16,6 +16,7 @@ #include "messages.h" #include "compression.h" #include "ctree.h" +#include "super.h" #define LZO_LEN 4 diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 196757ee16f1..625bbbbb2608 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -5,6 +5,7 @@ #include "discard.h" #include "transaction.h" #include "space-info.h" +#include "super.h" #ifdef CONFIG_PRINTK diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 2aea2a17ed95..8fda1949b71b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -19,6 +19,7 @@ #include "qgroup.h" #include "subpage.h" #include "file.h" +#include "super.h" static struct kmem_cache *btrfs_ordered_extent_cache; diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 9ad15d69718c..0755af0e53e3 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -14,6 +14,7 @@ 
#include "space-info.h" #include "fs.h" #include "accessors.h" +#include "super.h" #define BTRFS_PROP_HANDLERS_HT_BITS 8 static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 3c962b5d8dbd..9d728107536e 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -14,6 +14,7 @@ #include "accessors.h" #include "file-item.h" #include "file.h" +#include "super.h" #define BTRFS_MAX_DEDUPE_LEN SZ_16M diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f31a97d4f9ad..d119986d1599 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -34,6 +34,7 @@ #include "root-tree.h" #include "file-item.h" #include "relocation.h" +#include "super.h" /* * Relocation overview diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h index c8875653e628..8dbb909b364f 100644 --- a/fs/btrfs/super.h +++ b/fs/btrfs/super.h @@ -9,4 +9,21 @@ int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); +static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline void btrfs_set_sb_rdonly(struct super_block *sb) +{ + sb->s_flags |= SB_RDONLY; + set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + +static inline void btrfs_clear_sb_rdonly(struct super_block *sb) +{ + sb->s_flags &= ~SB_RDONLY; + clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); +} + #endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5606060770eb..1c9f7a946657 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -39,6 +39,7 @@ #include "ioctl.h" #include "relocation.h" #include "scrub.h" +#include "super.h" static struct bio_set btrfs_bioset; From aa5d3003ddee8d7c5c517db072f888e114ff1529 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Oct 2022 15:08:41 -0400 Subject: [PATCH 123/249] btrfs: move orphan prototypes into orphan.h Move these out of ctree.h into orphan.h to cut down on code in ctree.h. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 6 ------ fs/btrfs/extent-tree.c | 1 + fs/btrfs/inode.c | 1 + fs/btrfs/orphan.c | 1 + fs/btrfs/orphan.h | 11 +++++++++++ fs/btrfs/root-tree.c | 1 + fs/btrfs/tree-log.c | 1 + fs/btrfs/verity.c | 1 + 8 files changed, 17 insertions(+), 6 deletions(-) create mode 100644 fs/btrfs/orphan.h diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c32f6b6ae972..5649f8907984 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -676,12 +676,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) } int btrfs_leaf_free_space(struct extent_buffer *leaf); -/* orphan.c */ -int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset); -int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 offset); - /* * Get the correct offset inside the page of extent buffer. 
 *
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 940d4fe23cfb..b037107678c8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -41,6 +41,7 @@
 #include "extent-tree.h"
 #include "root-tree.h"
 #include "file-item.h"
+#include "orphan.h"
 
 #undef SCRAMBLE_DELAYED_REFS
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 83e5ae6b74ef..9abed40ade11 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -69,6 +69,7 @@
 #include "relocation.h"
 #include "verity.h"
 #include "super.h"
+#include "orphan.h"
 
 struct btrfs_iget_args {
 	u64 ino;
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index aa534108c1e2..7a1b021b5669 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -5,6 +5,7 @@
 
 #include "ctree.h"
 #include "disk-io.h"
+#include "orphan.h"
 
 int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 offset)
diff --git a/fs/btrfs/orphan.h b/fs/btrfs/orphan.h
new file mode 100644
index 000000000000..3faab5cbb59a
--- /dev/null
+++ b/fs/btrfs/orphan.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef BTRFS_ORPHAN_H
+#define BTRFS_ORPHAN_H
+
+int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 offset);
+int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, u64 offset);
+
+#endif
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 42f046e5e25f..859874579456 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -15,6 +15,7 @@
 #include "space-info.h"
 #include "accessors.h"
 #include "root-tree.h"
+#include "orphan.h"
 
 /*
  * Read a root item from the tree. In case we detect a root item smaller then
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1c505713511c..b6e99ef99679 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -28,6 +28,7 @@
 #include "dir-item.h"
 #include "file-item.h"
 #include "file.h"
+#include "orphan.h"
 
 #define MAX_CONFLICT_INODES 10
 
diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c
index b31d6c7627ff..bf9eb693a6a7 100644
--- a/fs/btrfs/verity.c
+++ b/fs/btrfs/verity.c
@@ -20,6 +20,7 @@
 #include "accessors.h"
 #include "ioctl.h"
 #include "verity.h"
+#include "orphan.h"
 
 /*
  * Implementation of the interface defined in struct fsverity_operations.

From d52a1365258b5781c2635937afa91af66322d981 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Fri, 16 Sep 2022 15:28:35 +0800
Subject: [PATCH 124/249] btrfs: selftests: remove impossible inline extent at
 non-zero file offset

In our inode-tests.c, we create an inline extent at file offset 5, which
is no longer possible since the introduction of tree-checker.

Thus I don't think we should spend time maintaining some corner cases
which are already ruled out by tree-checker.

So this patch will:

- Change the inline extent to start at file offset 0
  Also change its length to 6 to cover the original length

- Add an extra ASSERT() for btrfs_add_extent_mapping()
  This is to make sure tree-checker is working correctly.
- Update the inode selftest Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 7 +++++ fs/btrfs/tests/inode-tests.c | 57 ++++++++++++------------------------ 2 files changed, 26 insertions(+), 38 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index cd45303f344d..4a4362f5cc52 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -638,6 +638,13 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, int ret; struct extent_map *em = *em_in; + /* + * Tree-checker should have rejected any inline extent with non-zero + * file offset. Here just do a sanity check. + */ + if (em->block_start == EXTENT_MAP_INLINE) + ASSERT(em->start == 0); + ret = add_extent_mapping(em_tree, em, 0); /* it is possible that someone inserted the extent into the tree * while we had the lock dropped. It is also possible that diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 0a34a54ea9fd..05b03f5eab83 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -73,8 +73,8 @@ static void insert_inode_item_key(struct btrfs_root *root) * diagram of how the extents will look though this may not be possible we still * want to make sure everything acts normally (the last number is not inclusive) * - * [0 - 5][5 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] - * [hole ][inline][hole but no extent][ hole ][ regular ][regular1 split] + * [0 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291] + * [inline][hole but no extent][ hole ][ regular ][regular1 split] * * [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ] * [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written] @@ -91,19 +91,12 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) u64 disk_bytenr = SZ_1M; u64 offset = 0; - /* First we want a hole */ - insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0, - slot); - slot++; - offset += 5; - /* - * Now we want an inline extent, I don't think this is possible but hey - * why not? Also keep in mind if we have an inline extent it counts as - * the whole first page. If we were to expand it we would have to cow - * and we wouldn't have an inline extent anymore. + * Tree-checker has strict limits on inline extents that they can only + * exist at file offset 0, thus we can only have one inline file extent + * at most. 
*/ - insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, + insert_extent(root, offset, 6, 6, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0, slot); slot++; offset = sectorsize; @@ -282,37 +275,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("got an error when we shouldn't have"); goto out; } - if (em->block_start != EXTENT_MAP_HOLE) { - test_err("expected a hole, got %llu", em->block_start); - goto out; - } - if (em->start != 0 || em->len != 5) { - test_err( - "unexpected extent wanted start 0 len 5, got start %llu len %llu", - em->start, em->len); - goto out; - } - if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); - goto out; - } - offset = em->start + em->len; - free_extent_map(em); - - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); - if (IS_ERR(em)) { - test_err("got an error when we shouldn't have"); - goto out; - } if (em->block_start != EXTENT_MAP_INLINE) { test_err("expected an inline, got %llu", em->block_start); goto out; } - if (em->start != offset || em->len != (sectorsize - 5)) { + /* + * For inline extent, we always round up the em to sectorsize, as + * they are either: + * + * a) a hidden hole + * The range will be zeroed at inline extent read time. + * + * b) a file extent with unaligned bytenr + * Tree checker will reject it. + */ + if (em->start != 0 || em->len != sectorsize) { test_err( - "unexpected extent wanted start %llu len 1, got start %llu len %llu", - offset, em->start, em->len); + "unexpected extent wanted start 0 len %u, got start %llu len %llu", + sectorsize, em->start, em->len); goto out; } if (em->flags != 0) { From affc5424338682641829cc84293ec90f36425034 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 16 Sep 2022 15:28:36 +0800 Subject: [PATCH 125/249] btrfs: make inline extent read calculation much simpler Currently we calculate inline extent read in a way that inline extent can start at non-zero offset. This is consistent with the inode selftests, which puts an inline extent at file offset 5. Meanwhile the inline extent creation code will only create inline extent at file offset 0. Furthermore with the introduction of tree-checker on file extents, we are actively rejecting inline extent which starts at non-zero file offset. And so far we haven't yet seen any report of rejected inline extents at non-zero file offset. This all means, the extra calculation to support inline extents at non-zero file offset is mostly paper weight, and damaging the readability of the code. Thus this patch will: - Add extra ASSERT()s to make sure involved file offset are all 0 - Remove @extent_offset calculation - Simplify the involved code As several variables are now single-use, no need to declare them as a variable anymore. 
Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/inode.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9abed40ade11..a374dd718315 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7064,41 +7064,43 @@ next:
 		   extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 		goto insert;
 	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
-		unsigned long ptr;
 		char *map;
-		size_t size;
-		size_t extent_offset;
 		size_t copy_size;
 
 		if (!page)
 			goto out;
 
-		size = btrfs_file_extent_ram_bytes(leaf, item);
-		extent_offset = page_offset(page) + pg_offset - extent_start;
-		copy_size = min_t(u64, PAGE_SIZE - pg_offset,
-				  size - extent_offset);
-		em->start = extent_start + extent_offset;
+		/*
+		 * Inline extent can only exist at file offset 0. This is
+		 * ensured by tree-checker and inline extent creation path.
+		 * Thus all members representing file offsets should be zero.
+		 */
+		ASSERT(page_offset(page) == 0);
+		ASSERT(pg_offset == 0);
+		ASSERT(extent_start == 0);
+		ASSERT(em->start == 0);
+
+		copy_size = min_t(u64, PAGE_SIZE,
+				  btrfs_file_extent_ram_bytes(leaf, item));
+		em->start = extent_start;
 		em->len = ALIGN(copy_size, fs_info->sectorsize);
 		em->orig_block_len = em->len;
 		em->orig_start = em->start;
-		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
 
 		if (!PageUptodate(page)) {
 			if (btrfs_file_extent_compression(leaf, item) !=
 			    BTRFS_COMPRESS_NONE) {
-				ret = uncompress_inline(path, page, pg_offset,
-							extent_offset, item);
+				ret = uncompress_inline(path, page, 0, 0, item);
 				if (ret)
 					goto out;
 			} else {
 				map = kmap_local_page(page);
-				read_extent_buffer(leaf, map + pg_offset, ptr,
-						   copy_size);
-				if (pg_offset + copy_size < PAGE_SIZE) {
-					memset(map + pg_offset + copy_size, 0,
-					       PAGE_SIZE - pg_offset -
-					       copy_size);
-				}
+				read_extent_buffer(leaf, map,
+					btrfs_file_extent_inline_start(item),
+					copy_size);
+				if (copy_size < PAGE_SIZE)
+					memset(map + copy_size, 0,
+					       PAGE_SIZE - copy_size);
 				kunmap_local(map);
 			}
 			flush_dcache_page(page);

From a196a8944f77b7b762795a0862d8aa4a005625a4 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Fri, 16 Sep 2022 15:28:37 +0800
Subject: [PATCH 126/249] btrfs: do not reset extent map members for inline
 extents read

Currently for inline extents read inside btrfs_get_extent(), we will
reset several extent map members:

- em->start
  Reset to extent_start, which is completely unnecessary. Both
  extent_start and em->start should already be zero, as ensured by
  tree-checker.

- em->len
  Reset to round_up(copy_size, fs_info->sectorsize), which is again
  unnecessary.

- em->orig_block_len
  Reset to em->len (sectorsize), while it is originally unset from
  btrfs_extent_item_to_extent_map(). This makes no difference, as all
  extent map handling paths will ignore orig_block_len if they find
  it's an inlined extent. One example of orig_block_len being ignored
  for inline extents can be found in btrfs_drop_extent_cache().

- em->orig_start
  Reset to em->start (0), while it is originally set to
  EXTENT_MAP_HOLE. This makes no difference either, as all extent map
  handling paths will ignore em->orig_start if they find it's an inline
  extent.

Thus all this resetting of em members is unnecessary. Replace it with
ASSERT()s checking the only two members (block_start and length) that
make sense.
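For reference, roughly what btrfs_extent_item_to_extent_map() already leaves in place for an inline extent (a condensed sketch, not a complete listing; the em->len value is the one the new ASSERT below checks):

	em->block_start = EXTENT_MAP_INLINE;
	em->start = 0;			/* file offset 0, enforced by tree-checker */
	em->len = fs_info->sectorsize;	/* inline data covers at most one sector */
	em->orig_start = EXTENT_MAP_HOLE;
	em->block_len = (u64)-1;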
Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/inode.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a374dd718315..5b45eb141215 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7082,10 +7082,15 @@ next:
 		copy_size = min_t(u64, PAGE_SIZE,
 				  btrfs_file_extent_ram_bytes(leaf, item));
-		em->start = extent_start;
-		em->len = ALIGN(copy_size, fs_info->sectorsize);
-		em->orig_block_len = em->len;
-		em->orig_start = em->start;
+
+		/*
+		 * btrfs_extent_item_to_extent_map() should have properly
+		 * initialized em members already.
+		 *
+		 * Other members are not utilized for inline extents.
+		 */
+		ASSERT(em->block_start == EXTENT_MAP_INLINE);
+		ASSERT(em->len == fs_info->sectorsize);
+
 		if (!PageUptodate(page)) {
 			if (btrfs_file_extent_compression(leaf, item) !=

From 280f15cb96a61e4122bb28cdc316343ff4918b7d Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Fri, 16 Sep 2022 15:28:38 +0800
Subject: [PATCH 127/249] btrfs: remove new_inline argument from
 btrfs_extent_item_to_extent_map()

The argument @new_inline changes the following members of extent_map:

- em->compress_type
- EXTENT_FLAG_COMPRESSED of em->flags

However neither member makes a difference for inline extents:

- Inline extent read never uses the above em members
  As inside btrfs_get_extent() we directly use the file extent item to
  do the read.

- Inline extents are never to be split
  Thus code that checks em->compress_type or that flag will never be
  executed on inlined extents (btrfs_drop_extent_cache() would be one
  example).

- Fiemap no longer relies on extent maps
  The recent fiemap optimization makes fiemap search the subvolume tree
  directly, without using any extent map at all.

Thus those members make no difference for inline extents any more.
Furthermore, such an exception without much explanation is a real
source of confusion.

Thus this patch will completely remove the argument, and always set the
involved members, unifying the behavior.
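The interface change at a glance (condensed from the header diff below):

/* Before: */
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
				     const struct btrfs_path *path,
				     struct btrfs_file_extent_item *fi,
				     const bool new_inline,
				     struct extent_map *em);

/* After: */
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
				     const struct btrfs_path *path,
				     struct btrfs_file_extent_item *fi,
				     struct extent_map *em);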
Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/defrag.c    | 2 +-
 fs/btrfs/file-item.c | 6 ++----
 fs/btrfs/file-item.h | 1 -
 fs/btrfs/inode.c     | 2 +-
 4 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 48a9460d49c7..919dfe0f7e50 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -582,7 +582,7 @@ iterate:
 			goto next;
 
 		/* Now this extent covers @start, convert it to em */
-		btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
+		btrfs_extent_item_to_extent_map(inode, &path, fi, em);
 		break;
 next:
 		ret = btrfs_next_item(root, &path);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 036d50af5666..456f71b42a9c 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -1214,7 +1214,6 @@ out:
 void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
 				     const struct btrfs_path *path,
 				     struct btrfs_file_extent_item *fi,
-				     const bool new_inline,
 				     struct extent_map *em)
 {
 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1266,10 +1265,9 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
 		 */
 		em->orig_start = EXTENT_MAP_HOLE;
 		em->block_len = (u64)-1;
-		if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) {
+		em->compress_type = compress_type;
+		if (compress_type != BTRFS_COMPRESS_NONE)
 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
-			em->compress_type = compress_type;
-		}
 	} else {
 		btrfs_err(fs_info,
 			  "unknown file extent item type %d, inode %llu, offset %llu, "
diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h
index 51cd3ab5948c..ba12711cf933 100644
--- a/fs/btrfs/file-item.h
+++ b/fs/btrfs/file-item.h
@@ -24,7 +24,6 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
 				     const struct btrfs_path *path,
 				     struct btrfs_file_extent_item *fi,
-				     const bool new_inline,
 				     struct extent_map *em);
 int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
 					u64 len);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5b45eb141215..0ab899132dbf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7058,7 +7058,7 @@ next:
 		goto insert;
 	}
 
-	btrfs_extent_item_to_extent_map(inode, path, item, !page, em);
+	btrfs_extent_item_to_extent_map(inode, path, item, em);
 
 	if (extent_type == BTRFS_FILE_EXTENT_REG ||
 	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {

From a982fc822001e26930b502b16534f6bbd42b232a Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Fri, 16 Sep 2022 15:28:39 +0800
Subject: [PATCH 128/249] btrfs: extract the inline extent read code into its
 own function

Currently we have the inline extent read code behind two levels of
indentation; factor it out into a new function, read_inline_extent(),
to make it a little easier to read.

Since we're here, also remove the @extent_offset and @pg_offset
arguments from the uncompress_inline() function, as it's not possible
to have inline extents at a non-zero file offset.
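The call site then collapses to a single call. A short summary of the extracted helper's contract, as implemented in the diff below:

	/*
	 * read_inline_extent(inode, path, page):
	 * - returns 0 immediately if there is no page or it is already uptodate
	 * - goes through uncompress_inline() for compressed inline extents
	 * - otherwise copies up to PAGE_SIZE bytes of inline data and
	 *   zero-fills the rest of the page
	 */
	ret = read_inline_extent(inode, path, page);
	if (ret < 0)
		goto out;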
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/inode.c | 69 ++++++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0ab899132dbf..1422f072a7b8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6865,7 +6865,6 @@ static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, static noinline int uncompress_inline(struct btrfs_path *path, struct page *page, - size_t pg_offset, u64 extent_offset, struct btrfs_file_extent_item *item) { int ret; @@ -6876,7 +6875,6 @@ static noinline int uncompress_inline(struct btrfs_path *path, unsigned long ptr; int compress_type; - WARN_ON(pg_offset != 0); compress_type = btrfs_file_extent_compression(leaf, item); max_size = btrfs_file_extent_ram_bytes(leaf, item); inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); @@ -6888,8 +6886,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); max_size = min_t(unsigned long, PAGE_SIZE, max_size); - ret = btrfs_decompress(compress_type, tmp, page, - extent_offset, inline_size, max_size); + ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size); /* * decompression code contains a memset to fill in any space between the end @@ -6899,13 +6896,40 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. */ - if (max_size + pg_offset < PAGE_SIZE) - memzero_page(page, pg_offset + max_size, - PAGE_SIZE - max_size - pg_offset); + if (max_size < PAGE_SIZE) + memzero_page(page, max_size, PAGE_SIZE - max_size); kfree(tmp); return ret; } +static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, + struct page *page) +{ + struct btrfs_file_extent_item *fi; + void *kaddr; + size_t copy_size; + + if (!page || PageUptodate(page)) + return 0; + + ASSERT(page_offset(page) == 0); + + fi = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) + return uncompress_inline(path, page, fi); + + copy_size = min_t(u64, PAGE_SIZE, + btrfs_file_extent_ram_bytes(path->nodes[0], fi)); + kaddr = kmap_local_page(page); + read_extent_buffer(path->nodes[0], kaddr, + btrfs_file_extent_inline_start(fi), copy_size); + kunmap_local(kaddr); + if (copy_size < PAGE_SIZE) + memzero_page(page, copy_size, PAGE_SIZE - copy_size); + return 0; +} + /* * Lookup the first extent overlapping a range in a file. * @@ -7064,25 +7088,15 @@ next: extent_type == BTRFS_FILE_EXTENT_PREALLOC) { goto insert; } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - char *map; - size_t copy_size; - - if (!page) - goto out; - /* * Inline extent can only exist at file offset 0. This is * ensured by tree-checker and inline extent creation path. * Thus all members representing file offsets should be zero. */ - ASSERT(page_offset(page) == 0); ASSERT(pg_offset == 0); ASSERT(extent_start == 0); ASSERT(em->start == 0); - copy_size = min_t(u64, PAGE_SIZE, - btrfs_file_extent_ram_bytes(leaf, item)); - /* * btrfs_extent_item_to_extent_map() should have properly * initialized em members already. 
@@ -7092,24 +7106,9 @@ next:
 		ASSERT(em->block_start == EXTENT_MAP_INLINE);
 		ASSERT(em->len == fs_info->sectorsize);
 
-		if (!PageUptodate(page)) {
-			if (btrfs_file_extent_compression(leaf, item) !=
-			    BTRFS_COMPRESS_NONE) {
-				ret = uncompress_inline(path, page, 0, 0, item);
-				if (ret)
-					goto out;
-			} else {
-				map = kmap_local_page(page);
-				read_extent_buffer(leaf, map,
-					btrfs_file_extent_inline_start(item),
-					copy_size);
-				if (copy_size < PAGE_SIZE)
-					memset(map + copy_size, 0,
-					       PAGE_SIZE - copy_size);
-				kunmap_local(map);
-			}
-			flush_dcache_page(page);
-		}
+		ret = read_inline_extent(inode, path, page);
+		if (ret < 0)
+			goto out;
 		goto insert;
 	}
 not_found:

From c30ff698da87aeabb459cca75fe4a77a858609fb Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Thu, 27 Oct 2022 18:12:04 +0200
Subject: [PATCH 129/249] btrfs: fix SPDX comment in tree-mod-log.h

The header files should use the /* */ comment style, introduced in
commit f3a84ccd28d0 ("btrfs: move the tree mod log code into its own
file").

Signed-off-by: David Sterba
---
 fs/btrfs/tree-mod-log.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h
index 8cffe0bc2a39..94f10afeee97 100644
--- a/fs/btrfs/tree-mod-log.h
+++ b/fs/btrfs/tree-mod-log.h
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+/* SPDX-License-Identifier: GPL-2.0 */
 
 #ifndef BTRFS_TREE_MOD_LOG_H
 #define BTRFS_TREE_MOD_LOG_H

From 20af93d97f46a824647326278df7d377ba3269ff Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Mon, 31 Oct 2022 11:43:56 +0000
Subject: [PATCH 130/249] btrfs: update stale comment for nowait direct IO
 writes

If when doing a direct IO write we need to fallback to buffered IO, we
have this comment at btrfs_direct_write() that says we can't directly
fallback to buffered IO if we have a NOWAIT iocb, because we have no
support for NOWAIT buffered writes. That is not true anymore, as support
for NOWAIT buffered writes was added recently in commit 926078b21db9
("btrfs: enable nowait async buffered writes").

However we still can't fallback to a buffered write in case we have a
NOWAIT iocb, because we'll need to flush delalloc and wait for it to
complete after doing the buffered write, and that can block for several
reasons, the main reason being waiting for IO to complete. So update the
comment to mention all that.

Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f9bea57abbde..f8be9d629e75 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1575,8 +1575,8 @@ buffered:
 	/*
 	 * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
 	 * it must retry the operation in a context where blocking is acceptable,
-	 * since we currently don't have NOWAIT semantics support for buffered IO
-	 * and may block there for many reasons (reserving space for example).
+	 * because even if we end up not blocking during the buffered IO attempt
+	 * below, we will block when flushing and waiting for the IO.
 	 */
 	if (iocb->ki_flags & IOCB_NOWAIT) {
 		err = -EAGAIN;

From 428c8e03109e717d90e0d1329dc3d926b9421ad3 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Wed, 26 Oct 2022 23:25:14 +0200
Subject: [PATCH 131/249] btrfs: simplify percent calculation helpers, rename
 div_factor

The div_factor* helpers calculate a fraction, in tenths or as a
percentage. The name is a bit confusing: we use them only for
percentage calculations, and there are two helpers.
There's a helper, mult_frac, for general fractions, which tries to be
accurate, but we multiply and divide by small numbers, so we can use
the div_u64 helper instead.

Rename the div_factor* helpers to use a 1..100 percentage range, and
drop the case checking for percentage == 100, as it's never hit.

The conversions:

* div_factor calculates tenths and the numbers need to be adjusted

* div_factor_fine is a direct replacement

Signed-off-by: David Sterba
---
 fs/btrfs/block-group.c      |  6 +++---
 fs/btrfs/block-rsv.c        |  4 ++--
 fs/btrfs/block-rsv.h        |  2 +-
 fs/btrfs/free-space-cache.c |  3 +--
 fs/btrfs/misc.h             | 16 ++--------------
 fs/btrfs/space-info.c       |  4 ++--
 fs/btrfs/sysfs.c            |  2 +-
 fs/btrfs/transaction.c      |  2 +-
 fs/btrfs/volumes.c          | 12 +++++-------
 9 files changed, 18 insertions(+), 33 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index cfe495d39ebd..ea5fcb665daa 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1551,7 +1551,7 @@ static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_f
 	if (reclaim_thresh == 0)
 		return false;
 
-	thresh = div_factor_fine(bg->length, reclaim_thresh);
+	thresh = mult_perc(bg->length, reclaim_thresh);
 
 	/*
 	 * If we were below the threshold before don't reclaim, we are likely a
@@ -3523,13 +3523,13 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
 	 */
 	if (force == CHUNK_ALLOC_LIMITED) {
 		thresh = btrfs_super_total_bytes(fs_info->super_copy);
-		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
+		thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
 
 		if (sinfo->total_bytes - bytes_used < thresh)
 			return 1;
 	}
 
-	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
+	if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
 		return 0;
 	return 1;
 }
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 29705256727f..5367a14d44d2 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -227,7 +227,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
 	return ret;
 }
 
-int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent)
 {
 	u64 num_bytes = 0;
 	int ret = -ENOSPC;
@@ -236,7 +236,7 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
 		return 0;
 
 	spin_lock(&block_rsv->lock);
-	num_bytes = div_factor(block_rsv->size, min_factor);
+	num_bytes = mult_perc(block_rsv->size, min_percent);
 	if (block_rsv->reserved >= num_bytes)
 		ret = 0;
 	spin_unlock(&block_rsv->lock);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index e2cf0dd43203..4cc41c9aaa82 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -63,7 +63,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
 int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
 			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
 			enum btrfs_reserve_flush_enum flush);
-int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent);
 int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
 			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
 			   enum btrfs_reserve_flush_enum flush);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index aef075b63188..627bd6120368 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2726,8 +2726,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
 		btrfs_mark_bg_unused(block_group);
 	} else if (bg_reclaim_threshold &&
reclaimable_unusable >= - div_factor_fine(block_group->zone_capacity, - bg_reclaim_threshold)) { + mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) { btrfs_mark_bg_to_reclaim(block_group); } diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 69826ccd6644..768583a440e1 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -40,22 +40,10 @@ static inline void cond_wake_up_nomb(struct wait_queue_head *wq) wake_up(wq); } -static inline u64 div_factor(u64 num, int factor) +static inline u64 mult_perc(u64 num, u32 percent) { - if (factor == 10) - return num; - num *= factor; - return div_u64(num, 10); + return div_u64(num * percent, 100); } - -static inline u64 div_factor_fine(u64 num, int factor) -{ - if (factor == 100) - return num; - num *= factor; - return div_u64(num, 100); -} - /* Copy of is_power_of_two that is 64bit safe */ static inline bool is_power_of_two_u64(u64 n) { diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 11e5b5a1eb5a..d28ee4e36f3d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -859,7 +859,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, u64 thresh; u64 used; - thresh = div_factor_fine(total, 90); + thresh = mult_perc(total, 90); lockdep_assert_held(&space_info->lock); @@ -977,7 +977,7 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info, return false; spin_lock(&global_rsv->lock); - min_bytes = div_factor(global_rsv->size, 1); + min_bytes = mult_perc(global_rsv->size, 10); if (global_rsv->reserved < min_bytes + ticket->bytes) { spin_unlock(&global_rsv->lock); return false; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a14812f1254a..45615ce36498 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -764,7 +764,7 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj, val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE); /* Limit stripe size to 10% of available space. */ - val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val); + val = min(mult_perc(fs_info->fs_devices->total_rw_bytes, 10), val); /* Must be multiple of 256M. 
 	 */
 	val &= ~((u64)SZ_256M - 1);
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 593b2414e2be..2e2dd2ea109b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -949,7 +949,7 @@ static bool should_end_transaction(struct btrfs_trans_handle *trans)
 	if (btrfs_check_space_for_delayed_refs(fs_info))
 		return true;
 
-	return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
+	return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 50);
 }
 
 bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 1c9f7a946657..7751ab620761 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3614,16 +3614,14 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off
 	if (bargs->usage_min == 0)
 		user_thresh_min = 0;
 	else
-		user_thresh_min = div_factor_fine(cache->length,
-						  bargs->usage_min);
+		user_thresh_min = mult_perc(cache->length, bargs->usage_min);
 
 	if (bargs->usage_max == 0)
 		user_thresh_max = 1;
 	else if (bargs->usage_max > 100)
 		user_thresh_max = cache->length;
 	else
-		user_thresh_max = div_factor_fine(cache->length,
-						  bargs->usage_max);
+		user_thresh_max = mult_perc(cache->length, bargs->usage_max);
 
 	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
 		ret = 0;
@@ -3647,7 +3645,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
 	else if (bargs->usage > 100)
 		user_thresh = cache->length;
 	else
-		user_thresh = div_factor_fine(cache->length, bargs->usage);
+		user_thresh = mult_perc(cache->length, bargs->usage);
 
 	if (chunk_used < user_thresh)
 		ret = 0;
@@ -5113,7 +5111,7 @@ static void init_alloc_chunk_ctl_policy_regular(
 	ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);
 
 	/* We don't want a chunk larger than 10% of writable space */
-	ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
+	ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10),
 				  ctl->max_chunk_size);
 	ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
 }
@@ -5144,7 +5142,7 @@ static void init_alloc_chunk_ctl_policy_zoned(
 	}
 
 	/* We don't want a chunk larger than 10% of writable space */
-	limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
+	limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10),
 			       zone_size),
 		    min_chunk_size);
 	ctl->max_chunk_size = min(limit, ctl->max_chunk_size);

From 8ec8519b47897d01a7e8c5ad95734529a3dd60bf Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Thu, 27 Oct 2022 13:07:03 +0200
Subject: [PATCH 132/249] btrfs: switch extent_page_data bit fields to bools

The semantics of the two members are boolean, so change the type
accordingly. We have space in extent_page_data due to alignment, so
there's no change in size.
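As a standalone illustration (not from the kernel tree) of why bool matches boolean semantics better than a one-bit field:

#include <stdbool.h>

struct with_bits { unsigned int flag:1; };
struct with_bool { bool flag; };

int main(void)
{
	struct with_bits a = { 0 };
	struct with_bool b = { 0 };

	a.flag = 2;	/* 1-bit field keeps only the low bit: stores 0 */
	b.flag = 2;	/* bool normalizes any non-zero value: stores 1 */
	return a.flag + b.flag;	/* 0 + 1 */
}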
Signed-off-by: David Sterba
---
 fs/btrfs/extent_io.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index bda420d697f4..c1294a6de88a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -109,10 +109,10 @@ struct extent_page_data {
 	/* tells writepage not to lock the state bits for this range
 	 * it still does the unlocking
 	 */
-	unsigned int extent_locked:1;
+	bool extent_locked;
 
 	/* tells the submit_bio code to use REQ_SYNC */
-	unsigned int sync_io:1;
+	bool sync_io;
 };
 
 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)

From ee5f017dccc79e5a9b442ca473631a53e1e7e376 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Thu, 27 Oct 2022 13:07:05 +0200
Subject: [PATCH 133/249] btrfs: merge struct extent_page_data to
 btrfs_bio_ctrl

The two structures appear on the same call paths: btrfs_bio_ctrl is
embedded in extent_page_data, and we pass bio_ctrl to some functions.
After merging there are fewer indirections and we have only one control
structure. The packing remains the same.

The btrfs_bio_ctrl was selected as the target structure as the
operation is closer to bio processing.

Structure layout:

struct btrfs_bio_ctrl {
	struct bio *bio;				/*  0  8 */
	int mirror_num;					/*  8  4 */
	enum btrfs_compression_type compress_type;	/* 12  4 */
	u32 len_to_stripe_boundary;			/* 16  4 */
	u32 len_to_oe_boundary;				/* 20  4 */
	btrfs_bio_end_io_t end_io_func;			/* 24  8 */
	bool extent_locked;				/* 32  1 */
	bool sync_io;					/* 33  1 */

	/* size: 40, cachelines: 1, members: 8 */
	/* padding: 6 */
	/* last cacheline: 40 bytes */
};

Signed-off-by: David Sterba
---
 fs/btrfs/extent_io.c | 115 +++++++++++++++++++++----------------------
 1 file changed, 55 insertions(+), 60 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c1294a6de88a..2ec989b83f54 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -102,16 +102,14 @@ struct btrfs_bio_ctrl {
 	u32 len_to_stripe_boundary;
 	u32 len_to_oe_boundary;
 	btrfs_bio_end_io_t end_io_func;
-};
 
-struct extent_page_data {
-	struct btrfs_bio_ctrl bio_ctrl;
-	/* tells writepage not to lock the state bits for this range
-	 * it still does the unlocking
+	/*
+	 * Tell writepage not to lock the state bits for this range, it still
+	 * does the unlocking.
 	 */
 	bool extent_locked;
 
-	/* tells the submit_bio code to use REQ_SYNC */
+	/* Tell the submit_bio code to use REQ_SYNC */
 	bool sync_io;
 };
 
@@ -148,11 +146,11 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
 }
 
 /*
- * Submit or fail the current bio in an extent_page_data structure.
*/ -static void submit_write_bio(struct extent_page_data *epd, int ret) +static void submit_write_bio(struct btrfs_bio_ctrl *bio_ctrl, int ret) { - struct bio *bio = epd->bio_ctrl.bio; + struct bio *bio = bio_ctrl->bio; if (!bio) return; @@ -161,9 +159,9 @@ static void submit_write_bio(struct extent_page_data *epd, int ret) ASSERT(ret < 0); btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); /* The bio is owned by the end_io handler now */ - epd->bio_ctrl.bio = NULL; + bio_ctrl->bio = NULL; } else { - submit_one_bio(&epd->bio_ctrl); + submit_one_bio(bio_ctrl); } } @@ -2071,7 +2069,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd, + struct btrfs_bio_ctrl *bio_ctrl, loff_t i_size, int *nr_ret) { @@ -2103,7 +2101,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ wbc->nr_to_write--; - epd->bio_ctrl.end_io_func = end_bio_extent_writepage; + bio_ctrl->end_io_func = end_bio_extent_writepage; while (cur <= end) { u64 disk_bytenr; u64 em_end; @@ -2197,7 +2195,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, btrfs_page_clear_dirty(fs_info, page, cur, iosize); ret = submit_extent_page(op | write_flags, wbc, - &epd->bio_ctrl, disk_bytenr, + bio_ctrl, disk_bytenr, page, iosize, cur - page_offset(page), 0, false); @@ -2237,7 +2235,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * Return <0 for error. */ static int __extent_writepage(struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; @@ -2274,7 +2272,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done; } - if (!epd->extent_locked) { + if (!bio_ctrl->extent_locked) { ret = writepage_delalloc(BTRFS_I(inode), page, wbc); if (ret == 1) return 0; @@ -2282,7 +2280,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, goto done; } - ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size, + ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, bio_ctrl, i_size, &nr); if (ret == 1) return 0; @@ -2326,9 +2324,9 @@ done: */ if (PageError(page)) end_extent_writepage(page, ret, page_start, page_end); - if (epd->extent_locked) { + if (bio_ctrl->extent_locked) { /* - * If epd->extent_locked, it's from extent_write_locked_range(), + * If bio_ctrl->extent_locked, it's from extent_write_locked_range(), * the page can either be locked by lock_page() or * process_one_page(). * Let btrfs_page_unlock_writer() handle both cases. @@ -2367,7 +2365,7 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) * Return <0 if something went wrong, no page is locked. 
*/ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = eb->fs_info; int i, num_pages; @@ -2375,17 +2373,17 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); flush = 1; btrfs_tree_lock(eb); } if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) { btrfs_tree_unlock(eb); - if (!epd->sync_io) + if (!bio_ctrl->sync_io) return 0; if (!flush) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); flush = 1; } while (1) { @@ -2432,7 +2430,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - submit_write_bio(epd, 0); + submit_write_bio(bio_ctrl, 0); flush = 1; } lock_page(p); @@ -2672,7 +2670,7 @@ static void prepare_eb_write(struct extent_buffer *eb) */ static int write_one_subpage_eb(struct extent_buffer *eb, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page = eb->pages[0]; @@ -2692,10 +2690,10 @@ static int write_one_subpage_eb(struct extent_buffer *eb, if (no_dirty_ebs) clear_page_dirty_for_io(page); - epd->bio_ctrl.end_io_func = end_bio_subpage_eb_writepage; + bio_ctrl->end_io_func = end_bio_subpage_eb_writepage; ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, eb->start, page, eb->len, + bio_ctrl, eb->start, page, eb->len, eb->start - page_offset(page), 0, false); if (ret) { btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); @@ -2718,7 +2716,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { u64 disk_bytenr = eb->start; int i, num_pages; @@ -2727,7 +2725,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, prepare_eb_write(eb); - epd->bio_ctrl.end_io_func = end_bio_extent_buffer_writepage; + bio_ctrl->end_io_func = end_bio_extent_buffer_writepage; num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { @@ -2736,7 +2734,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, clear_page_dirty_for_io(p); set_page_writeback(p); ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, - &epd->bio_ctrl, disk_bytenr, p, + bio_ctrl, disk_bytenr, p, PAGE_SIZE, 0, 0, false); if (ret) { set_btree_ioerr(p, eb); @@ -2779,7 +2777,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, */ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd) + struct btrfs_bio_ctrl *bio_ctrl) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); int submitted = 0; @@ -2832,7 +2830,7 @@ static int submit_eb_subpage(struct page *page, if (!eb) continue; - ret = lock_extent_buffer_for_io(eb, epd); + ret = lock_extent_buffer_for_io(eb, bio_ctrl); if (ret == 0) { free_extent_buffer(eb); continue; @@ -2841,7 +2839,7 @@ static int submit_eb_subpage(struct page *page, free_extent_buffer(eb); goto cleanup; } - ret = write_one_subpage_eb(eb, wbc, epd); + ret = write_one_subpage_eb(eb, wbc, bio_ctrl); free_extent_buffer(eb); if (ret < 0) goto cleanup; @@ -2851,7 +2849,7 @@ static int submit_eb_subpage(struct page 
*page, cleanup: /* We hit error, end bio for the submitted extent buffers */ - submit_write_bio(epd, ret); + submit_write_bio(bio_ctrl, ret); return ret; } @@ -2876,7 +2874,7 @@ cleanup: * Return <0 for fatal error. */ static int submit_eb_page(struct page *page, struct writeback_control *wbc, - struct extent_page_data *epd, + struct btrfs_bio_ctrl *bio_ctrl, struct extent_buffer **eb_context) { struct address_space *mapping = page->mapping; @@ -2888,7 +2886,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return 0; if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return submit_eb_subpage(page, wbc, epd); + return submit_eb_subpage(page, wbc, bio_ctrl); spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { @@ -2931,7 +2929,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, *eb_context = eb; - ret = lock_extent_buffer_for_io(eb, epd); + ret = lock_extent_buffer_for_io(eb, bio_ctrl); if (ret <= 0) { btrfs_revert_meta_write_pointer(cache, eb); if (cache) @@ -2946,7 +2944,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, btrfs_schedule_zone_finish_bg(cache, eb); btrfs_put_block_group(cache); } - ret = write_one_eb(eb, wbc, epd); + ret = write_one_eb(eb, wbc, bio_ctrl); free_extent_buffer(eb); if (ret < 0) return ret; @@ -2957,10 +2955,9 @@ int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc) { struct extent_buffer *eb_context = NULL; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, + struct btrfs_bio_ctrl bio_ctrl = { .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, + .sync_io = (wbc->sync_mode == WB_SYNC_ALL), }; struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info; int ret = 0; @@ -3003,7 +3000,7 @@ retry: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - ret = submit_eb_page(page, wbc, &epd, &eb_context); + ret = submit_eb_page(page, wbc, &bio_ctrl, &eb_context); if (ret == 0) continue; if (ret < 0) { @@ -3064,7 +3061,7 @@ retry: ret = 0; if (!ret && BTRFS_FS_ERROR(fs_info)) ret = -EROFS; - submit_write_bio(&epd, ret); + submit_write_bio(&bio_ctrl, ret); btrfs_zoned_meta_io_unlock(fs_info); return ret; @@ -3073,9 +3070,9 @@ retry: /* * Walk the list of dirty pages of the given address space and write all of them. * - * @mapping: address space structure to write - * @wbc: subtract the number of written pages from *@wbc->nr_to_write - * @epd: holds context for the write, namely the bio + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @bio_ctrl: holds context for the write, namely the bio * * If a page is already under I/O, write_cache_pages() skips it, even * if it's dirty. 
 * This is desirable behaviour for memory-cleaning writeback,
@@ -3087,7 +3084,7 @@ retry:
  */
 static int extent_write_cache_pages(struct address_space *mapping,
 			     struct writeback_control *wbc,
-			     struct extent_page_data *epd)
+			     struct btrfs_bio_ctrl *bio_ctrl)
 {
 	struct inode *inode = mapping->host;
 	int ret = 0;
@@ -3168,7 +3165,7 @@ retry:
 			 * tmpfs file mapping
 			 */
 			if (!trylock_page(page)) {
-				submit_write_bio(epd, 0);
+				submit_write_bio(bio_ctrl, 0);
 				lock_page(page);
 			}
 
@@ -3179,7 +3176,7 @@ retry:
 
 			if (wbc->sync_mode != WB_SYNC_NONE) {
 				if (PageWriteback(page))
-					submit_write_bio(epd, 0);
+					submit_write_bio(bio_ctrl, 0);
 				wait_on_page_writeback(page);
 			}
 
@@ -3189,7 +3186,7 @@ retry:
 				continue;
 			}
 
-			ret = __extent_writepage(page, wbc, epd);
+			ret = __extent_writepage(page, wbc, bio_ctrl);
 			if (ret < 0) {
 				done = 1;
 				break;
@@ -3219,7 +3216,7 @@ retry:
 		 * page in our current bio, and thus deadlock, so flush the
 		 * write bio here.
 		 */
-		submit_write_bio(epd, 0);
+		submit_write_bio(bio_ctrl, 0);
 		goto retry;
 	}
 
@@ -3245,8 +3242,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
 	u64 cur = start;
 	unsigned long nr_pages;
 	const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
-	struct extent_page_data epd = {
-		.bio_ctrl = { 0 },
+	struct btrfs_bio_ctrl bio_ctrl = {
 		.extent_locked = 1,
 		.sync_io = 1,
 	};
@@ -3277,7 +3273,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
 		ASSERT(PageLocked(page));
 		ASSERT(PageDirty(page));
 		clear_page_dirty_for_io(page);
-		ret = __extent_writepage(page, &wbc_writepages, &epd);
+		ret = __extent_writepage(page, &wbc_writepages, &bio_ctrl);
 		ASSERT(ret <= 0);
 		if (ret < 0) {
 			found_error = true;
@@ -3287,7 +3283,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
 		cur = cur_end + 1;
 	}
 
-	submit_write_bio(&epd, found_error ? ret : 0);
+	submit_write_bio(&bio_ctrl, found_error ? ret : 0);
 
 	wbc_detach_inode(&wbc_writepages);
 	if (found_error)
@@ -3300,10 +3296,9 @@ int extent_writepages(struct address_space *mapping,
 {
 	struct inode *inode = mapping->host;
 	int ret = 0;
-	struct extent_page_data epd = {
-		.bio_ctrl = { 0 },
+	struct btrfs_bio_ctrl bio_ctrl = {
 		.extent_locked = 0,
-		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.sync_io = (wbc->sync_mode == WB_SYNC_ALL),
 	};
 
 	/*
@@ -3311,8 +3306,8 @@ int extent_writepages(struct address_space *mapping,
 	 * protect the write pointer updates.
 	 */
 	btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
-	ret = extent_write_cache_pages(mapping, wbc, &epd);
-	submit_write_bio(&epd, ret);
+	ret = extent_write_cache_pages(mapping, wbc, &bio_ctrl);
+	submit_write_bio(&bio_ctrl, ret);
 	btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
 	return ret;
 }

From 9c5ff9b42c1cb22823c94983b7d52121c559bf4d Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 1 Nov 2022 19:16:01 +0800
Subject: [PATCH 134/249] btrfs: raid56: extract the vertical stripe recovery
 code into recover_vertical()

This refactor includes the following behavior change first:

- Don't error out if only P/Q is corrupted
  The old code will directly error out if only P/Q is corrupted.
  Although it is a logical error if we go into the rebuild path with
  only P/Q corrupted, there is no need to error out.
  Just skip the rebuild and return the already good data.

Then come the following refactors, which shouldn't cause behavior
changes:

- Introduce a helper to do vertical stripe recovery
  This not only reduces one indent level, but also paves the road for
  later data checksum verification in RMW cycles.

- Sort rbio->faila/b before recovery
  So we don't need to do the same swap for every vertical stripe.

- Replace a BUG_ON() with ASSERT()
  Or checkpatch won't let me pass.

- Mark recovered sectors uptodate after the recover loop

- Do the cleanup for pointers unconditionally
  We only need to initialize @pointers and @unmap_array to NULL, so
  we can safely free them unconditionally.

- Mark the repaired sector uptodate in recover_vertical()

Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/raid56.c | 285 ++++++++++++++++++++++++----------------------
 1 file changed, 149 insertions(+), 136 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index ef272e7d932c..34d51d2ef746 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1886,6 +1886,144 @@ fail:
 	bio_endio(bio);
 }
 
+/*
+ * Recover a vertical stripe specified by @sector_nr.
+ * @*pointers are the pre-allocated pointers by the caller, so we don't
+ * need to allocate/free the pointers again and again.
+ */
+static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
+			     void **pointers, void **unmap_array)
+{
+	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+	struct sector_ptr *sector;
+	const u32 sectorsize = fs_info->sectorsize;
+	const int faila = rbio->faila;
+	const int failb = rbio->failb;
+	int stripe_nr;
+
+	/*
+	 * Now we just use bitmap to mark the horizontal stripes in
+	 * which we have data when doing parity scrub.
+	 */
+	if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
+	    !test_bit(sector_nr, &rbio->dbitmap))
+		return;
+
+	/*
+	 * Setup our array of pointers with sectors from each stripe
+	 *
+	 * NOTE: store a duplicate array of pointers to preserve the
+	 * pointer order.
+	 */
+	for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
+		/*
+		 * If we're rebuilding a read, we have to use
+		 * pages from the bio list
+		 */
+		if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+		     rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
+		    (stripe_nr == faila || stripe_nr == failb)) {
+			sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
+		} else {
+			sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
+		}
+		ASSERT(sector->page);
+		pointers[stripe_nr] = kmap_local_page(sector->page) +
+				      sector->pgoff;
+		unmap_array[stripe_nr] = pointers[stripe_nr];
+	}
+
+	/* All raid6 handling here */
+	if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
+		/* Single failure, rebuild from parity raid5 style */
+		if (failb < 0) {
+			if (faila == rbio->nr_data)
+				/*
+				 * Just the P stripe has failed, without
+				 * a bad data or Q stripe.
+				 * We have nothing to do, just skip the
+				 * recovery for this stripe.
+				 */
+				goto cleanup;
+			/*
+			 * a single failure in raid6 is rebuilt
+			 * in the pstripe code below
+			 */
+			goto pstripe;
+		}
+
+		/*
+		 * If the q stripe is failed, do a pstripe reconstruction from
+		 * the xors.
+		 * If both the q stripe and the P stripe are failed, we're
+		 * here due to a crc mismatch and we can't give them the
+		 * data they want.
+		 */
+		if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
+			if (rbio->bioc->raid_map[faila] ==
+			    RAID5_P_STRIPE)
+				/*
+				 * Only P and Q are corrupted.
+				 * We only care about data stripes recovery,
+				 * can skip this vertical stripe.
+				 */
+				goto cleanup;
+			/*
+			 * Otherwise we have one bad data stripe and
+			 * a good P stripe. raid5!
+			 */
+			goto pstripe;
+		}
+
+		if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
+			raid6_datap_recov(rbio->real_stripes, sectorsize,
+					  faila, pointers);
+		} else {
+			raid6_2data_recov(rbio->real_stripes, sectorsize,
+					  faila, failb, pointers);
+		}
+	} else {
+		void *p;
+
+		/* Rebuild from P stripe here (raid5 or raid6). */
+		ASSERT(failb == -1);
+pstripe:
+		/* Copy parity block into failed block to start with */
+		memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
+
+		/* Rearrange the pointer array */
+		p = pointers[faila];
+		for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
+		     stripe_nr++)
+			pointers[stripe_nr] = pointers[stripe_nr + 1];
+		pointers[rbio->nr_data - 1] = p;
+
+		/* Xor in the rest */
+		run_xor(pointers, rbio->nr_data - 1, sectorsize);
+
+	}
+
+	/*
+	 * No matter if this is a RMW or recovery, we should have all
+	 * failed sectors repaired in the vertical stripe, thus they are now
+	 * uptodate.
+	 * Especially if we determine to cache the rbio, we need to
+	 * have at least all data sectors uptodate.
+	 */
+	if (rbio->faila >= 0) {
+		sector = rbio_stripe_sector(rbio, rbio->faila, sector_nr);
+		sector->uptodate = 1;
+	}
+	if (rbio->failb >= 0) {
+		sector = rbio_stripe_sector(rbio, rbio->failb, sector_nr);
+		sector->uptodate = 1;
+	}
+
+cleanup:
+	for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
+		kunmap_local(unmap_array[stripe_nr]);
+}
+
 /*
  * all parity reconstruction happens here. We've read in everything
  * we can find from the drives and this does the heavy lifting of
@@ -1893,13 +2031,10 @@
  */
 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 {
-	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
-	int sectornr, stripe;
-	void **pointers;
-	void **unmap_array;
-	int faila = -1, failb = -1;
+	int sectornr;
+	void **pointers = NULL;
+	void **unmap_array = NULL;
 	blk_status_t err;
-	int i;
 
 	/*
 	 * This array stores the pointer for each sector, thus it has the extra
	 * pgoff value added from each sector
	 */
	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
	if (!pointers) {
		err = BLK_STS_RESOURCE;
-		goto cleanup_io;
+		goto cleanup;
	}

	/*
@@ -1918,11 +2053,12 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
 	if (!unmap_array) {
 		err = BLK_STS_RESOURCE;
-		goto cleanup_pointers;
+		goto cleanup;
 	}
 
-	faila = rbio->faila;
-	failb = rbio->failb;
+	/* Make sure faila and failb are in order. */
+	if (rbio->faila >= 0 && rbio->failb >= 0 && rbio->faila > rbio->failb)
+		swap(rbio->faila, rbio->failb);
 
 	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
 	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
@@ -1933,138 +2069,15 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 
 	index_rbio_pages(rbio);
 
-	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
-		struct sector_ptr *sector;
-
-		/*
-		 * Now we just use bitmap to mark the horizontal stripes in
-		 * which we have data when doing parity scrub.
- */ - if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(sectornr, &rbio->dbitmap)) - continue; - - /* - * Setup our array of pointers with sectors from each stripe - * - * NOTE: store a duplicate array of pointers to preserve the - * pointer order - */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - /* - * If we're rebuilding a read, we have to use - * pages from the bio list - */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && - (stripe == faila || stripe == failb)) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); - } - ASSERT(sector->page); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; - unmap_array[stripe] = pointers[stripe]; - } - - /* All raid6 handling here */ - if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { - /* Single failure, rebuild from parity raid5 style */ - if (failb < 0) { - if (faila == rbio->nr_data) { - /* - * Just the P stripe has failed, without - * a bad data or Q stripe. - * TODO, we should redo the xor here. - */ - err = BLK_STS_IOERR; - goto cleanup; - } - /* - * a single failure in raid6 is rebuilt - * in the pstripe code below - */ - goto pstripe; - } - - /* make sure our ps and qs are in order */ - if (faila > failb) - swap(faila, failb); - - /* if the q stripe is failed, do a pstripe reconstruction - * from the xors. - * If both the q stripe and the P stripe are failed, we're - * here due to a crc mismatch and we can't give them the - * data they want - */ - if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->bioc->raid_map[faila] == - RAID5_P_STRIPE) { - err = BLK_STS_IOERR; - goto cleanup; - } - /* - * otherwise we have one bad data stripe and - * a good P stripe. raid5! - */ - goto pstripe; - } - - if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { - raid6_datap_recov(rbio->real_stripes, - sectorsize, faila, pointers); - } else { - raid6_2data_recov(rbio->real_stripes, - sectorsize, faila, failb, - pointers); - } - } else { - void *p; - - /* rebuild from P stripe here (raid5 or raid6) */ - BUG_ON(failb != -1); -pstripe: - /* Copy parity block into failed block to start with */ - memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); - - /* rearrange the pointer array */ - p = pointers[faila]; - for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) - pointers[stripe] = pointers[stripe + 1]; - pointers[rbio->nr_data - 1] = p; - - /* xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, sectorsize); - } - - /* - * No matter if this is a RMW or recovery, we should have all - * failed sectors repaired, thus they are now uptodate. - * Especially if we determine to cache the rbio, we need to - * have at least all data sectors uptodate. 
-		 */
-		for (i = 0; i < rbio->stripe_nsectors; i++) {
-			if (faila != -1) {
-				sector = rbio_stripe_sector(rbio, faila, i);
-				sector->uptodate = 1;
-			}
-			if (failb != -1) {
-				sector = rbio_stripe_sector(rbio, failb, i);
-				sector->uptodate = 1;
-			}
-		}
-		for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
-			kunmap_local(unmap_array[stripe]);
-	}
+	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
+		recover_vertical(rbio, sectornr, pointers, unmap_array);
 
 	err = BLK_STS_OK;
+
 cleanup:
 	kfree(unmap_array);
-cleanup_pointers:
 	kfree(pointers);
-cleanup_io:
 	/*
 	 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
 	 * valid rbio which is consistent with ondisk content, thus such a

From 30e3c897f4a8d070d4fa079f6dade4b3e3cb74ad Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 1 Nov 2022 19:16:02 +0800
Subject: [PATCH 135/249] btrfs: raid56: extract the pq generation code into
 a helper

Currently finish_rmw() will update the P/Q stripes before submitting
the writes. It's done inside a for loop and the code is a little
congested indent-wise, so extract it into a helper called
generate_pq_vertical().

Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/raid56.c | 90 +++++++++++++++++++++------------------------
 1 file changed, 44 insertions(+), 46 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 34d51d2ef746..66dd4e9752a9 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1193,6 +1193,48 @@ not_found:
 	trace_info->stripe_nr = -1;
 }
 
+/* Generate PQ for one vertical stripe. */
+static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
+{
+	void **pointers = rbio->finish_pointers;
+	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
+	struct sector_ptr *sector;
+	int stripe;
+	const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
+
+	/* First collect one sector from each data stripe */
+	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
+		sector = sector_in_rbio(rbio, stripe, sectornr, 0);
+		pointers[stripe] = kmap_local_page(sector->page) +
+				   sector->pgoff;
+	}
+
+	/* Then add the parity stripe */
+	sector = rbio_pstripe_sector(rbio, sectornr);
+	sector->uptodate = 1;
+	pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
+
+	if (has_qstripe) {
+		/*
+		 * RAID6, add the qstripe and call the library function
+		 * to fill in our p/q
+		 */
+		sector = rbio_qstripe_sector(rbio, sectornr);
+		sector->uptodate = 1;
+		pointers[stripe++] = kmap_local_page(sector->page) +
+				     sector->pgoff;
+
+		raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
+					pointers);
+	} else {
+		/* raid5 */
+		memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
+		run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
+	}
+	for (stripe = stripe - 1; stripe >= 0; stripe--)
+		kunmap_local(pointers[stripe]);
+}
+
 /*
  * this is called from one of two situations. We either
  * have a full stripe from the higher layers, or we've read all
@@ -1204,28 +1246,17 @@ not_found:
 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 {
 	struct btrfs_io_context *bioc = rbio->bioc;
-	const u32 sectorsize = bioc->fs_info->sectorsize;
-	void **pointers = rbio->finish_pointers;
-	int nr_data = rbio->nr_data;
 	/* The total sector number inside the full stripe. */
 	int total_sector_nr;
 	int stripe;
 	/* Sector number inside a stripe.
	 */
 	int sectornr;
-	bool has_qstripe;
 	struct bio_list bio_list;
 	struct bio *bio;
 	int ret;
 
 	bio_list_init(&bio_list);
 
-	if (rbio->real_stripes - rbio->nr_data == 1)
-		has_qstripe = false;
-	else if (rbio->real_stripes - rbio->nr_data == 2)
-		has_qstripe = true;
-	else
-		BUG();
 
 	/* We should have at least one data sector. */
 	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
@@ -1258,41 +1289,8 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 	else
 		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 
-	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
-		struct sector_ptr *sector;
-
-		/* First collect one sector from each data stripe */
-		for (stripe = 0; stripe < nr_data; stripe++) {
-			sector = sector_in_rbio(rbio, stripe, sectornr, 0);
-			pointers[stripe] = kmap_local_page(sector->page) +
-					   sector->pgoff;
-		}
-
-		/* Then add the parity stripe */
-		sector = rbio_pstripe_sector(rbio, sectornr);
-		sector->uptodate = 1;
-		pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
-
-		if (has_qstripe) {
-			/*
-			 * RAID6, add the qstripe and call the library function
-			 * to fill in our p/q
-			 */
-			sector = rbio_qstripe_sector(rbio, sectornr);
-			sector->uptodate = 1;
-			pointers[stripe++] = kmap_local_page(sector->page) +
-					     sector->pgoff;
-
-			raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
-						pointers);
-		} else {
-			/* raid5 */
-			memcpy(pointers[nr_data], pointers[0], sectorsize);
-			run_xor(pointers + 1, nr_data - 1, sectorsize);
-		}
-		for (stripe = stripe - 1; stripe >= 0; stripe--)
-			kunmap_local(pointers[stripe]);
-	}
+	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
+		generate_pq_vertical(rbio, sectornr);
 
 	/*
 	 * Start writing. Make bios for everything from the higher layers (the

From d31968d9b6ac684e7e64be0d46eb0f5fe38d1841 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 1 Nov 2022 19:16:03 +0800
Subject: [PATCH 136/249] btrfs: raid56: extract the recovery bio list build
 code into a helper

This new helper will also be utilized in the incoming refactor of the
recovery path.

Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/raid56.c | 64 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 41 insertions(+), 23 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 66dd4e9752a9..9234d3423b31 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -2134,30 +2134,14 @@ static void raid_recover_end_io_work(struct work_struct *work)
 	__raid_recover_end_io(rbio);
 }
 
-/*
- * reads everything we need off the disk to reconstruct
- * the parity. endio handlers trigger final reconstruction
- * when the IO is done.
- *
- * This is used both for reads from the higher layers and for
- * parity construction required to finish a rmw cycle.
- */
-static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
+static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio,
+				      struct bio_list *bio_list)
 {
-	int bios_to_read = 0;
-	struct bio_list bio_list;
-	int ret;
-	int total_sector_nr;
 	struct bio *bio;
+	int total_sector_nr;
+	int ret = 0;
 
-	bio_list_init(&bio_list);
-
-	ret = alloc_rbio_pages(rbio);
-	if (ret)
-		goto cleanup;
-
-	atomic_set(&rbio->error, 0);
-
+	ASSERT(bio_list_size(bio_list) == 0);
 	/*
 	 * Read everything that hasn't failed. However this time we will
 	 * not trust any cached sector.
@@ -2180,11 +2164,45 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
 			continue;
 		}
 		sector = rbio_stripe_sector(rbio, stripe, sectornr);
-		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+		ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
 					 sectornr, REQ_OP_READ);
 		if (ret < 0)
-			goto cleanup;
+			goto error;
 	}
+	return 0;
+error:
+	while ((bio = bio_list_pop(bio_list)))
+		bio_put(bio);
+
+	return -EIO;
+}
+
+/*
+ * reads everything we need off the disk to reconstruct
+ * the parity. endio handlers trigger final reconstruction
+ * when the IO is done.
+ *
+ * This is used both for reads from the higher layers and for
+ * parity construction required to finish a rmw cycle.
+ */
+static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
+{
+	int bios_to_read = 0;
+	struct bio_list bio_list;
+	int ret;
+	struct bio *bio;
+
+	bio_list_init(&bio_list);
+
+	ret = alloc_rbio_pages(rbio);
+	if (ret)
+		goto cleanup;
+
+	atomic_set(&rbio->error, 0);
+
+	ret = recover_assemble_read_bios(rbio, &bio_list);
+	if (ret < 0)
+		goto cleanup;
 
 	bios_to_read = bio_list_size(&bio_list);
 	if (!bios_to_read) {

From ec936b0354e2d716e977e16165d188db044696b7 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 1 Nov 2022 19:16:04 +0800
Subject: [PATCH 137/249] btrfs: raid56: extract sector recovery code into a
 helper

This also includes the following extra changes:

- The allocation for unmap_array[] and pointers[]
  Now we allocate them in one go, and free them together.

- Remove @err
  Use errno_to_blk_status(ret) instead.

Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/raid56.c | 59 +++++++++++++++++++++++------------------------
 1 file changed, 29 insertions(+), 30 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 9234d3423b31..da76e72383c3 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -2022,36 +2022,24 @@ cleanup:
 		kunmap_local(unmap_array[stripe_nr]);
 }
 
-/*
- * all parity reconstruction happens here. We've read in everything
- * we can find from the drives and this does the heavy lifting of
- * sorting the good from the bad.
- */
-static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
+static int recover_sectors(struct btrfs_raid_bio *rbio)
 {
-	int sectornr;
 	void **pointers = NULL;
 	void **unmap_array = NULL;
-	blk_status_t err;
+	int sectornr;
+	int ret = 0;
 
 	/*
-	 * This array stores the pointer for each sector, thus it has the extra
-	 * pgoff value added from each sector
+	 * @pointers array stores the pointer for each sector.
+	 *
+	 * @unmap_array stores copy of pointers that does not get reordered
+	 * during reconstruction so that kunmap_local works.
 	 */
 	pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
-	if (!pointers) {
-		err = BLK_STS_RESOURCE;
-		goto cleanup;
-	}
-
-	/*
-	 * Store copy of pointers that does not get reordered during
-	 * reconstruction so that kunmap_local works.
-	 */
 	unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
-	if (!unmap_array) {
-		err = BLK_STS_RESOURCE;
-		goto cleanup;
+	if (!pointers || !unmap_array) {
+		ret = -ENOMEM;
+		goto out;
 	}
 
 	/* Make sure faila and failb are in order. */
@@ -2070,11 +2058,22 @@
 	for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
 		recover_vertical(rbio, sectornr, pointers, unmap_array);
 
-	err = BLK_STS_OK;
-
-cleanup:
-	kfree(unmap_array);
+out:
 	kfree(pointers);
+	kfree(unmap_array);
+	return ret;
+}
+
+/*
+ * all parity reconstruction happens here. We've read in everything
+ * we can find from the drives and this does the heavy lifting of
+ * sorting the good from the bad.
+ */
+static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
+{
+	int ret;
+
+	ret = recover_sectors(rbio);
 
 	/*
 	 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
@@ -2098,13 +2097,13 @@ cleanup:
 	 * Cache this rbio iff the above read reconstruction is
 	 * executed without problems.
 	 */
-	if (err == BLK_STS_OK && rbio->failb < 0)
+	if (!ret && rbio->failb < 0)
 		cache_rbio_pages(rbio);
 	else
 		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 
-	rbio_orig_end_io(rbio, err);
-	} else if (err == BLK_STS_OK) {
+	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
+	} else if (!ret) {
 		rbio->faila = -1;
 		rbio->failb = -1;
 
@@ -2115,7 +2114,7 @@ cleanup:
 		else
 			BUG();
 	} else {
-		rbio_orig_end_io(rbio, err);
+		rbio_orig_end_io(rbio, errno_to_blk_status(ret));
 	}
 }

From d817ce35d24a53c6736ac68e759ed83135ff7c3b Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 1 Nov 2022 19:16:05 +0800
Subject: [PATCH 138/249] btrfs: raid56: switch recovery path to a single
 function

Currently btrfs uses end_io functions to jump between different stages
of recovery. For example, we go through the following functions:

- raid56_bio_end_io()
  This handles the read for all the sectors (except the missing
  device).

- __raid_recover_end_io()
  This does the real work, it's called inside the delayed work function
  raid_recover_end_io_work().

This one recovery path involves at least 3 different functions, which
is a big burden for readers.

This patch will change the behavior by:

- Introduce a unified recovery entrance, recover_rbio()

- Use submit-and-wait method
  So the workflow is not interrupted by the endio function jump.
  This doesn't bring a performance change, but reduces the burden for
  reviewers.

- Run the main function in the rmw_workers workqueue
  Now raid56_parity_recover() only needs to set up the work and queue
  it using start_async_work().

Now readers only need to do one function jump (start_async_work()) to
find the main entrance of the recovery path.

Furthermore, the recover_rbio() function can easily be reused by other
paths.

The old recovery path is still utilized by the degraded write path. It
will be cleaned up when we have migrated the write path.

Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/raid56.c | 144 +++++++++++++++++++++++++++++++++++++---------
 fs/btrfs/raid56.h |   2 +
 2 files changed, 119 insertions(+), 27 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index da76e72383c3..2dd87ceda9db 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -67,7 +67,6 @@ struct sector_ptr {
 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
 static void rmw_work(struct work_struct *work);
-static void read_rebuild_work(struct work_struct *work);
 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
@@ -752,6 +751,8 @@ out:
 	return ret;
 }
 
+static void recover_rbio_work_locked(struct work_struct *work);
+
 /*
  * called as rmw or parity rebuild is completed.
If the plug list has more * rbios waiting for this stripe, the next one on the list will be started @@ -809,10 +810,10 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) spin_unlock_irqrestore(&h->lock, flags); if (next->operation == BTRFS_RBIO_READ_REBUILD) - start_async_work(next, read_rebuild_work); + start_async_work(next, recover_rbio_work_locked); else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { steal_rbio(rbio, next); - start_async_work(next, read_rebuild_work); + start_async_work(next, recover_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); start_async_work(next, rmw_work); @@ -989,6 +990,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, } bio_list_init(&rbio->bio_list); + init_waitqueue_head(&rbio->io_wait); INIT_LIST_HEAD(&rbio->plug_list); spin_lock_init(&rbio->bio_list_lock); INIT_LIST_HEAD(&rbio->stripe_cache); @@ -1519,6 +1521,39 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) } } +static void raid_wait_read_end_io(struct bio *bio) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + + if (bio->bi_status) + fail_bio_stripe(rbio, bio); + else + set_bio_pages_uptodate(rbio, bio); + + bio_put(bio); + if (atomic_dec_and_test(&rbio->stripes_pending)) + wake_up(&rbio->io_wait); +} + +static void submit_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; + + atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); + while ((bio = bio_list_pop(bio_list))) { + bio->bi_end_io = raid_wait_read_end_io; + + if (trace_raid56_scrub_read_recover_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_read_recover(rbio, bio, &trace_info); + } + submit_bio(bio); + } +} + static void raid56_bio_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; @@ -2176,6 +2211,79 @@ error: return -EIO; } +static int recover_rbio(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + struct bio *bio; + int ret; + + /* + * Either we're doing recover for a read failure or degraded write, + * caller should have set faila/b correctly. + */ + ASSERT(rbio->faila >= 0 || rbio->failb >= 0); + bio_list_init(&bio_list); + + /* + * Reset error to 0, as we will later increase error for missing + * devices. + */ + atomic_set(&rbio->error, 0); + + /* For recovery, we need to read all sectors including P/Q. */ + ret = alloc_rbio_pages(rbio); + if (ret < 0) + goto out; + + index_rbio_pages(rbio); + + ret = recover_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto out; + + submit_read_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + /* We have more errors than our tolerance during the read. 
*/ + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { + ret = -EIO; + goto out; + } + + ret = recover_sectors(rbio); + +out: + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + + return ret; +} + +static void recover_rbio_work(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = container_of(work, struct btrfs_raid_bio, work); + + ret = lock_stripe_add(rbio); + if (ret == 0) { + ret = recover_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } +} + +static void recover_rbio_work_locked(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = container_of(work, struct btrfs_raid_bio, work); + + ret = recover_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); +} + /* * reads everything we need off the disk to reconstruct * the parity. endio handlers trigger final reconstruction @@ -2264,7 +2372,8 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); - goto out_end_bio; + bio_endio(bio); + return; } rbio->operation = BTRFS_RBIO_READ_REBUILD; @@ -2278,7 +2387,8 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, (u64)bio->bi_iter.bi_size, bioc->map_type); free_raid_bio(rbio); bio->bi_status = BLK_STS_IOERR; - goto out_end_bio; + bio_endio(bio); + return; } /* @@ -2298,18 +2408,7 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio->failb--; } - if (lock_stripe_add(rbio)) - return; - - /* - * This adds our rbio to the list of rbios that will be handled after - * the current lock owner is done. - */ - __raid56_parity_recover(rbio); - return; - -out_end_bio: - bio_endio(bio); + start_async_work(rbio, recover_rbio_work); } static void rmw_work(struct work_struct *work) @@ -2320,14 +2419,6 @@ static void rmw_work(struct work_struct *work) raid56_rmw_stripe(rbio); } -static void read_rebuild_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio; - - rbio = container_of(work, struct btrfs_raid_bio, work); - __raid56_parity_recover(rbio); -} - /* * The following code is used to scrub/replace the parity stripe * @@ -2818,6 +2909,5 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) { - if (!lock_stripe_add(rbio)) - start_async_work(rbio, read_rebuild_work); + start_async_work(rbio, recover_rbio_work); } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 91d5c0adad15..445e833fcfcf 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -95,6 +95,8 @@ struct btrfs_raid_bio { atomic_t error; + wait_queue_head_t io_wait; + struct work_struct end_io_work; /* Bitmap to record which horizontal stripe has data */ From 509c27aa2fb69b4475d05d0ac5c5c4206e28d5d6 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Nov 2022 19:16:06 +0800 Subject: [PATCH 139/249] btrfs: raid56: extract the rmw bio list build code into a helper The helper will later be used to refactor the whole RMW path. 
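For context, a condensed sketch of the submit-and-wait pattern that the
assembled bio list feeds into (simplified from the surrounding patches;
error handling and tracing are dropped, and the demo_* names are made
up):

	/* Completion side: the last finishing bio wakes the waiter. */
	static void demo_read_end_io(struct bio *bio)
	{
		struct btrfs_raid_bio *rbio = bio->bi_private;

		bio_put(bio);
		if (atomic_dec_and_test(&rbio->stripes_pending))
			wake_up(&rbio->io_wait);
	}

	/* Submission side: queue every assembled bio, then sleep. */
	static void demo_submit_and_wait(struct btrfs_raid_bio *rbio,
					 struct bio_list *bios)
	{
		struct bio *bio;

		atomic_set(&rbio->stripes_pending, bio_list_size(bios));
		while ((bio = bio_list_pop(bios))) {
			bio->bi_end_io = demo_read_end_io;
			submit_bio(bio);
		}
		wait_event(rbio->io_wait,
			   atomic_read(&rbio->stripes_pending) == 0);
	}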
Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/raid56.c | 56 ++++++++++++++++++++++++++++++---------------
 1 file changed, 38 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 2dd87ceda9db..88e2204c9f28 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1595,28 +1595,16 @@ static void raid56_rmw_end_io_work(struct work_struct *work)
 	validate_rbio_for_rmw(rbio);
 }
 
-/*
- * the stripe must be locked by the caller. It will
- * unlock after all the writes are done
- */
-static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
+static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio,
+				  struct bio_list *bio_list)
 {
-	int bios_to_read = 0;
-	struct bio_list bio_list;
 	const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data;
-	int ret;
-	int total_sector_nr;
 	struct bio *bio;
+	int total_sector_nr;
+	int ret = 0;
 
-	bio_list_init(&bio_list);
+	ASSERT(bio_list_size(bio_list) == 0);
 
-	ret = alloc_rbio_pages(rbio);
-	if (ret)
-		goto cleanup;
-
-	index_rbio_pages(rbio);
-
-	atomic_set(&rbio->error, 0);
 	/* Build a list of bios to read all the missing data sectors. */
 	for (total_sector_nr = 0; total_sector_nr < nr_data_sectors;
 	     total_sector_nr++) {
@@ -1641,11 +1629,43 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
 		if (sector->uptodate)
 			continue;
 
-		ret = rbio_add_io_sector(rbio, &bio_list, sector,
+		ret = rbio_add_io_sector(rbio, bio_list, sector,
 					 stripe, sectornr, REQ_OP_READ);
 		if (ret)
 			goto cleanup;
 	}
+	return 0;
+
+cleanup:
+	while ((bio = bio_list_pop(bio_list)))
+		bio_put(bio);
+	return ret;
+}
+
+/*
+ * the stripe must be locked by the caller. It will
+ * unlock after all the writes are done
+ */
+static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
+{
+	int bios_to_read = 0;
+	struct bio_list bio_list;
+	int ret;
+	struct bio *bio;
+
+	bio_list_init(&bio_list);
+
+	ret = alloc_rbio_pages(rbio);
+	if (ret)
+		goto cleanup;
+
+	index_rbio_pages(rbio);
+
+	atomic_set(&rbio->error, 0);
+
+	ret = rmw_assemble_read_bios(rbio, &bio_list);
+	if (ret < 0)
+		goto cleanup;
 
 	bios_to_read = bio_list_size(&bio_list);
 	if (!bios_to_read) {

From 6486d21c99cb46666c11632e4d595568863b965c Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 1 Nov 2022 19:16:07 +0800
Subject: [PATCH 140/249] btrfs: raid56: extract rmw write bios assembly into
 a helper

The helper will later be used to refactor the rmw write path.

Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/raid56.c | 165 ++++++++++++++++++++++++++--------------------
 1 file changed, 94 insertions(+), 71 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 88e2204c9f28..5e5bfc2c21fc 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1237,6 +1237,97 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
 		kunmap_local(pointers[stripe]);
 }
 
+static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
+				   struct bio_list *bio_list)
+{
+	struct bio *bio;
+	/* The total sector number inside the full stripe. */
+	int total_sector_nr;
+	int sectornr;
+	int stripe;
+	int ret;
+
+	ASSERT(bio_list_size(bio_list) == 0);
+
+	/* We should have at least one data sector. */
+	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
+
+	/*
+	 * Start assembly. Make bios for everything from the higher layers (the
+	 * bio_list in our rbio) and our P/Q. Ignore everything else.
+ */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; + + /* This vertical stripe has no data, skip it. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + + if (stripe < rbio->nr_data) { + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) + continue; + } else { + sector = rbio_stripe_sector(rbio, stripe, sectornr); + } + + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, + sectornr, REQ_OP_WRITE); + if (ret) + goto error; + } + + if (likely(!rbio->bioc->num_tgtdevs)) + return 0; + + /* Make a copy for the replace target device. */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; + + if (!rbio->bioc->tgtdev_map[stripe]) { + /* + * We can skip the whole stripe completely, note + * total_sector_nr will be increased by one anyway. + */ + ASSERT(sectornr == 0); + total_sector_nr += rbio->stripe_nsectors - 1; + continue; + } + + /* This vertical stripe has no data, skip it. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + + if (stripe < rbio->nr_data) { + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) + continue; + } else { + sector = rbio_stripe_sector(rbio, stripe, sectornr); + } + + ret = rbio_add_io_sector(rbio, bio_list, sector, + rbio->bioc->tgtdev_map[stripe], + sectornr, REQ_OP_WRITE); + if (ret) + goto error; + } + + return 0; +error: + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); + return -EIO; +} + /* * this is called from one of two situations. We either * have a full stripe from the higher layers, or we've read all @@ -1247,10 +1338,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) */ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { - struct btrfs_io_context *bioc = rbio->bioc; /* The total sector number inside the full stripe. */ - int total_sector_nr; - int stripe; /* Sector number inside a stripe. */ int sectornr; struct bio_list bio_list; @@ -1294,75 +1382,10 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) generate_pq_vertical(rbio, sectornr); - /* - * Start writing. Make bios for everything from the higher layers (the - * bio_list in our rbio) and our P/Q. Ignore everything else. - */ - for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; - total_sector_nr++) { - struct sector_ptr *sector; + ret = rmw_assemble_write_bios(rbio, &bio_list); + if (ret < 0) + goto cleanup; - stripe = total_sector_nr / rbio->stripe_nsectors; - sectornr = total_sector_nr % rbio->stripe_nsectors; - - /* This vertical stripe has no data, skip it. 
 */
-		if (!test_bit(sectornr, &rbio->dbitmap))
-			continue;
-
-		if (stripe < rbio->nr_data) {
-			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
-			if (!sector)
-				continue;
-		} else {
-			sector = rbio_stripe_sector(rbio, stripe, sectornr);
-		}
-
-		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
-					 sectornr, REQ_OP_WRITE);
-		if (ret)
-			goto cleanup;
-	}
-
-	if (likely(!bioc->num_tgtdevs))
-		goto write_data;
-
-	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
-	     total_sector_nr++) {
-		struct sector_ptr *sector;
-
-		stripe = total_sector_nr / rbio->stripe_nsectors;
-		sectornr = total_sector_nr % rbio->stripe_nsectors;
-
-		if (!bioc->tgtdev_map[stripe]) {
-			/*
-			 * We can skip the whole stripe completely, note
-			 * total_sector_nr will be increased by one anyway.
-			 */
-			ASSERT(sectornr == 0);
-			total_sector_nr += rbio->stripe_nsectors - 1;
-			continue;
-		}
-
-		/* This vertical stripe has no data, skip it. */
-		if (!test_bit(sectornr, &rbio->dbitmap))
-			continue;
-
-		if (stripe < rbio->nr_data) {
-			sector = sector_in_rbio(rbio, stripe, sectornr, 1);
-			if (!sector)
-				continue;
-		} else {
-			sector = rbio_stripe_sector(rbio, stripe, sectornr);
-		}
-
-		ret = rbio_add_io_sector(rbio, &bio_list, sector,
-					 rbio->bioc->tgtdev_map[stripe],
-					 sectornr, REQ_OP_WRITE);
-		if (ret)
-			goto cleanup;
-	}
-
-write_data:
 	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
 	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);

From 5eb30ee26fa4dbd2d31f50ee3b4212933f86cb57 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 1 Nov 2022 19:16:08 +0800
Subject: [PATCH 141/249] btrfs: raid56: introduce the main entrance for RMW
 path

The new entrance will be called rmw_rbio(). It will have a streamlined
workflow using the submit-and-wait method, so there are no weird jumps
between tons of functions, which makes it way more reader friendly and
will make later expansion easier: it's now a straight workflow and the
timing is much clearer.

Unfortunately we cannot yet migrate the RMW path to use this new
entrance, as we still need extra work to address the plug and the
unlock_stripe() function.

Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/raid56.c | 161 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/raid56.h |   5 ++
 2 files changed, 166 insertions(+)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 5e5bfc2c21fc..8fd633f01d9e 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1252,6 +1252,14 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
 	/* We should have at least one data sector. */
 	ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
 
+	/*
+	 * Reset errors, as we may have errors inherited from degraded
+	 * write.
+	 */
+	atomic_set(&rbio->error, 0);
+	rbio->faila = -1;
+	rbio->failb = -1;
+
 	/*
 	 * Start assembly. Make bios for everything from the higher layers (the
 	 * bio_list in our rbio) and our P/Q. Ignore everything else.
@@ -1665,6 +1673,19 @@ cleanup:
 	return ret;
 }
 
+static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
+{
+	const int data_pages = rbio->nr_data * rbio->stripe_npages;
+	int ret;
+
+	ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages);
+	if (ret < 0)
+		return ret;
+
+	index_stripe_sectors(rbio);
+	return 0;
+}
+
 /*
  * the stripe must be locked by the caller.
It will * unlock after all the writes are done @@ -2454,6 +2475,146 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, start_async_work(rbio, recover_rbio_work); } +static int rmw_read_and_wait(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + struct bio *bio; + int ret; + + bio_list_init(&bio_list); + atomic_set(&rbio->error, 0); + + ret = rmw_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto out; + + submit_read_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + return ret; +out: + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + + return ret; +} + +static void raid_wait_write_end_io(struct bio *bio) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + blk_status_t err = bio->bi_status; + + if (err) + fail_bio_stripe(rbio, bio); + bio_put(bio); + if (atomic_dec_and_test(&rbio->stripes_pending)) + wake_up(&rbio->io_wait); +} + +static void submit_write_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; + + atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); + while ((bio = bio_list_pop(bio_list))) { + bio->bi_end_io = raid_wait_write_end_io; + + if (trace_raid56_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_write_stripe(rbio, bio, &trace_info); + } + submit_bio(bio); + } +} + +int rmw_rbio(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + int sectornr; + int ret = 0; + + /* + * Allocate the pages for parity first, as P/Q pages will always be + * needed for both full-stripe and sub-stripe writes. + */ + ret = alloc_rbio_parity_pages(rbio); + if (ret < 0) + return ret; + + /* Full stripe write, can write the full stripe right now. */ + if (rbio_is_full(rbio)) + goto write; + /* + * Now we're doing sub-stripe write, also need all data stripes to do + * the full RMW. + */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + return ret; + + atomic_set(&rbio->error, 0); + index_rbio_pages(rbio); + + ret = rmw_read_and_wait(rbio); + if (ret < 0) + return ret; + + /* Too many read errors, beyond our tolerance. */ + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) + return ret; + + /* Have read failures but under tolerance, needs recovery. */ + if (rbio->faila >= 0 || rbio->failb >= 0) { + ret = recover_rbio(rbio); + if (ret < 0) + return ret; + } +write: + /* + * At this stage we're not allowed to add any new bios to the + * bio list any more, anyone else that wants to change this stripe + * needs to do their own rmw. + */ + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + + atomic_set(&rbio->error, 0); + + index_rbio_pages(rbio); + + /* + * We don't cache full rbios because we're assuming + * the higher layers are unlikely to use this area of + * the disk again soon. If they do use it again, + * hopefully they will send another full bio. + */ + if (!rbio_is_full(rbio)) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) + generate_pq_vertical(rbio, sectornr); + + bio_list_init(&bio_list); + ret = rmw_assemble_write_bios(rbio, &bio_list); + if (ret < 0) + return ret; + + /* We should have at least one bio assembled. 
 */
+	ASSERT(bio_list_size(&bio_list));
+	submit_write_bios(rbio, &bio_list);
+	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+
+	/* We have more errors than our tolerance during the read. */
+	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
+		ret = -EIO;
+	return ret;
+}
+
 static void rmw_work(struct work_struct *work)
 {
 	struct btrfs_raid_bio *rbio;
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 445e833fcfcf..0e77c77c5dba 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -185,4 +185,9 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
 
+/*
+ * Placeholder definition to avoid warning, will be removed when
+ * the full write path is migrated.
+ */
+int rmw_rbio(struct btrfs_raid_bio *rbio);
 #endif

From 93723095b5d54b923abf07459998bcb9bbac8ba6 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 1 Nov 2022 19:16:09 +0800
Subject: [PATCH 142/249] btrfs: raid56: switch write path to rmw_rbio()

This includes the following changes:

- Implement a new raid_unplug() function
  Now we don't need a workqueue to run the plug, as all our work is
  just queuing the rmw_rbio_work() call, which can be executed without
  sleeping.

- Implement a rmw_rbio_work_locked() helper
  This is for unlock_stripe(), which is already holding the full stripe
  lock.

- Remove all the old functions
  This should already show how complex the old functions were, as we
  ended up removing the following functions:
  * rmw_work()
  * validate_rbio_for_rmw()
  * raid56_rmw_end_io_work()
  * raid56_rmw_stripe()
  * full_stripe_write()
  * partial_stripe_write()
  * __raid56_parity_write()
  * run_plug()
  * unplug_work()
  * btrfs_raid_unplug()
  * __raid56_parity_recover()
  * raid_recover_end_io_work()

- Unexport rmw_rbio()

Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/raid56.c | 350 ++++++----------------------------------------
 fs/btrfs/raid56.h |   5 -
 2 files changed, 42 insertions(+), 313 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 8fd633f01d9e..ffedbfde95e0 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -64,9 +64,9 @@ struct sector_ptr {
 	unsigned int uptodate:8;
 };
 
-static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
-static void rmw_work(struct work_struct *work);
+static void rmw_rbio_work(struct work_struct *work);
+static void rmw_rbio_work_locked(struct work_struct *work);
 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
@@ -816,7 +816,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 			start_async_work(next, recover_rbio_work_locked);
 		} else if (next->operation == BTRFS_RBIO_WRITE) {
 			steal_rbio(rbio, next);
-			start_async_work(next, rmw_work);
+			start_async_work(next, rmw_rbio_work_locked);
 		} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
 			steal_rbio(rbio, next);
 			start_async_work(next, scrub_parity_work);
@@ -1108,23 +1108,6 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
 	return 0;
 }
 
-/*
- * while we're doing the read/modify/write cycle, we could
- * have errors in reading pages off the disk. This checks
- * for errors and if we're not able to read the page it'll
- * trigger parity reconstruction.
The rmw will be finished - * after we've reconstructed the failed stripes - */ -static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) -{ - if (rbio->faila >= 0 || rbio->failb >= 0) { - BUG_ON(rbio->faila == rbio->real_stripes - 1); - __raid56_parity_recover(rbio); - } else { - finish_rmw(rbio); - } -} - static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; @@ -1601,31 +1584,6 @@ static void raid56_bio_end_io(struct bio *bio) &rbio->end_io_work); } -/* - * End io handler for the read phase of the RMW cycle. All the bios here are - * physical stripe bios we've read from the disk so we can recalculate the - * parity of the stripe. - * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid56_rmw_end_io_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); - - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { - rbio_orig_end_io(rbio, BLK_STS_IOERR); - return; - } - - /* - * This will normally call finish_rmw to start our write but if there - * are any failed stripes we'll reconstruct from parity first. - */ - validate_rbio_for_rmw(rbio); -} - static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { @@ -1686,122 +1644,6 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) return 0; } -/* - * the stripe must be locked by the caller. It will - * unlock after all the writes are done - */ -static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) -{ - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - struct bio *bio; - - bio_list_init(&bio_list); - - ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; - - index_rbio_pages(rbio); - - atomic_set(&rbio->error, 0); - - ret = rmw_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto cleanup; - - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * this can happen if others have merged with - * us, it means there is nothing left to read. - * But if there are missing devices it may not be - * safe to do the full stripe write yet. - */ - goto finish; - } - - /* - * The bioc may be freed once we submit the last bio. Make sure not to - * touch it after that. - */ - atomic_set(&rbio->stripes_pending, bios_to_read); - INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work); - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_bio_end_io; - - if (trace_raid56_read_partial_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_read_partial(rbio, bio, &trace_info); - } - submit_bio(bio); - } - /* the actual write will happen once the reads are done */ - return 0; - -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); - - return -EIO; - -finish: - validate_rbio_for_rmw(rbio); - return 0; -} - -/* - * if the upper layers pass in a full stripe, we thank them by only allocating - * enough pages to hold the parity, and sending it all down quickly. - */ -static int full_stripe_write(struct btrfs_raid_bio *rbio) -{ - int ret; - - ret = alloc_rbio_parity_pages(rbio); - if (ret) - return ret; - - ret = lock_stripe_add(rbio); - if (ret == 0) - finish_rmw(rbio); - return 0; -} - -/* - * partial stripe writes get handed over to async helpers. 
- * We're really hoping to merge a few more writes into this - * rbio before calculating new parity - */ -static int partial_stripe_write(struct btrfs_raid_bio *rbio) -{ - int ret; - - ret = lock_stripe_add(rbio); - if (ret == 0) - start_async_work(rbio, rmw_work); - return 0; -} - -/* - * sometimes while we were reading from the drive to - * recalculate parity, enough new bios come into create - * a full stripe. So we do a check here to see if we can - * go directly to finish_rmw - */ -static int __raid56_parity_write(struct btrfs_raid_bio *rbio) -{ - /* head off into rmw land if we don't have a full stripe */ - if (!rbio_is_full(rbio)) - return partial_stripe_write(rbio); - return full_stripe_write(rbio); -} - /* * We use plugging call backs to collect full stripes. * Any time we get a partial stripe write while plugged @@ -1836,28 +1678,22 @@ static int plug_cmp(void *priv, const struct list_head *a, return 0; } -static void run_plug(struct btrfs_plug_cb *plug) +static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) { + struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb); struct btrfs_raid_bio *cur; struct btrfs_raid_bio *last = NULL; - /* - * sort our plug list then try to merge - * everything we can in hopes of creating full - * stripes. - */ list_sort(NULL, &plug->rbio_list, plug_cmp); + while (!list_empty(&plug->rbio_list)) { cur = list_entry(plug->rbio_list.next, struct btrfs_raid_bio, plug_list); list_del_init(&cur->plug_list); if (rbio_is_full(cur)) { - int ret; - - /* we have a full stripe, send it down */ - ret = full_stripe_write(cur); - BUG_ON(ret); + /* We have a full stripe, queue it down. */ + start_async_work(cur, rmw_rbio_work); continue; } if (last) { @@ -1865,42 +1701,16 @@ static void run_plug(struct btrfs_plug_cb *plug) merge_rbio(last, cur); free_raid_bio(cur); continue; - } - __raid56_parity_write(last); + start_async_work(last, rmw_rbio_work); } last = cur; } - if (last) { - __raid56_parity_write(last); - } + if (last) + start_async_work(last, rmw_rbio_work); kfree(plug); } -/* - * if the unplug comes from schedule, we have to push the - * work off to a helper thread - */ -static void unplug_work(struct work_struct *work) -{ - struct btrfs_plug_cb *plug; - plug = container_of(work, struct btrfs_plug_cb, work); - run_plug(plug); -} - -static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) -{ - struct btrfs_plug_cb *plug; - plug = container_of(cb, struct btrfs_plug_cb, cb); - - if (from_schedule) { - INIT_WORK(&plug->work, unplug_work); - queue_work(plug->info->rmw_workers, &plug->work); - return; - } - run_plug(plug); -} - /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. 
*/ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) { @@ -1948,19 +1758,13 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) rbio_add_bio(rbio, bio); /* - * don't plug on full rbios, just get them out the door + * Don't plug on full rbios, just get them out the door * as quickly as we can */ - if (rbio_is_full(rbio)) { - ret = full_stripe_write(rbio); - if (ret) { - free_raid_bio(rbio); - goto fail; - } - return; - } + if (rbio_is_full(rbio)) + goto queue_rbio; - cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); + cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); if (cb) { plug = container_of(cb, struct btrfs_plug_cb, cb); if (!plug->info) { @@ -1968,13 +1772,14 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); - } else { - ret = __raid56_parity_write(rbio); - if (ret) { - free_raid_bio(rbio); - goto fail; - } + return; } +queue_rbio: + /* + * Either we don't have any existing plug, or we're doing a full stripe, + * can queue the rmw work now. + */ + start_async_work(rbio, rmw_rbio_work); return; @@ -2217,21 +2022,6 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } } -/* - * This is called only for stripes we've read from disk to reconstruct the - * parity. - */ -static void raid_recover_end_io_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); - - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - rbio_orig_end_io(rbio, BLK_STS_IOERR); - else - __raid_recover_end_io(rbio); -} - static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { @@ -2348,79 +2138,6 @@ static void recover_rbio_work_locked(struct work_struct *work) rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } -/* - * reads everything we need off the disk to reconstruct - * the parity. endio handlers trigger final reconstruction - * when the IO is done. - * - * This is used both for reads from the higher layers and for - * parity construction required to finish a rmw cycle. - */ -static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) -{ - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - struct bio *bio; - - bio_list_init(&bio_list); - - ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; - - atomic_set(&rbio->error, 0); - - ret = recover_assemble_read_bios(rbio, &bio_list); - if (ret < 0) - goto cleanup; - - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * we might have no bios to read just because the pages - * were up to date, or we might have no bios to read because - * the devices were gone. - */ - if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) { - __raid_recover_end_io(rbio); - return 0; - } else { - goto cleanup; - } - } - - /* - * The bioc may be freed once we submit the last bio. Make sure not to - * touch it after that. 
- */
-	atomic_set(&rbio->stripes_pending, bios_to_read);
-	INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
-	while ((bio = bio_list_pop(&bio_list))) {
-		bio->bi_end_io = raid56_bio_end_io;
-
-		if (trace_raid56_scrub_read_recover_enabled()) {
-			struct raid56_bio_trace_info trace_info = { 0 };
-
-			bio_get_trace_info(rbio, bio, &trace_info);
-			trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
-		}
-		submit_bio(bio);
-	}
-
-	return 0;
-
-cleanup:
-	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
-	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
-		rbio_orig_end_io(rbio, BLK_STS_IOERR);
-
-	while ((bio = bio_list_pop(&bio_list)))
-		bio_put(bio);
-
-	return -EIO;
-}
-
 /*
  * the main entry point for reads from the higher layers. This
  * is really only called when the normal read path had a failure,
@@ -2529,7 +2246,7 @@ static void submit_write_bios(struct btrfs_raid_bio *rbio,
 	}
 }
 
-int rmw_rbio(struct btrfs_raid_bio *rbio)
+static int rmw_rbio(struct btrfs_raid_bio *rbio)
 {
 	struct bio_list bio_list;
 	int sectornr;
@@ -2615,12 +2332,29 @@ write:
 	return ret;
 }
 
-static void rmw_work(struct work_struct *work)
+static void rmw_rbio_work(struct work_struct *work)
 {
 	struct btrfs_raid_bio *rbio;
+	int ret;
 
 	rbio = container_of(work, struct btrfs_raid_bio, work);
-	raid56_rmw_stripe(rbio);
+
+	ret = lock_stripe_add(rbio);
+	if (ret == 0) {
+		ret = rmw_rbio(rbio);
+		rbio_orig_end_io(rbio, errno_to_blk_status(ret));
+	}
+}
+
+static void rmw_rbio_work_locked(struct work_struct *work)
+{
+	struct btrfs_raid_bio *rbio;
+	int ret;
+
+	rbio = container_of(work, struct btrfs_raid_bio, work);
+
+	ret = rmw_rbio(rbio);
+	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
 }
 
 /*
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 0e77c77c5dba..445e833fcfcf 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -185,9 +185,4 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
 
-/*
- * Placeholder definition to avoid warning, will be removed when
- * the full write path is migrated.
- */
-int rmw_rbio(struct btrfs_raid_bio *rbio);
 #endif

From cb3450b7d7d0af6ed6ff60e174129938914083ab Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 1 Nov 2022 19:16:10 +0800
Subject: [PATCH 143/249] btrfs: raid56: extract scrub read bio list assembly
 code into a helper

Just like what we did for write/recovery, also extract the read bio
assembly code into a helper for scrub.

The differences between the three are:

- rmw_assemble_read_bios() only submits reads for missing sectors

  Thus it will skip cached sectors, but will also read sectors which are
  not covered by any full stripe. (For cache usage)

- recover_assemble_read_bios() reads every sector which has not failed

- scrub_assemble_read_bios() has an extra check for vertical stripes

  It's mostly the same as rmw_assemble_read_bios(), but will skip sectors
  which are not covered by a vertical stripe.
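To make the shared structure concrete, all three helpers follow roughly
this skeleton (an illustrative sketch only, not the exact body of any of
them; rbio_stripe_sector() is an existing raid56.c helper that is not
part of this series):

   static int assemble_read_bios_sketch(struct btrfs_raid_bio *rbio,
                                        struct bio_list *bio_list)
   {
           struct bio *bio;
           int total_sector_nr;
           int ret = 0;

           for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
                total_sector_nr++) {
                   int stripe = total_sector_nr / rbio->stripe_nsectors;
                   int sectornr = total_sector_nr % rbio->stripe_nsectors;
                   struct sector_ptr *sector;

                   /*
                    * The per-helper skip check goes here: cached sectors
                    * for RMW, failed sectors for recovery, sectors not
                    * covered by a vertical stripe for scrub.
                    */

                   sector = rbio_stripe_sector(rbio, stripe, sectornr);
                   ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
                                            sectornr, REQ_OP_READ);
                   if (ret)
                           goto error;
           }
           return 0;
   error:
           /* Put any bios already assembled before returning the error. */
           while ((bio = bio_list_pop(bio_list)))
                   bio_put(bio);
           return ret;
   }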
Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/raid56.c | 44 +++++++++++++++++++++++++++++++-------------
 1 file changed, 31 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index ffedbfde95e0..fcac70ff5a78 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -2707,21 +2707,15 @@ static void raid56_parity_scrub_end_io_work(struct work_struct *work)
 	validate_rbio_for_parity_scrub(rbio);
 }
 
-static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
+static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio,
+				    struct bio_list *bio_list)
 {
-	int bios_to_read = 0;
-	struct bio_list bio_list;
-	int ret;
-	int total_sector_nr;
 	struct bio *bio;
+	int total_sector_nr;
+	int ret = 0;
 
-	bio_list_init(&bio_list);
+	ASSERT(bio_list_size(bio_list) == 0);
 
-	ret = alloc_rbio_essential_pages(rbio);
-	if (ret)
-		goto cleanup;
-
-	atomic_set(&rbio->error, 0);
 	/* Build a list of bios to read all the missing parts. */
 	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
 	     total_sector_nr++) {
@@ -2750,11 +2744,35 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
 		if (sector->uptodate)
 			continue;
 
-		ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
+		ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
 					 sectornr, REQ_OP_READ);
 		if (ret)
-			goto cleanup;
+			goto error;
 	}
+	return 0;
+error:
+	while ((bio = bio_list_pop(bio_list)))
+		bio_put(bio);
+	return ret;
+}
+
+static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
+{
+	int bios_to_read = 0;
+	struct bio_list bio_list;
+	int ret;
+	struct bio *bio;
+
+	bio_list_init(&bio_list);
+
+	ret = alloc_rbio_essential_pages(rbio);
+	if (ret)
+		goto cleanup;
+
+	atomic_set(&rbio->error, 0);
+	ret = scrub_assemble_read_bios(rbio, &bio_list);
+	if (ret < 0)
+		goto cleanup;
 
 	bios_to_read = bio_list_size(&bio_list);
 	if (!bios_to_read) {

From 6bfd0133bee27737db415c530617cb015274d21f Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 1 Nov 2022 19:16:11 +0800
Subject: [PATCH 144/249] btrfs: raid56: switch scrub path to use a single
 function

This switch involves the following changes:

- Make finish_parity_scrub() only submit the write bios

  It will no longer call rbio_orig_end_io(), and now it will return an
  error.

- Add a new helper, recover_scrub_rbio(), to handle recovery

  It's just doing extra scrub related checks, and then calls
  recover_sectors().

- Rename raid56_parity_scrub_stripe() to scrub_rbio()
- Rename scrub_parity_work() to scrub_rbio_work_locked()

  To follow the existing naming scheme.
- Delete unused functions Including: * finish_rmw() * raid_write_end_io() * raid56_bio_end_io() * __raid_recover_end_io() Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 401 ++++++++++------------------------------------ 1 file changed, 81 insertions(+), 320 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index fcac70ff5a78..629237102189 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -64,7 +64,6 @@ struct sector_ptr { unsigned int uptodate:8; }; -static noinline void finish_rmw(struct btrfs_raid_bio *rbio); static void rmw_rbio_work(struct work_struct *work); static void rmw_rbio_work_locked(struct work_struct *work); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); @@ -72,9 +71,8 @@ static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); -static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - int need_check); -static void scrub_parity_work(struct work_struct *work); +static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); +static void scrub_rbio_work_locked(struct work_struct *work); static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) { @@ -819,7 +817,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) start_async_work(next, rmw_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { steal_rbio(rbio, next); - start_async_work(next, scrub_parity_work); + start_async_work(next, scrub_rbio_work_locked); } goto done_nolock; @@ -880,35 +878,6 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) rbio_endio_bio_list(extra, err); } -/* - * end io function used by finish_rmw. When we finally - * get here, we've written a full stripe - */ -static void raid_write_end_io(struct bio *bio) -{ - struct btrfs_raid_bio *rbio = bio->bi_private; - blk_status_t err = bio->bi_status; - int max_errors; - - if (err) - fail_bio_stripe(rbio, bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; - - err = BLK_STS_OK; - - /* OK, we have read all the stripes we need to. */ - max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? - 0 : rbio->bioc->max_errors; - if (atomic_read(&rbio->error) > max_errors) - err = BLK_STS_IOERR; - - rbio_orig_end_io(rbio, err); -} - /* * Get a sector pointer specified by its @stripe_nr and @sector_nr. * @@ -1319,87 +1288,6 @@ error: return -EIO; } -/* - * this is called from one of two situations. We either - * have a full stripe from the higher layers, or we've read all - * the missing bits off disk. - * - * This will calculate the parity and then send down any - * changed blocks. - */ -static noinline void finish_rmw(struct btrfs_raid_bio *rbio) -{ - /* The total sector number inside the full stripe. */ - /* Sector number inside a stripe. */ - int sectornr; - struct bio_list bio_list; - struct bio *bio; - int ret; - - bio_list_init(&bio_list); - - /* We should have at least one data sector. */ - ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); - - /* at this point we either have a full stripe, - * or we've read the full stripe from the drive. - * recalculate the parity and write the new results. - * - * We're not allowed to add any new bios to the - * bio list here, anyone else that wants to - * change this stripe needs to do their own rmw. 
- */ - spin_lock_irq(&rbio->bio_list_lock); - set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); - - atomic_set(&rbio->error, 0); - - /* - * now that we've set rmw_locked, run through the - * bio list one last time and map the page pointers - * - * We don't cache full rbios because we're assuming - * the higher layers are unlikely to use this area of - * the disk again soon. If they do use it again, - * hopefully they will send another full bio. - */ - index_rbio_pages(rbio); - if (!rbio_is_full(rbio)) - cache_rbio_pages(rbio); - else - clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) - generate_pq_vertical(rbio, sectornr); - - ret = rmw_assemble_write_bios(rbio, &bio_list); - if (ret < 0) - goto cleanup; - - atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); - BUG_ON(atomic_read(&rbio->stripes_pending) == 0); - - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_write_end_io; - - if (trace_raid56_write_stripe_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_write_stripe(rbio, bio, &trace_info); - } - submit_bio(bio); - } - return; - -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); -} - /* * helper to find the stripe number for a given bio. Used to figure out which * stripe has failed. This expects the bio to correspond to a physical disk, @@ -1568,22 +1456,6 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio, } } -static void raid56_bio_end_io(struct bio *bio) -{ - struct btrfs_raid_bio *rbio = bio->bi_private; - - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(rbio, bio); - - bio_put(bio); - - if (atomic_dec_and_test(&rbio->stripes_pending)) - queue_work(rbio->bioc->fs_info->endio_raid56_workers, - &rbio->end_io_work); -} - static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { @@ -1968,60 +1840,6 @@ out: return ret; } -/* - * all parity reconstruction happens here. We've read in everything - * we can find from the drives and this does the heavy lifting of - * sorting the good from the bad. - */ -static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) -{ - int ret; - - ret = recover_sectors(rbio); - - /* - * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a - * valid rbio which is consistent with ondisk content, thus such a - * valid rbio can be cached to avoid further disk reads. - */ - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { - /* - * - In case of two failures, where rbio->failb != -1: - * - * Do not cache this rbio since the above read reconstruction - * (raid6_datap_recov() or raid6_2data_recov()) may have - * changed some content of stripes which are not identical to - * on-disk content any more, otherwise, a later write/recover - * may steal stripe_pages from this rbio and end up with - * corruptions or rebuild failures. - * - * - In case of single failure, where rbio->failb == -1: - * - * Cache this rbio iff the above read reconstruction is - * executed without problems. 
- */ - if (!ret && rbio->failb < 0) - cache_rbio_pages(rbio); - else - clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } else if (!ret) { - rbio->faila = -1; - rbio->failb = -1; - - if (rbio->operation == BTRFS_RBIO_WRITE) - finish_rmw(rbio); - else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) - finish_parity_scrub(rbio, 0); - else - BUG(); - } else { - rbio_orig_end_io(rbio, errno_to_blk_status(ret)); - } -} - static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, struct bio_list *bio_list) { @@ -2449,8 +2267,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) return 0; } -static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - int need_check) +static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { struct btrfs_io_context *bioc = rbio->bioc; const u32 sectorsize = bioc->fs_info->sectorsize; @@ -2493,7 +2310,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, p_sector.page = alloc_page(GFP_NOFS); if (!p_sector.page) - goto cleanup; + return -ENOMEM; p_sector.pgoff = 0; p_sector.uptodate = 1; @@ -2503,7 +2320,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (!q_sector.page) { __free_page(p_sector.page); p_sector.page = NULL; - goto cleanup; + return -ENOMEM; } q_sector.pgoff = 0; q_sector.uptodate = 1; @@ -2590,33 +2407,13 @@ writeback: } submit_write: - nr_data = bio_list_size(&bio_list); - if (!nr_data) { - /* Every parity is right */ - rbio_orig_end_io(rbio, BLK_STS_OK); - return; - } - - atomic_set(&rbio->stripes_pending, nr_data); - - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_write_end_io; - - if (trace_raid56_scrub_write_stripe_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_write_stripe(rbio, bio, &trace_info); - } - submit_bio(bio); - } - return; + submit_write_bios(rbio, &bio_list); + return 0; cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - while ((bio = bio_list_pop(&bio_list))) bio_put(bio); + return ret; } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2626,85 +2423,51 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) return 0; } -/* - * While we're doing the parity check and repair, we could have errors - * in reading pages off the disk. This checks for errors and if we're - * not able to read the page it'll trigger parity reconstruction. The - * parity scrub will be finished after we've reconstructed the failed - * stripes - */ -static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) +static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) { - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - goto cleanup; + int dfail = 0, failp = -1; + int ret; - if (rbio->faila >= 0 || rbio->failb >= 0) { - int dfail = 0, failp = -1; + /* No error case should be already handled by the caller. */ + ASSERT(rbio->faila >= 0 || rbio->failb >= 0); - if (is_data_stripe(rbio, rbio->faila)) - dfail++; - else if (is_parity_stripe(rbio->faila)) - failp = rbio->faila; + if (is_data_stripe(rbio, rbio->faila)) + dfail++; + else if (is_parity_stripe(rbio->faila)) + failp = rbio->faila; - if (is_data_stripe(rbio, rbio->failb)) - dfail++; - else if (is_parity_stripe(rbio->failb)) - failp = rbio->failb; - - /* - * Because we can not use a scrubbing parity to repair - * the data, so the capability of the repair is declined. 
- * (In the case of RAID5, we can not repair anything) - */ - if (dfail > rbio->bioc->max_errors - 1) - goto cleanup; - - /* - * If all data is good, only parity is correctly, just - * repair the parity. - */ - if (dfail == 0) { - finish_parity_scrub(rbio, 0); - return; - } - - /* - * Here means we got one corrupted data stripe and one - * corrupted parity on RAID6, if the corrupted parity - * is scrubbing parity, luckily, use the other one to repair - * the data, or we can not repair the data stripe. - */ - if (failp != rbio->scrubp) - goto cleanup; - - __raid_recover_end_io(rbio); - } else { - finish_parity_scrub(rbio, 1); - } - return; - -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); -} - -/* - * end io for the read phase of the rmw cycle. All the bios here are physical - * stripe bios we've read from the disk so we can recalculate the parity of the - * stripe. - * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid56_parity_scrub_end_io_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); + if (is_data_stripe(rbio, rbio->failb)) + dfail++; + else if (is_parity_stripe(rbio->failb)) + failp = rbio->failb; /* - * This will normally call finish_rmw to start our write, but if there - * are any failed stripes we'll reconstruct from parity first + * Because we can not use a scrubbing parity to repair + * the data, so the capability of the repair is declined. + * (In the case of RAID5, we can not repair anything) */ - validate_rbio_for_parity_scrub(rbio); + if (dfail > rbio->bioc->max_errors - 1) + return -EIO; + + /* + * If all data is good, only parity is correctly, just + * repair the parity. + */ + if (dfail == 0) + return 0; + + /* + * Here means we got one corrupted data stripe and one + * corrupted parity on RAID6, if the corrupted parity + * is scrubbing parity, luckily, use the other one to repair + * the data, or we can not repair the data stripe. + */ + if (failp != rbio->scrubp) + return -EIO; + + /* We have some corrupted sectors, need to repair them. */ + ret = recover_sectors(rbio); + return ret; } static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, @@ -2756,9 +2519,9 @@ error: return ret; } -static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) +static int scrub_rbio(struct btrfs_raid_bio *rbio) { - int bios_to_read = 0; + bool need_check = false; struct bio_list bio_list; int ret; struct bio *bio; @@ -2774,61 +2537,59 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) if (ret < 0) goto cleanup; - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * this can happen if others have merged with - * us, it means there is nothing left to read. - * But if there are missing devices it may not be - * safe to do the full stripe write yet. - */ + submit_read_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { + ret = -EIO; + goto cleanup; + } + /* + * No error during read, can finish the scrub and need to verify the + * P/Q sectors; + */ + if (atomic_read(&rbio->error) == 0) { + need_check = true; goto finish; } + /* We have some failures, need to recover the failed sectors first. */ + ret = recover_scrub_rbio(rbio); + if (ret < 0) + goto cleanup; + +finish: /* - * The bioc may be freed once we submit the last bio. 
Make sure not to
-	 * touch it after that.
+	 * We have every sector properly prepared. Can finish the scrub
+	 * and writeback the good content.
 	 */
-	atomic_set(&rbio->stripes_pending, bios_to_read);
-	INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work);
-	while ((bio = bio_list_pop(&bio_list))) {
-		bio->bi_end_io = raid56_bio_end_io;
-
-		if (trace_raid56_scrub_read_enabled()) {
-			struct raid56_bio_trace_info trace_info = { 0 };
-
-			bio_get_trace_info(rbio, bio, &trace_info);
-			trace_raid56_scrub_read(rbio, bio, &trace_info);
-		}
-		submit_bio(bio);
-	}
-	/* the actual write will happen once the reads are done */
-	return;
+	ret = finish_parity_scrub(rbio, need_check);
+	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
+		ret = -EIO;
+	return ret;
 
 cleanup:
-	rbio_orig_end_io(rbio, BLK_STS_IOERR);
-
 	while ((bio = bio_list_pop(&bio_list)))
 		bio_put(bio);
-
-	return;
-
-finish:
-	validate_rbio_for_parity_scrub(rbio);
+	return ret;
 }
 
-static void scrub_parity_work(struct work_struct *work)
+static void scrub_rbio_work_locked(struct work_struct *work)
 {
 	struct btrfs_raid_bio *rbio;
+	int ret;
 
 	rbio = container_of(work, struct btrfs_raid_bio, work);
-	raid56_parity_scrub_stripe(rbio);
+	ret = scrub_rbio(rbio);
+	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
 }
 
 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
 {
 	if (!lock_stripe_add(rbio))
-		start_async_work(rbio, scrub_parity_work);
+		start_async_work(rbio, scrub_rbio_work_locked);
 }
 
 /* The following code is used for dev replace of a missing RAID 5/6 device. */

From 1a1a285139707f2430b3833103e152f07c66e63e Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Tue, 1 Nov 2022 19:16:12 +0800
Subject: [PATCH 145/249] btrfs: remove the unused endio_raid56_workers and
 btrfs_raid_bio::end_io_work

Since we have switched all raid56 workloads to the submit-and-wait
method, there is no use for the btrfs_fs_info::endio_raid56_workers
workqueue and btrfs_raid_bio::end_io_work.

Remove them to save some memory.
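For reference, the submit-and-wait pattern that made these end I/O work
items unnecessary looks roughly like the sketch below (condensed from the
rmw_rbio()/scrub_rbio() code above; the read-side end I/O handler is
assumed to mirror raid_wait_write_end_io() shown earlier):

   /* End I/O: account the bio and wake the sleeping submitter. */
   static void raid_wait_read_end_io_sketch(struct bio *bio)
   {
           struct btrfs_raid_bio *rbio = bio->bi_private;

           if (bio->bi_status)
                   fail_bio_stripe(rbio, bio);
           else
                   set_bio_pages_uptodate(rbio, bio);

           bio_put(bio);
           if (atomic_dec_and_test(&rbio->stripes_pending))
                   wake_up(&rbio->io_wait);
   }

   /* Submitter: no workqueue hand-off, just sleep until all bios finish. */
   submit_read_bios(rbio, &bio_list);
   wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);

Since the submitting thread now sleeps on rbio->io_wait instead of being
resumed from a btrfs-endio-raid56 work item, the dedicated workqueue has
no users left.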
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 +----- fs/btrfs/fs.h | 1 - fs/btrfs/raid56.h | 2 -- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f5f793af12a0..e5687bdff150 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2097,8 +2097,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->workers); if (fs_info->endio_workers) destroy_workqueue(fs_info->endio_workers); - if (fs_info->endio_raid56_workers) - destroy_workqueue(fs_info->endio_raid56_workers); if (fs_info->rmw_workers) destroy_workqueue(fs_info->rmw_workers); if (fs_info->compressed_write_workers) @@ -2304,8 +2302,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) alloc_workqueue("btrfs-endio", flags, max_active); fs_info->endio_meta_workers = alloc_workqueue("btrfs-endio-meta", flags, max_active); - fs_info->endio_raid56_workers = - alloc_workqueue("btrfs-endio-raid56", flags, max_active); fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, @@ -2327,7 +2323,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->compressed_write_workers && - fs_info->endio_write_workers && fs_info->endio_raid56_workers && + fs_info->endio_write_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->fixup_workers && fs_info->delayed_workers && fs_info->qgroup_rescan_workers && diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index c7f2a512fba2..a749367e5ae2 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -532,7 +532,6 @@ struct btrfs_fs_info { struct btrfs_workqueue *flush_workers; struct workqueue_struct *endio_workers; struct workqueue_struct *endio_meta_workers; - struct workqueue_struct *endio_raid56_workers; struct workqueue_struct *rmw_workers; struct workqueue_struct *compressed_write_workers; struct btrfs_workqueue *endio_write_workers; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 445e833fcfcf..9fae97b7a2a5 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -97,8 +97,6 @@ struct btrfs_raid_bio { wait_queue_head_t io_wait; - struct work_struct end_io_work; - /* Bitmap to record which horizontal stripe has data */ unsigned long dbitmap; From 61ce908a3c260fbe37826a6cbd56636abeffcd28 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:41 +0000 Subject: [PATCH 146/249] btrfs: send: avoid unnecessary path allocations when finding extent clone When looking for an extent clone, at find_extent_clone(), we start by allocating a path and then check for cases where we can't have clones and exit immediately in those cases. It's a waste of time to allocate the path before those cases, so reorder the logic so that we check for those cases before allocating the path. 
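The shape of the change is the usual "fail fast before allocating"
pattern; a minimal standalone illustration (hypothetical names, not the
actual btrfs code, which is in the diff below):

   int find_something(const struct ctx *ctx)
   {
           struct path *path;

           /* Reject the cheap, common failure cases first. */
           if (!search_is_possible(ctx))
                   return 0;

           /* Only pay for the allocation once we know it is needed. */
           path = alloc_path();
           if (!path)
                   return -ENOMEM;

           /* ... do the actual search using the path ... */

           free_path(path);
           return 1;
   }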
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3befc0d2d866..d1388d5dc951 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1365,40 +1365,35 @@ static int find_extent_clone(struct send_ctx *sctx, int compressed; u32 i; - tmp_path = alloc_path_for_send(); - if (!tmp_path) - return -ENOMEM; - - /* We only use this path under the commit sem */ - tmp_path->need_commit_sem = 0; - if (data_offset >= ino_size) { /* * There may be extents that lie behind the file's size. * I at least had this in combination with snapshotting while * writing large files. */ - ret = 0; - goto out; + return 0; } - fi = btrfs_item_ptr(eb, path->slots[0], - struct btrfs_file_extent_item); + fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(eb, fi); - if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - ret = -ENOENT; - goto out; - } - compressed = btrfs_file_extent_compression(eb, fi); + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + return -ENOENT; - num_bytes = btrfs_file_extent_num_bytes(eb, fi); disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte == 0) { - ret = -ENOENT; - goto out; - } + if (disk_byte == 0) + return -ENOENT; + + compressed = btrfs_file_extent_compression(eb, fi); + num_bytes = btrfs_file_extent_num_bytes(eb, fi); logical = disk_byte + btrfs_file_extent_offset(eb, fi); + tmp_path = alloc_path_for_send(); + if (!tmp_path) + return -ENOMEM; + + /* We only use this path under the commit sem */ + tmp_path->need_commit_sem = 0; + down_read(&fs_info->commit_root_sem); ret = extent_from_logical(fs_info, disk_byte, tmp_path, &found_key, &flags); From d3f41317f0fedac6b0a3de68b3dab66b71923dd8 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:42 +0000 Subject: [PATCH 147/249] btrfs: send: update comment at find_extent_clone() We have this unclear comment at find_extent_clone() about extents starting at a file offset greater than or equals to the i_size of the inode. It's not really informative and it's misleading, since it mentions the author found such extents with snapshots and large files. Such extents are a result of fallocate with FALLOC_FL_KEEP_SIZE and there is no relation to snapshots or large files (all write paths update the i_size before inserting a new file extent item). So update the comment to be precise about it and why we don't bother looking for clone sources in that case. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d1388d5dc951..4ce5c154f6d7 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1365,14 +1365,14 @@ static int find_extent_clone(struct send_ctx *sctx, int compressed; u32 i; - if (data_offset >= ino_size) { - /* - * There may be extents that lie behind the file's size. - * I at least had this in combination with snapshotting while - * writing large files. - */ + /* + * With fallocate we can get prealloc extents beyond the inode's i_size, + * so we don't do anything here because clone operations can not clone + * to a range beyond i_size without increasing the i_size of the + * destination inode. 
+	 */
+	if (data_offset >= ino_size)
 		return 0;
-	}
 
 	fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item);
 	extent_type = btrfs_file_extent_type(eb, fi);

From 344174a1a68a55599d3a264db47d96583ff66473 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 1 Nov 2022 16:15:43 +0000
Subject: [PATCH 148/249] btrfs: send: drop unnecessary backref context field
 initializations

At find_extent_clone() we are initializing to zero the 'found_itself' and
'found' fields of the backref context before we use it, but we have
already initialized the structure to zeroes when we declared it on stack,
so it's pointless to initialize those fields and they are unnecessarily
increasing the object text size with two "mov" instructions (x86_64).

Similarly make the 'extent_len' initialization clearer by using an
if-then-else instead of a double assignment to it in case the extent's
end crosses the i_size boundary.

Before this change:

   $ size fs/btrfs/send.o
      text    data     bss     dec     hex filename
     68694    4252      16   72962   11d02 fs/btrfs/send.o

After this change:

   $ size fs/btrfs/send.o
      text    data     bss     dec     hex filename
     68678    4252      16   72946   11cf2 fs/btrfs/send.o

Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/send.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 4ce5c154f6d7..7d289bdc6de1 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1432,11 +1432,8 @@ static int find_extent_clone(struct send_ctx *sctx,
 	}
 
 	backref_ctx.sctx = sctx;
-	backref_ctx.found = 0;
 	backref_ctx.cur_objectid = ino;
 	backref_ctx.cur_offset = data_offset;
-	backref_ctx.found_itself = 0;
-	backref_ctx.extent_len = num_bytes;
 
 	/*
 	 * The last extent of a file may be too large due to page alignment.
@@ -1445,6 +1442,8 @@ static int find_extent_clone(struct send_ctx *sctx,
 	 */
 	if (data_offset + num_bytes >= ino_size)
 		backref_ctx.extent_len = ino_size - data_offset;
+	else
+		backref_ctx.extent_len = num_bytes;
 
 	/*
 	 * Now collect all backrefs.

From 22a3c0ac8ed0043af209a15928ae4c4855b0a4c4 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 1 Nov 2022 16:15:44 +0000
Subject: [PATCH 149/249] btrfs: send: avoid unnecessary backref lookups when
 finding clone source

At find_extent_clone(), unless we are given an inline extent, a file
extent item that represents a hole or an extent that starts beyond the
i_size, we always do backref walking to look for clone sources, unless
we have more than SEND_MAX_EXTENT_REFS (64) known references on the
extent.

However if we know we only have one reference in the extent item and only
one clone source (the send root), then it's pointless to do the backref
walking to search for clone sources, as we can't clone from any other
root. So skip the backref walking in that case.

The following test was run on a non-debug kernel (Debian's default kernel
config):

   $ cat test.sh
   #!/bin/bash

   DEV=/dev/sdi
   MNT=/mnt/sdi

   mkfs.btrfs -f $DEV
   mount $DEV $MNT

   # Create an extent tree that's not too small and none of the
   # extents is shared.
   for ((i = 1; i <= 50000; i++)); do
       xfs_io -f -c "pwrite 0 4K" $MNT/file_$i > /dev/null
       echo -ne "\r$i files created..."
   done
   echo

   btrfs subvolume snapshot -r $MNT $MNT/snap

   start=$(date +%s%N)
   btrfs send $MNT/snap > /dev/null
   end=$(date +%s%N)

   dur=$(( (end - start) / 1000000 ))
   echo -e "\nsend took $dur milliseconds"

   umount $MNT

Before this change:

   send took 5389 milliseconds

After this change:

   send took 4519 milliseconds (-16.1%)

Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/send.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 7d289bdc6de1..d9e01b66b32b 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1354,6 +1354,7 @@ static int find_extent_clone(struct send_ctx *sctx,
 	u64 disk_byte;
 	u64 num_bytes;
 	u64 extent_item_pos;
+	u64 extent_refs;
 	u64 flags = 0;
 	struct btrfs_file_extent_item *fi;
 	struct extent_buffer *eb = path->nodes[0];
@@ -1408,14 +1409,22 @@ static int find_extent_clone(struct send_ctx *sctx,
 
 	ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
 			    struct btrfs_extent_item);
+	extent_refs = btrfs_extent_refs(tmp_path->nodes[0], ei);
 	/*
 	 * Backreference walking (iterate_extent_inodes() below) is currently
 	 * too expensive when an extent has a large number of references, both
 	 * in time spent and used memory. So for now just fallback to write
 	 * operations instead of clone operations when an extent has more than
 	 * a certain amount of references.
+	 *
+	 * Also, if we have only one reference and only the send root as a clone
+	 * source - meaning no clone roots were given in the struct
+	 * btrfs_ioctl_send_args passed to the send ioctl - then it's our
+	 * reference and there's no point in doing backref walking which is
+	 * expensive, so exit early.
 	 */
-	if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
+	if ((extent_refs == 1 && sctx->clone_roots_cnt == 1) ||
+	    extent_refs > SEND_MAX_EXTENT_REFS) {
 		ret = -ENOENT;
 		goto out;
 	}

From c7499a64dcf62bf27968b0e9cce353c490b9ada1 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 1 Nov 2022 16:15:45 +0000
Subject: [PATCH 150/249] btrfs: send: optimize clone detection to increase
 extent sharing

Currently send does not make the best decisions when it comes to deciding
between multiple clone sources, which results in clone operations for
partial extent ranges, which has the following disadvantages:

1) We get fewer shared extents at the destination;

2) We have to read more data during the send operation and emit more
   write commands.

Besides not being optimal behaviour, it also breaks user expectations and
is often reported by users, with a recent example in the Link tag at the
bottom of this change log.

Part of the reason for this non-optimal behaviour is that the backref
walking code does not provide information about the length of the file
extent items that were found for each backref, so send is blind about
which backref is the best to choose as a cloning source.

The other existing reasons are just silliness, namely always preferring
the inode with the lowest number when multiple are found for the same
root, and, when we can clone from multiple roots, always preferring the
send root over any of the other clone roots. This does not make any sense
since any inode or root is fine and as good as any other inode/root.

Fix this by making backref walking pass information about the number of
bytes referenced by each file extent item and then have send's backref
callback pick the inode with the highest number of bytes for each root.
Finally, select the root from which we can clone the most bytes.
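Condensed, the new selection is a two-level maximum (the complete version
is in the diff below): the backref callback keeps, for each clone root,
the reference covering the most bytes, and find_extent_clone() then picks
the root offering the largest such reference:

   /* In the backref callback, per clone root: */
   if (num_bytes > found->num_bytes) {
           found->ino = ino;
           found->offset = offset;
           found->num_bytes = num_bytes;
   }

   /* In find_extent_clone(), across all clone roots: */
   if (!cur_clone_root ||
       clone_root->num_bytes > cur_clone_root->num_bytes)
           cur_clone_root = clone_root;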
Example reproducer: $ cat test.sh #!/bin/bash DEV=/dev/sdi MNT=/mnt/sdi mkfs.btrfs -f $DEV mount $DEV $MNT xfs_io -f -c "pwrite -S 0xab -b 2M 0 2M" $MNT/foo cp --reflink=always $MNT/foo $MNT/bar cp --reflink=always $MNT/foo $MNT/baz sync # Overwrite the second half of file foo. xfs_io -c "pwrite -S 0xcd -b 1M 1M 1M" $MNT/foo sync echo echo "*** fiemap in the original filesystem ***" echo xfs_io -c "fiemap -v" $MNT/foo xfs_io -c "fiemap -v" $MNT/bar xfs_io -c "fiemap -v" $MNT/baz echo btrfs filesystem du $MNT btrfs subvolume snapshot -r $MNT $MNT/snap btrfs send -f /tmp/send_stream $MNT/snap umount $MNT mkfs.btrfs -f $DEV &> /dev/null mount $DEV $MNT btrfs receive -f /tmp/send_stream $MNT echo echo "*** fiemap in the new filesystem ***" echo xfs_io -r -c "fiemap -v" $MNT/snap/foo xfs_io -r -c "fiemap -v" $MNT/snap/bar xfs_io -r -c "fiemap -v" $MNT/snap/baz echo btrfs filesystem du $MNT rm -f /tmp/send_stream rm -f /tmp/snap.fssum umount $MNT Before this change: $ ./test.sh (...) *** fiemap in the original filesystem *** /mnt/sdi/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x1 /mnt/sdi/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 /mnt/sdi/baz: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 Total Exclusive Set shared Filename 2.00MiB 1.00MiB - /mnt/sdi/foo 2.00MiB 0.00B - /mnt/sdi/bar 2.00MiB 0.00B - /mnt/sdi/baz 6.00MiB 1.00MiB 2.00MiB /mnt/sdi Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap' At subvol /mnt/sdi/snap At subvol snap *** fiemap in the new filesystem *** /mnt/sdi/snap/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 /mnt/sdi/snap/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x1 /mnt/sdi/snap/baz: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 32768..34815 2048 0x1 Total Exclusive Set shared Filename 2.00MiB 0.00B - /mnt/sdi/snap/foo 2.00MiB 1.00MiB - /mnt/sdi/snap/bar 2.00MiB 1.00MiB - /mnt/sdi/snap/baz 6.00MiB 2.00MiB - /mnt/sdi/snap 6.00MiB 2.00MiB 2.00MiB /mnt/sdi We end up with two 1M extents that are not shared for files bar and baz. After this change: $ ./test.sh (...) 
*** fiemap in the original filesystem *** /mnt/sdi/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x1 /mnt/sdi/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 /mnt/sdi/baz: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 Total Exclusive Set shared Filename 2.00MiB 1.00MiB - /mnt/sdi/foo 2.00MiB 0.00B - /mnt/sdi/bar 2.00MiB 0.00B - /mnt/sdi/baz 6.00MiB 1.00MiB 2.00MiB /mnt/sdi Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap' At subvol /mnt/sdi/snap At subvol snap *** fiemap in the new filesystem *** /mnt/sdi/snap/foo: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..4095]: 26624..30719 4096 0x2001 /mnt/sdi/snap/bar: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x2001 /mnt/sdi/snap/baz: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..2047]: 26624..28671 2048 0x2000 1: [2048..4095]: 30720..32767 2048 0x2001 Total Exclusive Set shared Filename 2.00MiB 0.00B - /mnt/sdi/snap/foo 2.00MiB 0.00B - /mnt/sdi/snap/bar 2.00MiB 0.00B - /mnt/sdi/snap/baz 6.00MiB 0.00B - /mnt/sdi/snap 6.00MiB 0.00B 3.00MiB /mnt/sdi Now there's a much better sharing, files bar and baz share 1M of the extent of file foo and the second extent of files bar and baz is shared between themselves. This will later be turned into a test case for fstests. Link: https://lore.kernel.org/linux-btrfs/20221008005704.795b44b0@crass-HP-ZBook-15-G2/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 9 +++++---- fs/btrfs/backref.h | 4 ++-- fs/btrfs/scrub.c | 4 ++-- fs/btrfs/send.c | 49 +++++++++++++++++++++++++++++++--------------- 4 files changed, 42 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 04cb608e7cfb..9be11a342de5 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -27,6 +27,7 @@ struct extent_inode_elem { u64 inum; u64 offset; + u64 num_bytes; struct extent_inode_elem *next; }; @@ -37,6 +38,7 @@ static int check_extent_in_eb(const struct btrfs_key *key, struct extent_inode_elem **eie, bool ignore_offset) { + const u64 data_len = btrfs_file_extent_num_bytes(eb, fi); u64 offset = 0; struct extent_inode_elem *e; @@ -45,10 +47,8 @@ static int check_extent_in_eb(const struct btrfs_key *key, !btrfs_file_extent_encryption(eb, fi) && !btrfs_file_extent_other_encoding(eb, fi)) { u64 data_offset; - u64 data_len; data_offset = btrfs_file_extent_offset(eb, fi); - data_len = btrfs_file_extent_num_bytes(eb, fi); if (extent_item_pos < data_offset || extent_item_pos >= data_offset + data_len) @@ -63,6 +63,7 @@ static int check_extent_in_eb(const struct btrfs_key *key, e->next = *eie; e->inum = key->objectid; e->offset = key->offset + offset; + e->num_bytes = data_len; *eie = e; return 0; @@ -2265,7 +2266,7 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, "ref for %llu resolved, key (%llu EXTEND_DATA %llu), root %llu", extent_item_objectid, eie->inum, eie->offset, root); - ret = iterate(eie->inum, eie->offset, root, ctx); + ret = iterate(eie->inum, eie->offset, eie->num_bytes, root, ctx); if (ret) { btrfs_debug(fs_info, "stopping iteration for %llu due to ret=%d", @@ -2357,7 +2358,7 @@ out: return ret; } -static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) +static int build_ino_list(u64 inum, u64 offset, u64 num_bytes, u64 root, void *ctx) { struct btrfs_data_container *inodes = ctx; const size_t c = 3 * sizeof(u64); 
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 2dd68f87dca5..8d3598155f3b 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -75,8 +75,8 @@ struct btrfs_backref_share_check_ctx { int prev_extents_cache_slot; }; -typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, - void *ctx); +typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *ctx); struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void); void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 8c89e63ef2e8..3c22573bfe0d 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -809,8 +809,8 @@ nomem: return ERR_PTR(-ENOMEM); } -static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, - void *warn_ctx) +static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *warn_ctx) { u32 nlink; int ret; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d9e01b66b32b..49759cd9eecb 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -76,7 +76,7 @@ struct clone_root { struct btrfs_root *root; u64 ino; u64 offset; - + u64 num_bytes; u64 found_refs; }; @@ -1273,7 +1273,8 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2) * Called for every backref that is found for the current extent. * Results are collected in sctx->clone_roots->ino/offset/found_refs */ -static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) +static int __iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root, + void *ctx_) { struct backref_ctx *bctx = ctx_; struct clone_root *found; @@ -1318,15 +1319,17 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) bctx->found++; found->found_refs++; - if (ino < found->ino) { + + /* + * If the given backref refers to a file extent item with a larger + * number of bytes than what we found before, use the new one so that + * we clone more optimally and end up doing less writes and getting + * less exclusive, non-shared extents at the destination. + */ + if (num_bytes > found->num_bytes) { found->ino = ino; found->offset = offset; - } else if (found->ino == ino) { - /* - * same extent found more then once in the same file. - */ - if (found->offset > offset + bctx->extent_len) - found->offset = offset; + found->num_bytes = num_bytes; } return 0; @@ -1437,6 +1440,7 @@ static int find_extent_clone(struct send_ctx *sctx, cur_clone_root = sctx->clone_roots + i; cur_clone_root->ino = (u64)-1; cur_clone_root->offset = 0; + cur_clone_root->num_bytes = 0; cur_clone_root->found_refs = 0; } @@ -1506,14 +1510,27 @@ static int find_extent_clone(struct send_ctx *sctx, cur_clone_root = NULL; for (i = 0; i < sctx->clone_roots_cnt; i++) { - if (sctx->clone_roots[i].found_refs) { - if (!cur_clone_root) - cur_clone_root = sctx->clone_roots + i; - else if (sctx->clone_roots[i].root == sctx->send_root) - /* prefer clones from send_root over others */ - cur_clone_root = sctx->clone_roots + i; - } + struct clone_root *clone_root = &sctx->clone_roots[i]; + if (clone_root->found_refs == 0) + continue; + + /* + * Choose the root from which we can clone more bytes, to + * minimize write operations and therefore have more extent + * sharing at the destination (the same as in the source). 
+		 */
+		if (!cur_clone_root ||
+		    clone_root->num_bytes > cur_clone_root->num_bytes) {
+			cur_clone_root = clone_root;
+
+			/*
+			 * We found an optimal clone candidate (any inode from
+			 * any root is fine), so we're done.
+			 */
+			if (clone_root->num_bytes >= backref_ctx.extent_len)
+				break;
+		}
 	}
 
 	if (cur_clone_root) {

From 6ce6ba534418132f4c727d5707fe2794c797299c Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 1 Nov 2022 16:15:46 +0000
Subject: [PATCH 151/249] btrfs: use a single argument for extent offset in
 backref walking functions

The interface for find_parent_nodes() has two extent offset related
arguments:

1) One u64 pointer argument for the extent offset;

2) One boolean argument to tell if the extent offset should be ignored or
   not.

These are confusing, because the extent offset pointer can be NULL and in
some cases callers pass a NULL value as a way to tell the backref walking
code to ignore offsets in file extent items (and simply consider all file
extent items that point to the target data extent).

The boolean argument was added in commit c995ab3cda3f ("btrfs: add a flag
to iterate_inodes_from_logical to find all extent refs for uncompressed
extents"), but it was never really necessary, as that commit could
instead have found a way to get a NULL value passed to the
"extent_item_pos" argument of find_parent_nodes().

The arguments are also passed to functions called by find_parent_nodes()
and respective helper functions, which further makes everything more
complicated than needed.

Then we have several backref walking related functions that end up calling
find_parent_nodes(), either directly or through some other function that
they call, and for many we have to use an "extent_item_pos" (u64)
argument and a boolean "ignore_offset" argument too.

This is confusing and not really necessary. So use a single argument to
specify the extent offset, as a simple u64 and not as a pointer, but
using a special value of (u64)-1, defined as a documented constant, to
indicate when the extent offset should be ignored.

This is also preparation work for the upcoming patches in the series that
add other arguments to find_parent_nodes() and other related functions
that use it.
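The resulting calling convention is a plain sentinel value instead of a
pointer-plus-boolean pair. A sketch of the two call styles (the constant
name matches this patch; the exact definition lives in backref.h):

   /* Special value meaning "do not filter by extent offset". */
   #define BTRFS_IGNORE_EXTENT_OFFSET ((u64)-1)

   /* Collect references from every file extent item pointing to the
    * data extent (previously done by passing a NULL pointer and/or
    * ignore_offset == true): */
   ret = btrfs_find_all_leafs(trans, fs_info, bytenr, time_seq, &leafs,
                              BTRFS_IGNORE_EXTENT_OFFSET);

   /* Only collect references whose file extent item covers a specific
    * offset inside the data extent: */
   ret = btrfs_find_all_leafs(trans, fs_info, bytenr, time_seq, &leafs,
                              extent_offset);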
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 87 +++++++++++++++++++++---------------------- fs/btrfs/backref.h | 12 ++++-- fs/btrfs/relocation.c | 2 +- fs/btrfs/scrub.c | 2 +- fs/btrfs/send.c | 2 +- 5 files changed, 54 insertions(+), 51 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9be11a342de5..432064ee788e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -35,15 +35,13 @@ static int check_extent_in_eb(const struct btrfs_key *key, const struct extent_buffer *eb, const struct btrfs_file_extent_item *fi, u64 extent_item_pos, - struct extent_inode_elem **eie, - bool ignore_offset) + struct extent_inode_elem **eie) { const u64 data_len = btrfs_file_extent_num_bytes(eb, fi); u64 offset = 0; struct extent_inode_elem *e; - if (!ignore_offset && - !btrfs_file_extent_compression(eb, fi) && + if (!btrfs_file_extent_compression(eb, fi) && !btrfs_file_extent_encryption(eb, fi) && !btrfs_file_extent_other_encoding(eb, fi)) { u64 data_offset; @@ -81,8 +79,7 @@ static void free_inode_elem_list(struct extent_inode_elem *eie) static int find_extent_in_eb(const struct extent_buffer *eb, u64 wanted_disk_byte, u64 extent_item_pos, - struct extent_inode_elem **eie, - bool ignore_offset) + struct extent_inode_elem **eie) { u64 disk_byte; struct btrfs_key key; @@ -111,7 +108,7 @@ static int find_extent_in_eb(const struct extent_buffer *eb, if (disk_byte != wanted_disk_byte) continue; - ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie, ignore_offset); + ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); if (ret < 0) return ret; } @@ -450,9 +447,9 @@ static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct preftrees *preftrees, struct prelim_ref *ref, - int level, u64 time_seq, const u64 *extent_item_pos, - bool ignore_offset) + int level, u64 time_seq, u64 extent_item_pos) { + const bool ignore_offset = (extent_item_pos == BTRFS_IGNORE_EXTENT_OFFSET); int ret = 0; int slot; struct extent_buffer *eb; @@ -528,10 +525,9 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, count++; else goto next; - if (extent_item_pos) { + if (!ignore_offset) { ret = check_extent_in_eb(&key, eb, fi, - *extent_item_pos, - &eie, ignore_offset); + extent_item_pos, &eie); if (ret < 0) break; } @@ -541,7 +537,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie, (void **)&old, GFP_NOFS); if (ret < 0) break; - if (!ret && extent_item_pos) { + if (!ret && !ignore_offset) { while (old->next) old = old->next; old->next = eie; @@ -570,7 +566,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct preftrees *preftrees, struct prelim_ref *ref, struct ulist *parents, - const u64 *extent_item_pos, bool ignore_offset) + u64 extent_item_pos) { struct btrfs_root *root; struct extent_buffer *eb; @@ -664,7 +660,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, } ret = add_all_parents(root, path, parents, preftrees, ref, level, - time_seq, extent_item_pos, ignore_offset); + time_seq, extent_item_pos); out: btrfs_put_root(root); out_free: @@ -712,8 +708,8 @@ static void free_leaf_list(struct ulist *ulist) static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct preftrees *preftrees, - const u64 *extent_item_pos, - struct share_check *sc, bool 
ignore_offset) + u64 extent_item_pos, + struct share_check *sc) { int err; int ret = 0; @@ -756,8 +752,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, goto out; } err = resolve_indirect_ref(fs_info, path, time_seq, preftrees, - ref, parents, extent_item_pos, - ignore_offset); + ref, parents, extent_item_pos); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. @@ -1340,18 +1335,21 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx * * Otherwise this returns 0 for success and <0 for an error. * - * If ignore_offset is set to false, only extent refs whose offsets match - * extent_item_pos are returned. If true, every extent ref is returned - * and extent_item_pos is ignored. + * @extent_item_pos is meaningful only if we are dealing with a data extent. + * If its value is not BTRFS_IGNORE_EXTENT_OFFSET, then only collect references + * from file extent items that refer to a section of the data extent that + * contains @extent_item_pos. If its value is BTRFS_IGNORE_EXTENT_OFFSET then + * collect references for every file extent item that points to the data extent. * * FIXME some caching might speed things up */ static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist *refs, - struct ulist *roots, const u64 *extent_item_pos, - struct share_check *sc, bool ignore_offset) + struct ulist *roots, u64 extent_item_pos, + struct share_check *sc) { + const bool ignore_offset = (extent_item_pos == BTRFS_IGNORE_EXTENT_OFFSET); struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); struct btrfs_key key; struct btrfs_path *path; @@ -1539,7 +1537,7 @@ again: WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, - extent_item_pos, sc, ignore_offset); + extent_item_pos, sc); if (ret) goto out; @@ -1573,8 +1571,7 @@ again: goto out; } if (ref->count && ref->parent) { - if (extent_item_pos && !ref->inode_list && - ref->level == 0) { + if (!ignore_offset && !ref->inode_list && ref->level == 0) { struct extent_buffer *eb; eb = read_tree_block(fs_info, ref->parent, 0, @@ -1592,7 +1589,7 @@ again: if (!path->skip_locking) btrfs_tree_read_lock(eb); ret = find_extent_in_eb(eb, bytenr, - *extent_item_pos, &eie, ignore_offset); + extent_item_pos, &eie); if (!path->skip_locking) btrfs_tree_read_unlock(eb); free_extent_buffer(eb); @@ -1611,7 +1608,7 @@ again: (void **)&eie, GFP_NOFS); if (ret < 0) goto out; - if (!ret && extent_item_pos) { + if (!ret && !ignore_offset) { /* * We've recorded that parent, so we must extend * its inode list here. 
@@ -1665,7 +1662,7 @@ out: int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos, bool ignore_offset) + u64 extent_item_pos) { int ret; @@ -1674,7 +1671,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - *leafs, NULL, extent_item_pos, NULL, ignore_offset); + *leafs, NULL, extent_item_pos, NULL); if (ret < 0 && ret != -ENOENT) { free_leaf_list(*leafs); return ret; @@ -1698,8 +1695,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, */ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, - bool ignore_offset) + u64 time_seq, struct ulist **roots) { struct ulist *tmp; struct ulist_node *node = NULL; @@ -1718,7 +1714,8 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - tmp, *roots, NULL, NULL, ignore_offset); + tmp, *roots, BTRFS_IGNORE_EXTENT_OFFSET, + NULL); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); @@ -1745,8 +1742,7 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, if (!trans && !skip_commit_root_sem) down_read(&fs_info->commit_root_sem); - ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, - time_seq, roots, false); + ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, time_seq, roots); if (!trans && !skip_commit_root_sem) up_read(&fs_info->commit_root_sem); return ret; @@ -1845,7 +1841,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, bool cached; ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, &ctx->refs, - NULL, NULL, &shared, false); + NULL, BTRFS_IGNORE_EXTENT_OFFSET, &shared); if (ret == BACKREF_FOUND_SHARED || ret == BACKREF_FOUND_NOT_SHARED) { /* If shared must return 1, otherwise return 0. 
*/ @@ -2286,8 +2282,7 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, u64 extent_item_pos, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx, - bool ignore_offset) + iterate_extent_inodes_t *iterate, void *ctx) { int ret; struct btrfs_trans_handle *trans = NULL; @@ -2318,16 +2313,14 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, down_read(&fs_info->commit_root_sem); ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - seq_elem.seq, &refs, - &extent_item_pos, ignore_offset); + seq_elem.seq, &refs, extent_item_pos); if (ret) goto out; ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, - seq_elem.seq, &roots, - ignore_offset); + seq_elem.seq, &roots); if (ret) break; ULIST_ITER_INIT(&root_uiter); @@ -2395,10 +2388,14 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return -EINVAL; - extent_item_pos = logical - found_key.objectid; + if (ignore_offset) + extent_item_pos = BTRFS_IGNORE_EXTENT_OFFSET; + else + extent_item_pos = logical - found_key.objectid; + ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, search_commit_root, - build_ino_list, ctx, ignore_offset); + build_ino_list, ctx); return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 8d3598155f3b..2eb99f23cc8f 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -12,6 +12,13 @@ #include "disk-io.h" #include "extent_io.h" +/* + * Pass to backref walking functions to tell them to include references from + * all file extent items that point to the target data extent, regardless if + * they refer to the whole extent or just sections of it (bookend extents). 
+ */ +#define BTRFS_IGNORE_EXTENT_OFFSET ((u64)-1) + struct inode_fs_paths { struct btrfs_path *btrfs_path; struct btrfs_root *fs_root; @@ -92,8 +99,7 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, int iterate_extent_inodes(struct btrfs_fs_info *fs_info, u64 extent_item_objectid, u64 extent_offset, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx, - bool ignore_offset); + iterate_extent_inodes_t *iterate, void *ctx); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, void *ctx, @@ -104,7 +110,7 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **leafs, - const u64 *extent_item_pos, bool ignore_offset); + u64 extent_item_pos); int btrfs_find_all_roots(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist **roots, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d119986d1599..45690f7b5900 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3417,7 +3417,7 @@ int add_data_references(struct reloc_control *rc, btrfs_release_path(path); ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, - 0, &leaves, NULL, true); + 0, &leaves, BTRFS_IGNORE_EXTENT_OFFSET); if (ret < 0) return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3c22573bfe0d..6c7dc89709fc 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -969,7 +969,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) swarn.dev = dev; iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, - scrub_print_warning_inode, &swarn, false); + scrub_print_warning_inode, &swarn); } out: diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 49759cd9eecb..6bf06939f891 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1467,7 +1467,7 @@ static int find_extent_clone(struct send_ctx *sctx, extent_item_pos = 0; ret = iterate_extent_inodes(fs_info, found_key.objectid, extent_item_pos, 1, __iterate_backrefs, - &backref_ctx, false); + &backref_ctx); if (ret < 0) goto out; From a2c8d27e5ee810b7149b42b88ddf7298e5b8dfe0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:47 +0000 Subject: [PATCH 152/249] btrfs: use a structure to pass arguments to backref walking functions The public backref walking functions have quite a lot of arguments that are passed down the call stack to find_parent_nodes(), the core function of the backref walking code. The next patches in the series will need to add even more arguments to these functions, which should be passed not only to find_parent_nodes(), but also to other functions used by the latter (directly or even lower in the call stack). So create a structure to hold all these arguments and state used by the main backref walking function, find_parent_nodes(), and use it as the argument for the public backref walking functions iterate_extent_inodes(), btrfs_find_all_leafs() and btrfs_find_all_roots().
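To show the shape of the new interface, here is a minimal sketch of a caller under the new convention (distilled from the qgroup.c hunks below; the field names are those of the new structure):

    struct btrfs_backref_walk_ctx ctx = { 0 };
    int ret;

    ctx.bytenr = bytenr;	/* extent whose backrefs we walk */
    ctx.fs_info = fs_info;	/* must always be set */
    /* Optional: ctx.trans, ctx.time_seq, ctx.extent_item_pos, ... */

    ret = btrfs_find_all_roots(&ctx, false);
    if (ret < 0)
        return ret;
    /* On success, the IDs of the found roots are in ctx.roots. */

With this layout, adding a new parameter later only requires a new field in the structure instead of touching every prototype in the call chain.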
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 343 +++++++++++++++++----------------- fs/btrfs/backref.h | 76 ++++++-- fs/btrfs/qgroup.c | 38 ++-- fs/btrfs/relocation.c | 19 +- fs/btrfs/scrub.c | 14 +- fs/btrfs/send.c | 15 +- fs/btrfs/tests/qgroup-tests.c | 50 ++++- 7 files changed, 328 insertions(+), 227 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 432064ee788e..a1a00a7bdba5 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -444,12 +444,12 @@ static int is_shared_data_backref(struct preftrees *preftrees, u64 bytenr) return 0; } -static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, +static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_root *root, struct btrfs_path *path, struct ulist *parents, struct preftrees *preftrees, struct prelim_ref *ref, - int level, u64 time_seq, u64 extent_item_pos) + int level) { - const bool ignore_offset = (extent_item_pos == BTRFS_IGNORE_EXTENT_OFFSET); int ret = 0; int slot; struct extent_buffer *eb; @@ -484,10 +484,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (path->slots[0] >= btrfs_header_nritems(eb) || is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb)) { - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else - ret = btrfs_next_old_leaf(root, path, time_seq); + ret = btrfs_next_old_leaf(root, path, ctx->time_seq); } while (!ret && count < ref->count) { @@ -508,10 +508,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (slot == 0 && (is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb))) { - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else - ret = btrfs_next_old_leaf(root, path, time_seq); + ret = btrfs_next_old_leaf(root, path, ctx->time_seq); continue; } fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); @@ -525,9 +525,9 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, count++; else goto next; - if (!ignore_offset) { + if (!ctx->ignore_extent_item_pos) { ret = check_extent_in_eb(&key, eb, fi, - extent_item_pos, &eie); + ctx->extent_item_pos, &eie); if (ret < 0) break; } @@ -537,7 +537,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie, (void **)&old, GFP_NOFS); if (ret < 0) break; - if (!ret && !ignore_offset) { + if (!ret && !ctx->ignore_extent_item_pos) { while (old->next) old = old->next; old->next = eie; @@ -545,10 +545,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_item(root, path); else - ret = btrfs_next_old_item(root, path, time_seq); + ret = btrfs_next_old_item(root, path, ctx->time_seq); } if (ret > 0) @@ -562,11 +562,10 @@ next: * resolve an indirect backref in the form (root_id, key, level) * to a logical address */ -static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 time_seq, +static int resolve_indirect_ref(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, struct preftrees *preftrees, - struct prelim_ref *ref, struct ulist *parents, - u64 extent_item_pos) + struct prelim_ref *ref, struct ulist *parents) { struct btrfs_root *root; struct extent_buffer *eb; @@ -584,9 +583,9 @@ static int 
resolve_indirect_ref(struct btrfs_fs_info *fs_info, * here. */ if (path->search_commit_root) - root = btrfs_get_fs_root_commit_root(fs_info, path, ref->root_id); + root = btrfs_get_fs_root_commit_root(ctx->fs_info, path, ref->root_id); else - root = btrfs_get_fs_root(fs_info, ref->root_id, false); + root = btrfs_get_fs_root(ctx->fs_info, ref->root_id, false); if (IS_ERR(root)) { ret = PTR_ERR(root); goto out_free; @@ -598,17 +597,17 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, goto out; } - if (btrfs_is_testing(fs_info)) { + if (btrfs_is_testing(ctx->fs_info)) { ret = -ENOENT; goto out; } if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); - else if (time_seq == BTRFS_SEQ_LAST) + else if (ctx->time_seq == BTRFS_SEQ_LAST) root_level = btrfs_header_level(root->node); else - root_level = btrfs_old_root_level(root, time_seq); + root_level = btrfs_old_root_level(root, ctx->time_seq); if (root_level + 1 == level) goto out; @@ -636,12 +635,12 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, search_key.offset >= LLONG_MAX) search_key.offset = 0; path->lowest_level = level; - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); else - ret = btrfs_search_old_slot(root, &search_key, path, time_seq); + ret = btrfs_search_old_slot(root, &search_key, path, ctx->time_seq); - btrfs_debug(fs_info, + btrfs_debug(ctx->fs_info, "search slot in root %llu (level %d, ref count %d) returned %d for key (%llu %u %llu)", ref->root_id, level, ref->count, ret, ref->key_for_search.objectid, ref->key_for_search.type, @@ -659,8 +658,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, eb = path->nodes[level]; } - ret = add_all_parents(root, path, parents, preftrees, ref, level, - time_seq, extent_item_pos); + ret = add_all_parents(ctx, root, path, parents, preftrees, ref, level); out: btrfs_put_root(root); out_free: @@ -705,10 +703,9 @@ static void free_leaf_list(struct ulist *ulist) * rbtree as they are encountered. The new backrefs are subsequently * resolved as above. */ -static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 time_seq, +static int resolve_indirect_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, struct preftrees *preftrees, - u64 extent_item_pos, struct share_check *sc) { int err; @@ -751,14 +748,13 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, ret = BACKREF_FOUND_SHARED; goto out; } - err = resolve_indirect_ref(fs_info, path, time_seq, preftrees, - ref, parents, extent_item_pos); + err = resolve_indirect_ref(ctx, path, preftrees, ref, parents); /* * we can only tolerate ENOENT,otherwise,we should catch error * and return directly. */ if (err == -ENOENT) { - prelim_ref_insert(fs_info, &preftrees->direct, ref, + prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL); continue; } else if (err) { @@ -787,7 +783,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, memcpy(new_ref, ref, sizeof(*ref)); new_ref->parent = node->val; new_ref->inode_list = unode_aux_to_inode_list(node); - prelim_ref_insert(fs_info, &preftrees->direct, + prelim_ref_insert(ctx->fs_info, &preftrees->direct, new_ref, NULL); } @@ -795,7 +791,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info, * Now it's a direct ref, put it in the direct tree. We must * do this last because the ref could be merged/freed here. 
*/ - prelim_ref_insert(fs_info, &preftrees->direct, ref, NULL); + prelim_ref_insert(ctx->fs_info, &preftrees->direct, ref, NULL); ulist_reinit(parents); cond_resched(); @@ -1325,32 +1321,18 @@ static void store_backref_shared_cache(struct btrfs_backref_share_check_ctx *ctx * indirect refs to their parent bytenr. * When roots are found, they're added to the roots list * - * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and - * behave much like trans == NULL case, the difference only lies in it will not - * commit root. - * The special case is for qgroup to search roots in commit_transaction(). - * - * @sc - if !NULL, then immediately return BACKREF_FOUND_SHARED when a - * shared extent is detected. + * @ctx: Backref walking context object, must not be NULL. + * @sc: If !NULL, then immediately return BACKREF_FOUND_SHARED when a + * shared extent is detected. * * Otherwise this returns 0 for success and <0 for an error. * - * @extent_item_pos is meaningful only if we are dealing with a data extent. - * If its value is not BTRFS_IGNORE_EXTENT_OFFSET, then only collect references - * from file extent items that refer to a section of the data extent that - * contains @extent_item_pos. If its value is BTRFS_IGNORE_EXTENT_OFFSET then - * collect references for every file extent item that points to the data extent. - * * FIXME some caching might speed things up */ -static int find_parent_nodes(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist *refs, - struct ulist *roots, u64 extent_item_pos, +static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, struct share_check *sc) { - const bool ignore_offset = (extent_item_pos == BTRFS_IGNORE_EXTENT_OFFSET); - struct btrfs_root *root = btrfs_extent_root(fs_info, bytenr); + struct btrfs_root *root = btrfs_extent_root(ctx->fs_info, ctx->bytenr); struct btrfs_key key; struct btrfs_path *path; struct btrfs_delayed_ref_root *delayed_refs = NULL; @@ -1368,11 +1350,11 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, /* Roots ulist is not needed when using a sharedness check context. */ if (sc) - ASSERT(roots == NULL); + ASSERT(ctx->roots == NULL); - key.objectid = bytenr; + key.objectid = ctx->bytenr; key.offset = (u64)-1; - if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; @@ -1380,12 +1362,12 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - if (!trans) { + if (!ctx->trans) { path->search_commit_root = 1; path->skip_locking = 1; } - if (time_seq == BTRFS_SEQ_LAST) + if (ctx->time_seq == BTRFS_SEQ_LAST) path->skip_locking = 1; again: @@ -1401,17 +1383,17 @@ again: goto out; } - if (trans && likely(trans->type != __TRANS_DUMMY) && - time_seq != BTRFS_SEQ_LAST) { + if (ctx->trans && likely(ctx->trans->type != __TRANS_DUMMY) && + ctx->time_seq != BTRFS_SEQ_LAST) { /* * We have a specific time_seq we care about and trans which * means we have the path lock, we need to grab the ref head and * lock it so we have a consistent view of the refs at the given * time.
*/ - delayed_refs = &trans->transaction->delayed_refs; + delayed_refs = &ctx->trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); + head = btrfs_find_delayed_ref_head(delayed_refs, ctx->bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { refcount_inc(&head->refs); @@ -1429,7 +1411,7 @@ again: goto again; } spin_unlock(&delayed_refs->lock); - ret = add_delayed_refs(fs_info, head, time_seq, + ret = add_delayed_refs(ctx->fs_info, head, ctx->time_seq, &preftrees, sc); mutex_unlock(&head->mutex); if (ret) @@ -1447,14 +1429,14 @@ again: leaf = path->nodes[0]; slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid == bytenr && + if (key.objectid == ctx->bytenr && (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { - ret = add_inline_refs(fs_info, path, bytenr, + ret = add_inline_refs(ctx->fs_info, path, ctx->bytenr, &info_level, &preftrees, sc); if (ret) goto out; - ret = add_keyed_refs(root, path, bytenr, info_level, + ret = add_keyed_refs(root, path, ctx->bytenr, info_level, &preftrees, sc); if (ret) goto out; @@ -1483,7 +1465,7 @@ again: * extent item pointing to the data extent) is shared, that is, if any * of the extent buffers in the path is referenced by other trees. */ - if (sc && bytenr == sc->data_bytenr) { + if (sc && ctx->bytenr == sc->data_bytenr) { /* * If our data extent is from a generation more recent than the * last generation used to snapshot the root, then we know that @@ -1530,14 +1512,13 @@ again: btrfs_release_path(path); - ret = add_missing_keys(fs_info, &preftrees, path->skip_locking == 0); + ret = add_missing_keys(ctx->fs_info, &preftrees, path->skip_locking == 0); if (ret) goto out; WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root)); - ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees, - extent_item_pos, sc); + ret = resolve_indirect_refs(ctx, path, &preftrees, sc); if (ret) goto out; @@ -1564,17 +1545,18 @@ again: * e.g. different offsets would not be merged, * and would retain their original ref->count < 0. */ - if (roots && ref->count && ref->root_id && ref->parent == 0) { + if (ctx->roots && ref->count && ref->root_id && ref->parent == 0) { /* no parent == root of tree */ - ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); + ret = ulist_add(ctx->roots, ref->root_id, 0, GFP_NOFS); if (ret < 0) goto out; } if (ref->count && ref->parent) { - if (!ignore_offset && !ref->inode_list && ref->level == 0) { + if (!ctx->ignore_extent_item_pos && !ref->inode_list && + ref->level == 0) { struct extent_buffer *eb; - eb = read_tree_block(fs_info, ref->parent, 0, + eb = read_tree_block(ctx->fs_info, ref->parent, 0, 0, ref->level, NULL); if (IS_ERR(eb)) { ret = PTR_ERR(eb); @@ -1588,8 +1570,8 @@ again: if (!path->skip_locking) btrfs_tree_read_lock(eb); - ret = find_extent_in_eb(eb, bytenr, - extent_item_pos, &eie); + ret = find_extent_in_eb(eb, ctx->bytenr, + ctx->extent_item_pos, &eie); if (!path->skip_locking) btrfs_tree_read_unlock(eb); free_extent_buffer(eb); @@ -1603,12 +1585,12 @@ again: */ eie = NULL; } - ret = ulist_add_merge_ptr(refs, ref->parent, + ret = ulist_add_merge_ptr(ctx->refs, ref->parent, ref->inode_list, (void **)&eie, GFP_NOFS); if (ret < 0) goto out; - if (!ret && !ignore_offset) { + if (!ret && !ctx->ignore_extent_item_pos) { /* * We've recorded that parent, so we must extend * its inode list here. 
@@ -1652,28 +1634,29 @@ out: } /* - * Finds all leafs with a reference to the specified combination of bytenr and - * offset. key_list_head will point to a list of corresponding keys (caller must - * free each list element). The leafs will be stored in the leafs ulist, which - * must be freed with ulist_free. + * Finds all leaves with a reference to the specified combination of + * @ctx->bytenr and @ctx->extent_item_pos. The bytenrs of the found leaves are + * added to the ulist at @ctx->refs, and that ulist is allocated by this + * function. The caller should free the ulist with free_leaf_list() if + * @ctx->ignore_extent_item_pos is false, otherwise a simple ulist_free() is + * enough. * - * returns 0 on success, <0 on error + * Returns 0 on success and < 0 on error. On error @ctx->refs is not allocated. */ -int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **leafs, - u64 extent_item_pos) +int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx) { int ret; - *leafs = ulist_alloc(GFP_NOFS); - if (!*leafs) + ASSERT(ctx->refs == NULL); + + ctx->refs = ulist_alloc(GFP_NOFS); + if (!ctx->refs) return -ENOMEM; - ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - *leafs, NULL, extent_item_pos, NULL); + ret = find_parent_nodes(ctx, NULL); if (ret < 0 && ret != -ENOENT) { - free_leaf_list(*leafs); + free_leaf_list(ctx->refs); + ctx->refs = NULL; return ret; } @@ -1681,7 +1664,7 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, } /* - * walk all backrefs for a given extent to find all roots that reference this + * Walk all backrefs for a given extent to find all roots that reference this * extent. Walking a backref means finding all extents that reference this * extent and in turn walk the backrefs of those, too. Naturally this is a * recursive process, but here it is implemented in an iterative fashion: We @@ -1689,62 +1672,74 @@ int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, * list. In turn, we find all referencing extents for those, further appending * to the list. The way we iterate the list allows adding more elements after * the current while iterating. The process stops when we reach the end of the - * list. Found roots are added to the roots list. + * list. * - * returns 0 on success, < 0 on error. + * Found roots are added to @ctx->roots, which is allocated by this function and + * @ctx->roots should be NULL when calling this function. This function also + * requires @ctx->refs to be NULL, as it uses it for allocating a ulist to do + * temporary work, and frees it before returning. + * + * Returns 0 on success, < 0 on error. On error @ctx->roots is always NULL.
*/ -static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots) +static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) { - struct ulist *tmp; - struct ulist_node *node = NULL; + const u64 orig_bytenr = ctx->bytenr; + const bool orig_ignore_extent_item_pos = ctx->ignore_extent_item_pos; struct ulist_iterator uiter; - int ret; + int ret = 0; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) + ASSERT(ctx->refs == NULL); + ASSERT(ctx->roots == NULL); + + ctx->refs = ulist_alloc(GFP_NOFS); + if (!ctx->refs) return -ENOMEM; - *roots = ulist_alloc(GFP_NOFS); - if (!*roots) { - ulist_free(tmp); + + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) { + ulist_free(ctx->refs); + ctx->refs = NULL; return -ENOMEM; } + ctx->ignore_extent_item_pos = true; + ULIST_ITER_INIT(&uiter); while (1) { - ret = find_parent_nodes(trans, fs_info, bytenr, time_seq, - tmp, *roots, BTRFS_IGNORE_EXTENT_OFFSET, - NULL); + struct ulist_node *node; + + ret = find_parent_nodes(ctx, NULL); if (ret < 0 && ret != -ENOENT) { - ulist_free(tmp); - ulist_free(*roots); - *roots = NULL; - return ret; + ulist_free(ctx->roots); + ctx->roots = NULL; + break; } - node = ulist_next(tmp, &uiter); + ret = 0; + node = ulist_next(ctx->refs, &uiter); if (!node) break; - bytenr = node->val; + ctx->bytenr = node->val; cond_resched(); } - ulist_free(tmp); - return 0; + ulist_free(ctx->refs); + ctx->refs = NULL; + ctx->bytenr = orig_bytenr; + ctx->ignore_extent_item_pos = orig_ignore_extent_item_pos; + + return ret; } -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, +int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx, bool skip_commit_root_sem) { int ret; - if (!trans && !skip_commit_root_sem) - down_read(&fs_info->commit_root_sem); - ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, time_seq, roots); - if (!trans && !skip_commit_root_sem) - up_read(&fs_info->commit_root_sem); + if (!ctx->trans && !skip_commit_root_sem) + down_read(&ctx->fs_info->commit_root_sem); + ret = btrfs_find_all_roots_safe(ctx); + if (!ctx->trans && !skip_commit_root_sem) + up_read(&ctx->fs_info->commit_root_sem); return ret; } @@ -1794,6 +1789,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, u64 extent_gen, struct btrfs_backref_share_check_ctx *ctx) { + struct btrfs_backref_walk_ctx walk_ctx = { 0 }; struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; @@ -1830,8 +1826,14 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, down_read(&fs_info->commit_root_sem); } else { btrfs_get_tree_mod_seq(fs_info, &elem); + walk_ctx.time_seq = elem.seq; } + walk_ctx.ignore_extent_item_pos = true; + walk_ctx.trans = trans; + walk_ctx.fs_info = fs_info; + walk_ctx.refs = &ctx->refs; + /* -1 means we are in the bytenr of the data extent. */ level = -1; ULIST_ITER_INIT(&uiter); @@ -1840,8 +1842,8 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr, bool is_shared; bool cached; - ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, &ctx->refs, - NULL, BTRFS_IGNORE_EXTENT_OFFSET, &shared); + walk_ctx.bytenr = bytenr; + ret = find_parent_nodes(&walk_ctx, &shared); if (ret == BACKREF_FOUND_SHARED || ret == BACKREF_FOUND_NOT_SHARED) { /* If shared must return 1, otherwise return 0. 
*/ @@ -2279,73 +2281,81 @@ static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, * the given parameters. * when the iterator function returns a non-zero value, iteration stops. */ -int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - u64 extent_item_objectid, u64 extent_item_pos, - int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx) +int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, + bool search_commit_root, + iterate_extent_inodes_t *iterate, void *user_ctx) { int ret; - struct btrfs_trans_handle *trans = NULL; - struct ulist *refs = NULL; - struct ulist *roots = NULL; - struct ulist_node *ref_node = NULL; - struct ulist_node *root_node = NULL; + struct ulist *refs; + struct ulist_node *ref_node; struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem); struct ulist_iterator ref_uiter; - struct ulist_iterator root_uiter; - btrfs_debug(fs_info, "resolving all inodes for extent %llu", - extent_item_objectid); + btrfs_debug(ctx->fs_info, "resolving all inodes for extent %llu", + ctx->bytenr); + + ASSERT(ctx->trans == NULL); if (!search_commit_root) { - trans = btrfs_attach_transaction(fs_info->tree_root); + struct btrfs_trans_handle *trans; + + trans = btrfs_attach_transaction(ctx->fs_info->tree_root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && PTR_ERR(trans) != -EROFS) return PTR_ERR(trans); trans = NULL; } + ctx->trans = trans; } - if (trans) - btrfs_get_tree_mod_seq(fs_info, &seq_elem); - else - down_read(&fs_info->commit_root_sem); + if (ctx->trans) { + btrfs_get_tree_mod_seq(ctx->fs_info, &seq_elem); + ctx->time_seq = seq_elem.seq; + } else { + down_read(&ctx->fs_info->commit_root_sem); + } - ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - seq_elem.seq, &refs, extent_item_pos); + ret = btrfs_find_all_leafs(ctx); if (ret) goto out; + refs = ctx->refs; + ctx->refs = NULL; ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { - ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, - seq_elem.seq, &roots); + struct ulist_node *root_node; + struct ulist_iterator root_uiter; + + ctx->bytenr = ref_node->val; + ret = btrfs_find_all_roots_safe(ctx); if (ret) break; + ULIST_ITER_INIT(&root_uiter); - while (!ret && (root_node = ulist_next(roots, &root_uiter))) { - btrfs_debug(fs_info, + while (!ret && (root_node = ulist_next(ctx->roots, &root_uiter))) { + btrfs_debug(ctx->fs_info, "root %llu references leaf %llu, data list %#llx", root_node->val, ref_node->val, ref_node->aux); - ret = iterate_leaf_refs(fs_info, + ret = iterate_leaf_refs(ctx->fs_info, (struct extent_inode_elem *) (uintptr_t)ref_node->aux, - root_node->val, - extent_item_objectid, - iterate, ctx); + root_node->val, ctx->bytenr, + iterate, user_ctx); } - ulist_free(roots); + ulist_free(ctx->roots); + ctx->roots = NULL; } free_leaf_list(refs); out: - if (trans) { - btrfs_put_tree_mod_seq(fs_info, &seq_elem); - btrfs_end_transaction(trans); + if (ctx->trans) { + btrfs_put_tree_mod_seq(ctx->fs_info, &seq_elem); + btrfs_end_transaction(ctx->trans); + ctx->trans = NULL; } else { - up_read(&fs_info->commit_root_sem); + up_read(&ctx->fs_info->commit_root_sem); } return ret; @@ -2375,8 +2385,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, void *ctx, bool ignore_offset) { + struct btrfs_backref_walk_ctx walk_ctx = { 0 }; int ret; - u64 extent_item_pos; u64 flags = 0; struct btrfs_key found_key; int search_commit_root = path->search_commit_root; @@ -2388,16 
+2398,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) return -EINVAL; + walk_ctx.bytenr = found_key.objectid; if (ignore_offset) - extent_item_pos = BTRFS_IGNORE_EXTENT_OFFSET; + walk_ctx.ignore_extent_item_pos = true; else - extent_item_pos = logical - found_key.objectid; + walk_ctx.extent_item_pos = logical - found_key.objectid; + walk_ctx.fs_info = fs_info; - ret = iterate_extent_inodes(fs_info, found_key.objectid, - extent_item_pos, search_commit_root, - build_ino_list, ctx); - - return ret; + return iterate_extent_inodes(&walk_ctx, search_commit_root, + build_ino_list, ctx); } static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off, diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 2eb99f23cc8f..ce700dfcc016 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -13,11 +13,63 @@ #include "extent_io.h" /* - * Pass to backref walking functions to tell them to include references from - * all file extent items that point to the target data extent, regardless if - * they refer to the whole extent or just sections of it (bookend extents). + * Context and arguments for backref walking functions. Some of the fields are + * to be filled by the caller of such functions while others are filled by the + * functions themselves, as described below. */ -#define BTRFS_IGNORE_EXTENT_OFFSET ((u64)-1) +struct btrfs_backref_walk_ctx { + /* + * The address of the extent for which we are doing backref walking. + * Can be either a data extent or a metadata extent. + * + * Must always be set by the top level caller. + */ + u64 bytenr; + /* + * Offset relative to the target extent. This is only used for data + * extents, and it's meaningful because we can have file extent items + * that point only to a section of a data extent ("bookend" extents), + * and we want to filter out any that don't point to a section of the + * data extent containing the given offset. + * + * Must always be set by the top level caller. + */ + u64 extent_item_pos; + /* + * If true and bytenr corresponds to a data extent, then references from + * all file extent items that point to the data extent are considered + * and @extent_item_pos is ignored. + */ + bool ignore_extent_item_pos; + /* A valid transaction handle or NULL. */ + struct btrfs_trans_handle *trans; + /* + * The file system's info object, cannot be NULL. + * + * Must always be set by the top level caller. + */ + struct btrfs_fs_info *fs_info; + /* + * Time sequence acquired from btrfs_get_tree_mod_seq(), in case the + * caller joined the tree mod log to get a consistent view of b+trees + * while we do backref walking, or BTRFS_SEQ_LAST. + * When using BTRFS_SEQ_LAST, delayed refs are not checked and it uses + * commit roots when searching b+trees - this is a special case for + * qgroups used during a transaction commit. + */ + u64 time_seq; + /* + * Used to collect the bytenr of metadata extents that point to the + * target extent. + */ + struct ulist *refs; + /* + * List used to collect the IDs of the roots from which the target + * extent is accessible. Can be NULL in case the caller does not care + * about collecting root IDs.
+ */ + struct ulist *roots; +}; struct inode_fs_paths { struct btrfs_path *btrfs_path; @@ -96,10 +148,9 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb, struct btrfs_key *key, struct btrfs_extent_item *ei, u32 item_size, u64 *out_root, u8 *out_level); -int iterate_extent_inodes(struct btrfs_fs_info *fs_info, - u64 extent_item_objectid, - u64 extent_offset, int search_commit_root, - iterate_extent_inodes_t *iterate, void *ctx); +int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, + bool search_commit_root, + iterate_extent_inodes_t *iterate, void *user_ctx); int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, struct btrfs_path *path, void *ctx, @@ -107,13 +158,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info, int paths_from_inode(u64 inum, struct inode_fs_paths *ipath); -int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **leafs, - u64 extent_item_pos); -int btrfs_find_all_roots(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 bytenr, - u64 time_seq, struct ulist **roots, +int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx); +int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx, bool skip_commit_root_sem); char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, u32 name_len, unsigned long name_off, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 75dd964ca46c..24c013c61a94 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1794,8 +1794,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, struct btrfs_qgroup_extent_record *qrecord) { - struct ulist *old_root; - u64 bytenr = qrecord->bytenr; + struct btrfs_backref_walk_ctx ctx = { 0 }; int ret; /* @@ -1822,8 +1821,10 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING) return 0; - ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root, - true); + ctx.bytenr = qrecord->bytenr; + ctx.fs_info = trans->fs_info; + + ret = btrfs_find_all_roots(&ctx, true); if (ret < 0) { qgroup_mark_inconsistent(trans->fs_info); btrfs_warn(trans->fs_info, @@ -1839,7 +1840,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, * * So modifying qrecord->old_roots is safe here */ - qrecord->old_roots = old_root; + qrecord->old_roots = ctx.roots; return 0; } @@ -2750,17 +2751,22 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) if (!ret && !(fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { + struct btrfs_backref_walk_ctx ctx = { 0 }; + + ctx.bytenr = record->bytenr; + ctx.fs_info = fs_info; + /* * Old roots should be searched when inserting qgroup * extent record */ if (WARN_ON(!record->old_roots)) { /* Search commit root to find old_roots */ - ret = btrfs_find_all_roots(NULL, fs_info, - record->bytenr, 0, - &record->old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup; + record->old_roots = ctx.roots; + ctx.roots = NULL; } /* Free the reserved data space */ @@ -2773,10 +2779,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) * which doesn't lock tree or delayed_refs and search * current root. It's safe inside commit_transaction(). 
*/ - ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, BTRFS_SEQ_LAST, &new_roots, false); + ctx.trans = trans; + ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto cleanup; + new_roots = ctx.roots; if (qgroup_to_skip) { ulist_del(new_roots, qgroup_to_skip, 0); ulist_del(record->old_roots, qgroup_to_skip, @@ -3242,7 +3249,6 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root; struct btrfs_key found; struct extent_buffer *scratch_leaf = NULL; - struct ulist *roots = NULL; u64 num_bytes; bool done; int slot; @@ -3292,6 +3298,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, mutex_unlock(&fs_info->qgroup_rescan_lock); for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) { + struct btrfs_backref_walk_ctx ctx = { 0 }; + btrfs_item_key_to_cpu(scratch_leaf, &found, slot); if (found.type != BTRFS_EXTENT_ITEM_KEY && found.type != BTRFS_METADATA_ITEM_KEY) @@ -3301,13 +3309,15 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans, else num_bytes = found.offset; - ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, - &roots, false); + ctx.bytenr = found.objectid; + ctx.fs_info = fs_info; + + ret = btrfs_find_all_roots(&ctx, false); if (ret < 0) goto out; /* For rescan, just pass old_roots as NULL */ ret = btrfs_qgroup_account_extent(trans, found.objectid, - num_bytes, NULL, roots); + num_bytes, NULL, ctx.roots); if (ret < 0) goto out; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 45690f7b5900..2ecca24e1001 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3408,24 +3408,27 @@ int add_data_references(struct reloc_control *rc, struct btrfs_path *path, struct rb_root *blocks) { - struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; - struct ulist *leaves = NULL; + struct btrfs_backref_walk_ctx ctx = { 0 }; struct ulist_iterator leaf_uiter; struct ulist_node *ref_node = NULL; - const u32 blocksize = fs_info->nodesize; + const u32 blocksize = rc->extent_root->fs_info->nodesize; int ret = 0; btrfs_release_path(path); - ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid, - 0, &leaves, BTRFS_IGNORE_EXTENT_OFFSET); + + ctx.bytenr = extent_key->objectid; + ctx.ignore_extent_item_pos = true; + ctx.fs_info = rc->extent_root->fs_info; + + ret = btrfs_find_all_leafs(&ctx); if (ret < 0) return ret; ULIST_ITER_INIT(&leaf_uiter); - while ((ref_node = ulist_next(leaves, &leaf_uiter))) { + while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { struct extent_buffer *eb; - eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL); + eb = read_tree_block(ctx.fs_info, ref_node->val, 0, 0, 0, NULL); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; @@ -3441,7 +3444,7 @@ int add_data_references(struct reloc_control *rc, } if (ret < 0) free_block_list(blocks); - ulist_free(leaves); + ulist_free(ctx.refs); return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 6c7dc89709fc..288a97992aa4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -909,7 +909,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) struct btrfs_extent_item *ei; struct scrub_warning swarn; unsigned long ptr = 0; - u64 extent_item_pos; u64 flags = 0; u64 ref_root; u32 item_size; @@ -941,7 +940,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) if (ret < 0) goto out; - extent_item_pos = swarn.logical - found_key.objectid; swarn.extent_item_size = found_key.offset; eb = path->nodes[0]; @@ -964,12 +962,18 @@ static void 
scrub_print_warning(const char *errstr, struct scrub_block *sblock) } while (ret != 1); btrfs_release_path(path); } else { + struct btrfs_backref_walk_ctx ctx = { 0 }; + btrfs_release_path(path); + + ctx.bytenr = found_key.objectid; + ctx.extent_item_pos = swarn.logical - found_key.objectid; + ctx.fs_info = fs_info; + swarn.path = path; swarn.dev = dev; - iterate_extent_inodes(fs_info, found_key.objectid, - extent_item_pos, 1, - scrub_print_warning_inode, &swarn); + + iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn); } out: diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6bf06939f891..0c9a9933341e 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1356,12 +1356,12 @@ static int find_extent_clone(struct send_ctx *sctx, u64 logical; u64 disk_byte; u64 num_bytes; - u64 extent_item_pos; u64 extent_refs; u64 flags = 0; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; - struct backref_ctx backref_ctx = {0}; + struct backref_ctx backref_ctx = { 0 }; + struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 }; struct clone_root *cur_clone_root; struct btrfs_key found_key; struct btrfs_path *tmp_path; @@ -1461,14 +1461,13 @@ static int find_extent_clone(struct send_ctx *sctx, /* * Now collect all backrefs. */ + backref_walk_ctx.bytenr = found_key.objectid; if (compressed == BTRFS_COMPRESS_NONE) - extent_item_pos = logical - found_key.objectid; - else - extent_item_pos = 0; - ret = iterate_extent_inodes(fs_info, found_key.objectid, - extent_item_pos, 1, __iterate_backrefs, - &backref_ctx); + backref_walk_ctx.extent_item_pos = logical - found_key.objectid; + backref_walk_ctx.fs_info = fs_info; + ret = iterate_extent_inodes(&backref_walk_ctx, true, __iterate_backrefs, + &backref_ctx); if (ret < 0) goto out; diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 65b65d55d1f6..3fc8dc3fd980 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -205,6 +205,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, static int test_no_shared_qgroup(struct btrfs_root *root, u32 sectorsize, u32 nodesize) { + struct btrfs_backref_walk_ctx ctx = { 0 }; struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; struct ulist *old_roots = NULL; @@ -220,16 +221,22 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return ret; } + ctx.bytenr = nodesize; + ctx.trans = &trans; + ctx.fs_info = fs_info; + /* * Since the test trans doesn't have the complicated delayed refs, * we can only call btrfs_qgroup_account_extent() directly to test * quota. 
*/ - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, BTRFS_FS_TREE_OBJECTID); @@ -238,12 +245,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -262,11 +271,13 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = remove_extent_item(root, nodesize, nodesize); if (ret) { @@ -274,12 +285,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -304,6 +317,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root, static int test_multiple_refs(struct btrfs_root *root, u32 sectorsize, u32 nodesize) { + struct btrfs_backref_walk_ctx ctx = { 0 }; struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; struct ulist *old_roots = NULL; @@ -324,11 +338,17 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ctx.bytenr = nodesize; + ctx.trans = &trans; + ctx.fs_info = fs_info; + + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = insert_normal_tree_ref(root, nodesize, nodesize, 0, BTRFS_FS_TREE_OBJECTID); @@ -337,12 +357,14 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -357,11 +379,13 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = add_tree_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); @@ -370,12 +394,14 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); 
return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); @@ -396,11 +422,13 @@ static int test_multiple_refs(struct btrfs_root *root, return -EINVAL; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { test_err("couldn't find old roots: %d", ret); return ret; } + old_roots = ctx.roots; + ctx.roots = NULL; ret = remove_extent_ref(root, nodesize, nodesize, 0, BTRFS_FIRST_FREE_OBJECTID); @@ -409,12 +437,14 @@ static int test_multiple_refs(struct btrfs_root *root, return ret; } - ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false); + ret = btrfs_find_all_roots(&ctx, false); if (ret) { ulist_free(old_roots); test_err("couldn't find old roots: %d", ret); return ret; } + new_roots = ctx.roots; + ctx.roots = NULL; ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots, new_roots); From 1baea6f18abf34037169d3b0c585356abb395376 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:48 +0000 Subject: [PATCH 153/249] btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes() At iterate_extent_inodes() we collect a ulist of leaves for a given extent with a call to btrfs_find_all_leafs() and then we enter a loop where we iterate over all the collected leaves. Each iteration of that loop does a call to btrfs_find_all_roots_safe(), to determine all roots from which a leaf is accessible, and that results in allocating and releasing a ulist to store the root IDs. Instead of allocating and releasing the roots ulist on every iteration, allocate a ulist before entering the loop and keep using it on each iteration, reinitializing the ulist at the end of each iteration. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 46 +++++++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index a1a00a7bdba5..dc276ce3afe0 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1674,32 +1674,36 @@ int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx) * the current while iterating. The process stops when we reach the end of the * list. * - * Found roots are added to @ctx->roots, which is allocated by this function and - * @ctx->roots should be NULL when calling this function. This function also - * requires @ctx->refs to be NULL, as it uses it for allocating a ulist to do - * temporary work, and frees it before returning. + * Found roots are added to @ctx->roots, which is allocated by this function if + * it points to NULL, in which case the caller is responsible for freeing it + * after it's not needed anymore. + * This function requires @ctx->refs to be NULL, as it uses it for allocating a + * ulist to do temporary work, and frees it before returning. * - * Returns 0 on success, < 0 on error. On error @ctx->roots is always NULL. + * Returns 0 on success, < 0 on error. 
*/ static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) { const u64 orig_bytenr = ctx->bytenr; const bool orig_ignore_extent_item_pos = ctx->ignore_extent_item_pos; + bool roots_ulist_allocated = false; struct ulist_iterator uiter; int ret = 0; ASSERT(ctx->refs == NULL); - ASSERT(ctx->roots == NULL); ctx->refs = ulist_alloc(GFP_NOFS); if (!ctx->refs) return -ENOMEM; - ctx->roots = ulist_alloc(GFP_NOFS); if (!ctx->roots) { - ulist_free(ctx->refs); - ctx->refs = NULL; - return -ENOMEM; + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) { + ulist_free(ctx->refs); + ctx->refs = NULL; + return -ENOMEM; + } + roots_ulist_allocated = true; } ctx->ignore_extent_item_pos = true; @@ -1710,8 +1714,10 @@ static int btrfs_find_all_roots_safe(struct btrfs_backref_walk_ctx *ctx) ret = find_parent_nodes(ctx, NULL); if (ret < 0 && ret != -ENOENT) { - ulist_free(ctx->roots); - ctx->roots = NULL; + if (roots_ulist_allocated) { + ulist_free(ctx->roots); + ctx->roots = NULL; + } break; } ret = 0; @@ -2295,6 +2301,11 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, ctx->bytenr); ASSERT(ctx->trans == NULL); + ASSERT(ctx->roots == NULL); + + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) + return -ENOMEM; if (!search_commit_root) { struct btrfs_trans_handle *trans; @@ -2302,8 +2313,11 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, trans = btrfs_attach_transaction(ctx->fs_info->tree_root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && - PTR_ERR(trans) != -EROFS) + PTR_ERR(trans) != -EROFS) { + ulist_free(ctx->roots); + ctx->roots = NULL; return PTR_ERR(trans); + } trans = NULL; } ctx->trans = trans; @@ -2344,8 +2358,7 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, root_node->val, ctx->bytenr, iterate, user_ctx); } - ulist_free(ctx->roots); - ctx->roots = NULL; + ulist_reinit(ctx->roots); } free_leaf_list(refs); @@ -2358,6 +2371,9 @@ out: up_read(&ctx->fs_info->commit_root_sem); } + ulist_free(ctx->roots); + ctx->roots = NULL; + return ret; } From fa104a879073f3974d673ad8c609d986042cf666 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:49 +0000 Subject: [PATCH 154/249] btrfs: constify ulist parameter of ulist_next() The ulist_next() iterator function does not need to change the given ulist so make it const. This will allow the next patch in the series to pass a ulist to a function that does not need, and should not, modify the ulist. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ulist.c | 2 +- fs/btrfs/ulist.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 13af1e41f9d7..33606025513d 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -266,7 +266,7 @@ int ulist_del(struct ulist *ulist, u64 val, u64 aux) * It is allowed to call ulist_add during an enumeration. Newly added items * are guaranteed to show up in the running enumeration. 
*/ -struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) +struct ulist_node *ulist_next(const struct ulist *ulist, struct ulist_iterator *uiter) { struct ulist_node *node; diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 02fda0a2d4ce..b2cef187ea8e 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -66,7 +66,7 @@ static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux, #endif } -struct ulist_node *ulist_next(struct ulist *ulist, +struct ulist_node *ulist_next(const struct ulist *ulist, struct ulist_iterator *uiter); #define ULIST_ITER_INIT(uiter) ((uiter)->cur_list = NULL) From 66d04209e5a8d3fc47900de6bd0c319790a52b3e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:50 +0000 Subject: [PATCH 155/249] btrfs: send: cache leaf to roots mapping during backref walking During a send operation, when doing backref walking to determine which inodes/offsets/roots we can clone from, the most repetitive and expensive step is to map each leaf that has file extent items pointing to the target data extent to the IDs of the roots from which the leaves are accessible, which happens at iterate_extent_inodes(). That step requires finding every parent node of a leaf, then the parent of each parent, and so on until we reach a root node. So it's a naturally expensive operation, and repetitive because each leaf can have hundreds of file extent items (for a nodesize of 16K, that can be slightly over 200 file extent items). There's also temporal locality, as we process all file extent items from a leaf before moving on to the next leaf. This change caches the mapping of leaves to root IDs, to avoid repeating those computations over and over again. The cache is limited to a maximum of 128 entries, with each entry being a struct with a size of 128 bytes, so the maximum cache size is 16K plus any nodes internally allocated by the maple tree that is used to index pointers to those structs. The cache is invalidated whenever we detect relocation happened since we started filling the cache, because if relocation happened then extent buffers for leaves and nodes of the trees used by a send operation may have been reallocated. This cache also allows for another important optimization that is introduced in the next patch in the series.
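To make the new hooks concrete before the full implementation below, here is a minimal sketch of how a user of iterate_extent_inodes() can plug a cache into the walk context (the my_* names are hypothetical placeholders; send's real implementation, based on a maple tree and an lru list, follows in the diff):

    static bool my_cache_lookup(u64 leaf_bytenr, void *user_ctx,
                                const u64 **root_ids_ret, int *root_count_ret)
    {
            /* On a hit, point the out arguments at the cached root IDs. */
            return false;	/* Miss: fall back to a full root walk. */
    }

    static void my_cache_store(u64 leaf_bytenr, const struct ulist *root_ids,
                               void *user_ctx)
    {
            /* Remember the root IDs just resolved for this leaf. */
    }

    walk_ctx.cache_lookup = my_cache_lookup;
    walk_ctx.cache_store = my_cache_store;
    walk_ctx.user_ctx = sctx;	/* e.g. the send context */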
This change is part of a patchset comprised of the following patches: 01/17 btrfs: fix inode list leak during backref walking at resolve_indirect_refs() 02/17 btrfs: fix inode list leak during backref walking at find_parent_nodes() 03/17 btrfs: fix ulist leaks in error paths of qgroup self tests 04/17 btrfs: remove pointless and double ulist frees in error paths of qgroup tests 05/17 btrfs: send: avoid unnecessary path allocations when finding extent clone 06/17 btrfs: send: update comment at find_extent_clone() 07/17 btrfs: send: drop unnecessary backref context field initializations 08/17 btrfs: send: avoid unnecessary backref lookups when finding clone source 09/17 btrfs: send: optimize clone detection to increase extent sharing 10/17 btrfs: use a single argument for extent offset in backref walking functions 11/17 btrfs: use a structure to pass arguments to backref walking functions 12/17 btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes() 13/17 btrfs: constify ulist parameter of ulist_next() 14/17 btrfs: send: cache leaf to roots mapping during backref walking 15/17 btrfs: send: skip unnecessary backref iterations 16/17 btrfs: send: avoid double extent tree search when finding clone source 17/17 btrfs: send: skip resolution of our own backref when finding clone source Performance test results are in the changelog of patch 17/17. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 52 ++++++++++--- fs/btrfs/backref.h | 11 +++ fs/btrfs/send.c | 185 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 236 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index dc276ce3afe0..9dacf487017d 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2303,21 +2303,14 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, ASSERT(ctx->trans == NULL); ASSERT(ctx->roots == NULL); - ctx->roots = ulist_alloc(GFP_NOFS); - if (!ctx->roots) - return -ENOMEM; - if (!search_commit_root) { struct btrfs_trans_handle *trans; trans = btrfs_attach_transaction(ctx->fs_info->tree_root); if (IS_ERR(trans)) { if (PTR_ERR(trans) != -ENOENT && - PTR_ERR(trans) != -EROFS) { - ulist_free(ctx->roots); - ctx->roots = NULL; + PTR_ERR(trans) != -EROFS) return PTR_ERR(trans); - } trans = NULL; } ctx->trans = trans; @@ -2338,23 +2331,58 @@ int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx, ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { + const u64 leaf_bytenr = ref_node->val; struct ulist_node *root_node; struct ulist_iterator root_uiter; + struct extent_inode_elem *inode_list; - ctx->bytenr = ref_node->val; + inode_list = (struct extent_inode_elem *)(uintptr_t)ref_node->aux; + + if (ctx->cache_lookup) { + const u64 *root_ids; + int root_count; + bool cached; + + cached = ctx->cache_lookup(leaf_bytenr, ctx->user_ctx, + &root_ids, &root_count); + if (cached) { + for (int i = 0; i < root_count; i++) { + ret = iterate_leaf_refs(ctx->fs_info, + inode_list, + root_ids[i], + leaf_bytenr, + iterate, + user_ctx); + if (ret) + break; + } + continue; + } + } + + if (!ctx->roots) { + ctx->roots = ulist_alloc(GFP_NOFS); + if (!ctx->roots) { + ret = -ENOMEM; + break; + } + } + + ctx->bytenr = leaf_bytenr; ret = btrfs_find_all_roots_safe(ctx); if (ret) break; + if (ctx->cache_store) + ctx->cache_store(leaf_bytenr, ctx->roots, ctx->user_ctx); + ULIST_ITER_INIT(&root_uiter); while (!ret && (root_node = ulist_next(ctx->roots, &root_uiter))) { btrfs_debug(ctx->fs_info, "root %llu 
references leaf %llu, data list %#llx", root_node->val, ref_node->val, ref_node->aux); - ret = iterate_leaf_refs(ctx->fs_info, - (struct extent_inode_elem *) - (uintptr_t)ref_node->aux, + ret = iterate_leaf_refs(ctx->fs_info, inode_list, root_node->val, ctx->bytenr, iterate, user_ctx); } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index ce700dfcc016..5005f579f235 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -69,6 +69,17 @@ struct btrfs_backref_walk_ctx { * about collecting root IDs. */ struct ulist *roots; + /* + * Used by iterate_extent_inodes(). Lookup and store functions for an + * optional cache which maps the logical address (bytenr) of leaves + * to an array of root IDs. + */ + bool (*cache_lookup)(u64 leaf_bytenr, void *user_ctx, + const u64 **root_ids_ret, int *root_count_ret); + void (*cache_store)(u64 leaf_bytenr, const struct ulist *root_ids, + void *user_ctx); + /* Context object to pass to @cache_lookup and @cache_store. */ + void *user_ctx; }; struct inode_fs_paths { diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 0c9a9933341e..fa94bd550564 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -83,6 +83,39 @@ struct clone_root { #define SEND_CTX_MAX_NAME_CACHE_SIZE 128 #define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2) +/* + * Limit the root_ids array of struct backref_cache_entry to 12 elements. + * This makes the size of a cache entry to be exactly 128 bytes on x86_64. + * The most common case is to have a single root for cloning, which corresponds + * to the send root. Having the user specify more than 11 clone roots is not + * common, and in such rare cases we simply don't use caching if the number of + * cloning roots that lead down to a leaf is more than 12. + */ +#define SEND_MAX_BACKREF_CACHE_ROOTS 12 + +/* + * Max number of entries in the cache. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding + * maple tree's internal nodes, is 16K. + */ +#define SEND_MAX_BACKREF_CACHE_SIZE 128 + +/* + * A backref cache entry maps a leaf to a list of IDs of roots from which the + * leaf is accessible and we can use for clone operations. + * With SEND_MAX_BACKREF_CACHE_ROOTS as 12, each cache entry is 128 bytes (on + * x86_64). + */ +struct backref_cache_entry { + /* List to link to the cache's lru list. */ + struct list_head list; + /* The key for this entry in the cache. */ + u64 key; + u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS]; + /* Number of valid elements in the root_ids array. */ + int num_roots; +}; + struct send_ctx { struct file *send_filp; loff_t send_off; @@ -251,6 +284,14 @@ struct send_ctx { struct rb_root rbtree_new_refs; struct rb_root rbtree_deleted_refs; + + struct { + u64 last_reloc_trans; + struct list_head lru_list; + struct maple_tree entries; + /* Number of entries stored in the cache. 
*/ + int size; + } backref_cache; }; struct pending_dir_move { @@ -1335,6 +1376,142 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root, return 0; } +static void empty_backref_cache(struct send_ctx *sctx) +{ + struct backref_cache_entry *entry; + struct backref_cache_entry *tmp; + + list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list) + kfree(entry); + + INIT_LIST_HEAD(&sctx->backref_cache.lru_list); + mtree_destroy(&sctx->backref_cache.entries); + sctx->backref_cache.size = 0; +} + +static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, + const u64 **root_ids_ret, int *root_count_ret) +{ + struct send_ctx *sctx = ctx; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; + struct backref_cache_entry *entry; + + if (sctx->backref_cache.size == 0) + return false; + + /* + * If relocation happened since we first filled the cache, then we must + * empty the cache and can not use it, because even though we operate on + * read-only roots, their leaves and nodes may have been reallocated and + * now be used for different nodes/leaves of the same tree or some other + * tree. + * + * We are called from iterate_extent_inodes() while either holding a + * transaction handle or holding fs_info->commit_root_sem, so no need + * to take any lock here. + */ + if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) { + empty_backref_cache(sctx); + return false; + } + + entry = mtree_load(&sctx->backref_cache.entries, key); + if (!entry) + return false; + + *root_ids_ret = entry->root_ids; + *root_count_ret = entry->num_roots; + list_move_tail(&entry->list, &sctx->backref_cache.lru_list); + + return true; +} + +static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, + void *ctx) +{ + struct send_ctx *sctx = ctx; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + struct backref_cache_entry *new_entry; + struct ulist_iterator uiter; + struct ulist_node *node; + int ret; + + /* + * We're called while holding a transaction handle or while holding + * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a + * NOFS allocation. + */ + new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS); + /* No worries, cache is optional. */ + if (!new_entry) + return; + + new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits; + new_entry->num_roots = 0; + ULIST_ITER_INIT(&uiter); + while ((node = ulist_next(root_ids, &uiter)) != NULL) { + const u64 root_id = node->val; + struct clone_root *root; + + root = bsearch((void *)(uintptr_t)root_id, sctx->clone_roots, + sctx->clone_roots_cnt, sizeof(struct clone_root), + __clone_root_cmp_bsearch); + if (!root) + continue; + + /* Too many roots, just exit, no worries as caching is optional. */ + if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) { + kfree(new_entry); + return; + } + + new_entry->root_ids[new_entry->num_roots] = root_id; + new_entry->num_roots++; + } + + /* + * We may have not added any roots to the new cache entry, which means + * none of the roots is part of the list of roots from which we are + * allowed to clone. Cache the new entry as it's still useful to avoid + * backref walking to determine which roots have a path to the leaf. 
+ */ + + if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) { + struct backref_cache_entry *lru_entry; + struct backref_cache_entry *mt_entry; + + lru_entry = list_first_entry(&sctx->backref_cache.lru_list, + struct backref_cache_entry, list); + mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key); + ASSERT(mt_entry == lru_entry); + list_del(&mt_entry->list); + kfree(mt_entry); + sctx->backref_cache.size--; + } + + ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key, + new_entry, GFP_NOFS); + ASSERT(ret == 0 || ret == -ENOMEM); + if (ret) { + /* Caching is optional, no worries. */ + kfree(new_entry); + return; + } + + list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list); + + /* + * We are called from iterate_extent_inodes() while either holding a + * transaction handle or holding fs_info->commit_root_sem, so no need + * to take any lock here. + */ + if (sctx->backref_cache.size == 0) + sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans; + + sctx->backref_cache.size++; +} + /* * Given an inode, offset and extent item, it finds a good clone for a clone * instruction. Returns -ENOENT when none could be found. The function makes @@ -1465,6 +1642,9 @@ static int find_extent_clone(struct send_ctx *sctx, if (compressed == BTRFS_COMPRESS_NONE) backref_walk_ctx.extent_item_pos = logical - found_key.objectid; backref_walk_ctx.fs_info = fs_info; + backref_walk_ctx.cache_lookup = lookup_backref_cache; + backref_walk_ctx.cache_store = store_backref_cache; + backref_walk_ctx.user_ctx = sctx; ret = iterate_extent_inodes(&backref_walk_ctx, true, __iterate_backrefs, &backref_ctx); @@ -7891,6 +8071,9 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL); INIT_LIST_HEAD(&sctx->name_cache_list); + INIT_LIST_HEAD(&sctx->backref_cache.lru_list); + mt_init(&sctx->backref_cache.entries); + sctx->flags = arg->flags; if (arg->flags & BTRFS_SEND_FLAG_VERSION) { @@ -8153,6 +8336,8 @@ out: close_current_inode(sctx); + empty_backref_cache(sctx); + kfree(sctx); } From 88ffb665c894b1929b30b09e05506ff359d9fb89 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 1 Nov 2022 16:15:51 +0000 Subject: [PATCH 156/249] btrfs: send: skip unnecessary backref iterations When looking for a clone source for an extent, we are iterating over all the backreferences for an extent. This is often a waste of time, because once we find a good clone source we could stop immediately instead of continuing backref walking, which is expensive. Basically what happens currently is this: 1) Call iterate_extent_inodes() to iterate over all the backreferences; 2) It calls btrfs_find_all_leafs() which in turn calls the main function to walk over backrefs and collect them - find_parent_nodes(); 3) Then we collect all the references for our target data extent from the extent tree (and delayed refs if any), add them to the rb trees, resolve all the indirect backreferences and search for all the file extent items in fs trees, building a list of inodes for each one of them (struct extent_inode_elem); 4) Then back at iterate_extent_inodes() we find all the roots associated to each found leaf, and call the callback __iterate_backrefs defined at send.c for each inode in the inode list associated to each leaf. 
Sometimes one of the first backreferences we find in a fs tree is optimal
for satisfying the clone operation that send wants to perform, and in that
case we could stop immediately and avoid resolving all the remaining
indirect backreferences (searching fs trees for the respective file extent
items, etc). This is possible when, upon finding a fs tree leaf with a file
extent item, we are able to know all the roots that can lead to the leaf -
something we can do now, after the previous patch in the series added a
cache that maps leaves to a list of roots. So we can now short-circuit
backref walking during send, by having the callback we pass to
iterate_extent_inodes() be called when we find a file extent item for an
indirect backreference, and have it return a special value once it has found
a suitable backreference and no longer needs to look for more. This change
does that.

This change is part of a patchset comprised of the following patches:

01/17 btrfs: fix inode list leak during backref walking at resolve_indirect_refs()
02/17 btrfs: fix inode list leak during backref walking at find_parent_nodes()
03/17 btrfs: fix ulist leaks in error paths of qgroup self tests
04/17 btrfs: remove pointless and double ulist frees in error paths of qgroup tests
05/17 btrfs: send: avoid unnecessary path allocations when finding extent clone
06/17 btrfs: send: update comment at find_extent_clone()
07/17 btrfs: send: drop unnecessary backref context field initializations
08/17 btrfs: send: avoid unnecessary backref lookups when finding clone source
09/17 btrfs: send: optimize clone detection to increase extent sharing
10/17 btrfs: use a single argument for extent offset in backref walking functions
11/17 btrfs: use a structure to pass arguments to backref walking functions
12/17 btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes()
13/17 btrfs: constify ulist parameter of ulist_next()
14/17 btrfs: send: cache leaf to roots mapping during backref walking
15/17 btrfs: send: skip unnecessary backref iterations
16/17 btrfs: send: avoid double extent tree search when finding clone source
17/17 btrfs: send: skip resolution of our own backref when finding clone source

Performance test results are in the changelog of patch 17/17.
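To make the mechanism concrete, here is a minimal sketch (not part of the
patch) of an iterate_extent_inodes_t callback that ends the iteration early;
the is_good_clone_source() predicate is hypothetical, while the signature
and the special return value are the ones added by this series:

  static int example_backref_iterator(u64 inum, u64 offset, u64 num_bytes,
                                      u64 root, void *ctx)
  {
          /* Hypothetical predicate: is this backref good enough? */
          if (is_good_clone_source(inum, offset, num_bytes, root, ctx))
                  /* Non-error value that stops the backref walk. */
                  return BTRFS_ITERATE_EXTENT_INODES_STOP;

          /* Keep iterating over the remaining backrefs. */
          return 0;
  }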
Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 73 ++++++++++++++++++++++++++++------------- fs/btrfs/backref.h | 44 +++++++++++++++++++++---- fs/btrfs/send.c | 81 +++++++++++++++++++++++----------------------- 3 files changed, 128 insertions(+), 70 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 9dacf487017d..bb59ba68d7ee 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -31,15 +31,18 @@ struct extent_inode_elem { struct extent_inode_elem *next; }; -static int check_extent_in_eb(const struct btrfs_key *key, +static int check_extent_in_eb(struct btrfs_backref_walk_ctx *ctx, + const struct btrfs_key *key, const struct extent_buffer *eb, const struct btrfs_file_extent_item *fi, - u64 extent_item_pos, struct extent_inode_elem **eie) { const u64 data_len = btrfs_file_extent_num_bytes(eb, fi); - u64 offset = 0; + u64 offset = key->offset; struct extent_inode_elem *e; + const u64 *root_ids; + int root_count; + bool cached; if (!btrfs_file_extent_compression(eb, fi) && !btrfs_file_extent_encryption(eb, fi) && @@ -48,19 +51,38 @@ static int check_extent_in_eb(const struct btrfs_key *key, data_offset = btrfs_file_extent_offset(eb, fi); - if (extent_item_pos < data_offset || - extent_item_pos >= data_offset + data_len) + if (ctx->extent_item_pos < data_offset || + ctx->extent_item_pos >= data_offset + data_len) return 1; - offset = extent_item_pos - data_offset; + offset += ctx->extent_item_pos - data_offset; } + if (!ctx->indirect_ref_iterator || !ctx->cache_lookup) + goto add_inode_elem; + + cached = ctx->cache_lookup(eb->start, ctx->user_ctx, &root_ids, + &root_count); + if (!cached) + goto add_inode_elem; + + for (int i = 0; i < root_count; i++) { + int ret; + + ret = ctx->indirect_ref_iterator(key->objectid, offset, + data_len, root_ids[i], + ctx->user_ctx); + if (ret) + return ret; + } + +add_inode_elem: e = kmalloc(sizeof(*e), GFP_NOFS); if (!e) return -ENOMEM; e->next = *eie; e->inum = key->objectid; - e->offset = key->offset + offset; + e->offset = offset; e->num_bytes = data_len; *eie = e; @@ -77,8 +99,8 @@ static void free_inode_elem_list(struct extent_inode_elem *eie) } } -static int find_extent_in_eb(const struct extent_buffer *eb, - u64 wanted_disk_byte, u64 extent_item_pos, +static int find_extent_in_eb(struct btrfs_backref_walk_ctx *ctx, + const struct extent_buffer *eb, struct extent_inode_elem **eie) { u64 disk_byte; @@ -105,11 +127,11 @@ static int find_extent_in_eb(const struct extent_buffer *eb, continue; /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */ disk_byte = btrfs_file_extent_disk_bytenr(eb, fi); - if (disk_byte != wanted_disk_byte) + if (disk_byte != ctx->bytenr) continue; - ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie); - if (ret < 0) + ret = check_extent_in_eb(ctx, &key, eb, fi, eie); + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) return ret; } @@ -526,9 +548,9 @@ static int add_all_parents(struct btrfs_backref_walk_ctx *ctx, else goto next; if (!ctx->ignore_extent_item_pos) { - ret = check_extent_in_eb(&key, eb, fi, - ctx->extent_item_pos, &eie); - if (ret < 0) + ret = check_extent_in_eb(ctx, &key, eb, fi, &eie); + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + ret < 0) break; } if (ret > 0) @@ -551,10 +573,11 @@ next: ret = btrfs_next_old_item(root, path, ctx->time_seq); } - if (ret > 0) - ret = 0; - else if (ret < 0) + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) free_inode_elem_list(eie); + else if (ret > 0) + ret = 0; + return ret; } @@ 
-1570,12 +1593,12 @@ again: if (!path->skip_locking) btrfs_tree_read_lock(eb); - ret = find_extent_in_eb(eb, ctx->bytenr, - ctx->extent_item_pos, &eie); + ret = find_extent_in_eb(ctx, eb, &eie); if (!path->skip_locking) btrfs_tree_read_unlock(eb); free_extent_buffer(eb); - if (ret < 0) + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + ret < 0) goto out; ref->inode_list = eie; /* @@ -1628,7 +1651,7 @@ out: prelim_release(&preftrees.indirect); prelim_release(&preftrees.indirect_missing_keys); - if (ret < 0) + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || ret < 0) free_inode_elem_list(eie); return ret; } @@ -1654,7 +1677,8 @@ int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx) return -ENOMEM; ret = find_parent_nodes(ctx, NULL); - if (ret < 0 && ret != -ENOENT) { + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP || + (ret < 0 && ret != -ENOENT)) { free_leaf_list(ctx->refs); ctx->refs = NULL; return ret; @@ -2402,6 +2426,9 @@ out: ulist_free(ctx->roots); ctx->roots = NULL; + if (ret == BTRFS_ITERATE_EXTENT_INODES_STOP) + ret = 0; + return ret; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 5005f579f235..a80d1e8be718 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -12,6 +12,25 @@ #include "disk-io.h" #include "extent_io.h" +/* + * Used by implementations of iterate_extent_inodes_t (see definition below) to + * signal that backref iteration can stop immediately and no error happened. + * The value must be non-negative and must not be 0, 1 (which is a common return + * value from things like btrfs_search_slot() and used internally in the backref + * walking code) and different from BACKREF_FOUND_SHARED and + * BACKREF_FOUND_NOT_SHARED + */ +#define BTRFS_ITERATE_EXTENT_INODES_STOP 5 + +/* + * Should return 0 if no errors happened and iteration of backrefs should + * continue. Can return BTRFS_ITERATE_EXTENT_INODES_STOP or any other non-zero + * value to immediately stop iteration and possibly signal an error back to + * the caller. + */ +typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes, + u64 root, void *ctx); + /* * Context and arguments for backref walking functions. Some of the fields are * to be filled by the caller of such functions while other are filled by the @@ -70,15 +89,29 @@ struct btrfs_backref_walk_ctx { */ struct ulist *roots; /* - * Used by iterate_extent_inodes(). Lookup and store functions for an - * optional cache which maps the logical address (bytenr) of leaves - * to an array of root IDs. + * Used by iterate_extent_inodes() and the main backref walk code + * (find_parent_nodes()). Lookup and store functions for an optional + * cache which maps the logical address (bytenr) of leaves to an array + * of root IDs. */ bool (*cache_lookup)(u64 leaf_bytenr, void *user_ctx, const u64 **root_ids_ret, int *root_count_ret); void (*cache_store)(u64 leaf_bytenr, const struct ulist *root_ids, void *user_ctx); - /* Context object to pass to @cache_lookup and @cache_store. */ + /* + * If this is not NULL, then the backref walking code will call this + * for each indirect data extent reference as soon as it finds one, + * before collecting all the remaining backrefs and before resolving + * indirect backrefs. This allows for the caller to terminate backref + * walking as soon as it finds one backref that matches some specific + * criteria. The @cache_lookup and @cache_store callbacks should not + * be NULL in order to use this callback. 
+ */ + iterate_extent_inodes_t *indirect_ref_iterator; + /* + * Context object to pass to the @cache_lookup, @cache_store and + * @indirect_ref_iterator callbacks. + */ void *user_ctx; }; @@ -145,9 +178,6 @@ struct btrfs_backref_share_check_ctx { int prev_extents_cache_slot; }; -typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes, - u64 root, void *ctx); - struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void); void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index fa94bd550564..f453f6406564 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -77,7 +77,7 @@ struct clone_root { u64 ino; u64 offset; u64 num_bytes; - u64 found_refs; + bool found_ref; }; #define SEND_CTX_MAX_NAME_CACHE_SIZE 128 @@ -1281,9 +1281,6 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; - - /* Just to check for bugs in backref resolving */ - int found_itself; }; static int __clone_root_cmp_bsearch(const void *key, const void *elt) @@ -1312,33 +1309,33 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2) /* * Called for every backref that is found for the current extent. - * Results are collected in sctx->clone_roots->ino/offset/found_refs + * Results are collected in sctx->clone_roots->ino/offset. */ -static int __iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root, - void *ctx_) +static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id, + void *ctx_) { struct backref_ctx *bctx = ctx_; - struct clone_root *found; + struct clone_root *clone_root; /* First check if the root is in the list of accepted clone sources */ - found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots, - bctx->sctx->clone_roots_cnt, - sizeof(struct clone_root), - __clone_root_cmp_bsearch); - if (!found) + clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots, + bctx->sctx->clone_roots_cnt, + sizeof(struct clone_root), + __clone_root_cmp_bsearch); + if (!clone_root) return 0; - if (found->root == bctx->sctx->send_root && + /* This is our own reference, bail out as we can't clone from it. */ + if (clone_root->root == bctx->sctx->send_root && ino == bctx->cur_objectid && - offset == bctx->cur_offset) { - bctx->found_itself = 1; - } + offset == bctx->cur_offset) + return 0; /* * Make sure we don't consider clones from send_root that are * behind the current inode/offset. */ - if (found->root == bctx->sctx->send_root) { + if (clone_root->root == bctx->sctx->send_root) { /* * If the source inode was not yet processed we can't issue a * clone operation, as the source extent does not exist yet at @@ -1359,7 +1356,7 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root, } bctx->found++; - found->found_refs++; + clone_root->found_ref = true; /* * If the given backref refers to a file extent item with a larger @@ -1367,10 +1364,17 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root, * we clone more optimally and end up doing less writes and getting * less exclusive, non-shared extents at the destination. */ - if (num_bytes > found->num_bytes) { - found->ino = ino; - found->offset = offset; - found->num_bytes = num_bytes; + if (num_bytes > clone_root->num_bytes) { + clone_root->ino = ino; + clone_root->offset = offset; + clone_root->num_bytes = num_bytes; + + /* + * Found a perfect candidate, so there's no need to continue + * backref walking. 
+ */ + if (num_bytes >= bctx->extent_len) + return BTRFS_ITERATE_EXTENT_INODES_STOP; } return 0; @@ -1392,7 +1396,8 @@ static void empty_backref_cache(struct send_ctx *sctx) static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, const u64 **root_ids_ret, int *root_count_ret) { - struct send_ctx *sctx = ctx; + struct backref_ctx *bctx = ctx; + struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; const u64 key = leaf_bytenr >> fs_info->sectorsize_bits; struct backref_cache_entry *entry; @@ -1430,7 +1435,8 @@ static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx, static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, void *ctx) { - struct send_ctx *sctx = ctx; + struct backref_ctx *bctx = ctx; + struct send_ctx *sctx = bctx->sctx; struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; struct backref_cache_entry *new_entry; struct ulist_iterator uiter; @@ -1618,7 +1624,7 @@ static int find_extent_clone(struct send_ctx *sctx, cur_clone_root->ino = (u64)-1; cur_clone_root->offset = 0; cur_clone_root->num_bytes = 0; - cur_clone_root->found_refs = 0; + cur_clone_root->found_ref = false; } backref_ctx.sctx = sctx; @@ -1628,7 +1634,7 @@ static int find_extent_clone(struct send_ctx *sctx, /* * The last extent of a file may be too large due to page alignment. * We need to adjust extent_len in this case so that the checks in - * __iterate_backrefs work. + * iterate_backrefs() work. */ if (data_offset + num_bytes >= ino_size) backref_ctx.extent_len = ino_size - data_offset; @@ -1644,9 +1650,10 @@ static int find_extent_clone(struct send_ctx *sctx, backref_walk_ctx.fs_info = fs_info; backref_walk_ctx.cache_lookup = lookup_backref_cache; backref_walk_ctx.cache_store = store_backref_cache; - backref_walk_ctx.user_ctx = sctx; + backref_walk_ctx.indirect_ref_iterator = iterate_backrefs; + backref_walk_ctx.user_ctx = &backref_ctx; - ret = iterate_extent_inodes(&backref_walk_ctx, true, __iterate_backrefs, + ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs, &backref_ctx); if (ret < 0) goto out; @@ -1671,27 +1678,21 @@ static int find_extent_clone(struct send_ctx *sctx, } up_read(&fs_info->commit_root_sem); - if (!backref_ctx.found_itself) { - /* found a bug in backref code? */ - ret = -EIO; - btrfs_err(fs_info, - "did not find backref in send_root. 
inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu", ino, data_offset, disk_byte, found_key.objectid); goto out; } btrfs_debug(fs_info, "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu", data_offset, ino, num_bytes, logical); - if (!backref_ctx.found) + if (!backref_ctx.found) { btrfs_debug(fs_info, "no clones found"); + ret = -ENOENT; + goto out; + } cur_clone_root = NULL; for (i = 0; i < sctx->clone_roots_cnt; i++) { struct clone_root *clone_root = &sctx->clone_roots[i]; - if (clone_root->found_refs == 0) + if (!clone_root->found_ref) continue; /*

From f73853c7168aef0e071c160979af05a148691e61 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 1 Nov 2022 16:15:52 +0000
Subject: [PATCH 157/249] btrfs: send: avoid double extent tree search when finding clone source

At find_extent_clone() we search twice for the extent item corresponding to
the data extent that the current file extent item points to:

1) Once with a call to extent_from_logical();

2) Once again during backref walking, through iterate_extent_inodes() which
eventually leads to find_parent_nodes() where we will search the extent tree
again for the same extent item.

The extent tree can be huge, so doing this one extra search for every extent
we want to send adds up and it's expensive.

The first call is there since the send code was introduced and it
accomplishes two things:

1) Check that the extent is flagged as a data extent in the extent tree. But
it can not be anything else, otherwise we wouldn't have a file extent item
in the send root pointing to it. This was probably added to catch bugs in
the early days, when send was still young and the interaction with
everything else was far from perfect;

2) Check how many direct references there are on the extent, and if there
are too many (more than SEND_MAX_EXTENT_REFS), avoid doing the backref
walking as it may take too long and slow down send.

So improve on this by having a callback in the backref walking code that is
called when it finds the extent item in the extent tree, and have those
checks done in the callback. When the callback returns anything different
from 0, it stops the backref walking code. This way we do a single search on
the extent tree for the extent item of our data extent.

Also, before this change we were only checking the number of references on
the data extent against SEND_MAX_EXTENT_REFS, but after starting backref
walking we will end up resolving backrefs for extent buffers in the path
from a leaf having a file extent item pointing to our data extent, up to the
roots of the trees from which the extent buffer is accessible, due to shared
subtrees resulting from snapshotting. We were therefore allowing for the
possibility of send taking too long due to some node in the path from the
leaf to a root node being shared too many times. After this change we check
for reference counts being greater than SEND_MAX_EXTENT_REFS for both data
extents and metadata extents.
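In other words, the upfront extent_from_logical() lookup is replaced by a
callback invoked from within the walk itself. A condensed sketch of the idea
(the real implementation, check_extent_item(), is in the diff below;
btrfs_extent_refs() and SEND_MAX_EXTENT_REFS are used as elsewhere in this
series):

  static int example_check_extent_item(u64 bytenr,
                                       const struct btrfs_extent_item *ei,
                                       const struct extent_buffer *leaf,
                                       void *user_ctx)
  {
          /* Called when the walk finds the extent item; any non-zero
           * return stops the walk before backrefs are resolved. */
          if (btrfs_extent_refs(leaf, ei) > SEND_MAX_EXTENT_REFS)
                  return -ENOENT;

          return 0;
  }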
This change is part of a patchset comprised of the following patches: 01/17 btrfs: fix inode list leak during backref walking at resolve_indirect_refs() 02/17 btrfs: fix inode list leak during backref walking at find_parent_nodes() 03/17 btrfs: fix ulist leaks in error paths of qgroup self tests 04/17 btrfs: remove pointless and double ulist frees in error paths of qgroup tests 05/17 btrfs: send: avoid unnecessary path allocations when finding extent clone 06/17 btrfs: send: update comment at find_extent_clone() 07/17 btrfs: send: drop unnecessary backref context field initializations 08/17 btrfs: send: avoid unnecessary backref lookups when finding clone source 09/17 btrfs: send: optimize clone detection to increase extent sharing 10/17 btrfs: use a single argument for extent offset in backref walking functions 11/17 btrfs: use a structure to pass arguments to backref walking functions 12/17 btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes() 13/17 btrfs: constify ulist parameter of ulist_next() 14/17 btrfs: send: cache leaf to roots mapping during backref walking 15/17 btrfs: send: skip unnecessary backref iterations 16/17 btrfs: send: avoid double extent tree search when finding clone source 17/17 btrfs: send: skip resolution of our own backref when finding clone source Performance test results are in the changelog of patch 17/17. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 31 ++++++++------ fs/btrfs/backref.h | 9 +++- fs/btrfs/send.c | 103 +++++++++++++++++++++------------------------ 3 files changed, 73 insertions(+), 70 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index bb59ba68d7ee..33056c4c0528 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1003,8 +1003,8 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info, * * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. 
*/ -static int add_inline_refs(const struct btrfs_fs_info *fs_info, - struct btrfs_path *path, u64 bytenr, +static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_path *path, int *info_level, struct preftrees *preftrees, struct share_check *sc) { @@ -1029,6 +1029,13 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, BUG_ON(item_size < sizeof(*ei)); ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item); + + if (ctx->check_extent_item) { + ret = ctx->check_extent_item(ctx->bytenr, ei, leaf, ctx->user_ctx); + if (ret) + return ret; + } + flags = btrfs_extent_flags(leaf, ei); btrfs_item_key_to_cpu(leaf, &found_key, slot); @@ -1064,9 +1071,9 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, switch (type) { case BTRFS_SHARED_BLOCK_REF_KEY: - ret = add_direct_ref(fs_info, preftrees, + ret = add_direct_ref(ctx->fs_info, preftrees, *info_level + 1, offset, - bytenr, 1, NULL, GFP_NOFS); + ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { struct btrfs_shared_data_ref *sdref; @@ -1075,14 +1082,14 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, sdref = (struct btrfs_shared_data_ref *)(iref + 1); count = btrfs_shared_data_ref_count(leaf, sdref); - ret = add_direct_ref(fs_info, preftrees, 0, offset, - bytenr, count, sc, GFP_NOFS); + ret = add_direct_ref(ctx->fs_info, preftrees, 0, offset, + ctx->bytenr, count, sc, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: - ret = add_indirect_ref(fs_info, preftrees, offset, + ret = add_indirect_ref(ctx->fs_info, preftrees, offset, NULL, *info_level + 1, - bytenr, 1, NULL, GFP_NOFS); + ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { struct btrfs_extent_data_ref *dref; @@ -1104,8 +1111,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info, root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(fs_info, preftrees, root, - &key, 0, bytenr, count, + ret = add_indirect_ref(ctx->fs_info, preftrees, root, + &key, 0, ctx->bytenr, count, sc, GFP_NOFS); break; @@ -1455,8 +1462,8 @@ again: if (key.objectid == ctx->bytenr && (key.type == BTRFS_EXTENT_ITEM_KEY || key.type == BTRFS_METADATA_ITEM_KEY)) { - ret = add_inline_refs(ctx->fs_info, path, ctx->bytenr, - &info_level, &preftrees, sc); + ret = add_inline_refs(ctx, path, &info_level, + &preftrees, sc); if (ret) goto out; ret = add_keyed_refs(root, path, ctx->bytenr, info_level, diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index a80d1e8be718..1bd5a15c7f9e 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -109,9 +109,14 @@ struct btrfs_backref_walk_ctx { */ iterate_extent_inodes_t *indirect_ref_iterator; /* - * Context object to pass to the @cache_lookup, @cache_store and - * @indirect_ref_iterator callbacks. + * If this is not NULL, then the backref walking code will call this for + * each extent item it's meant to process before it actually starts + * processing it. If this returns anything other than 0, then it stops + * the backref walking code immediately. */ + int (*check_extent_item)(u64 bytenr, const struct btrfs_extent_item *ei, + const struct extent_buffer *leaf, void *user_ctx); + /* Context object to pass to the callbacks defined above. 
*/ void *user_ctx; }; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f453f6406564..516b80637bfb 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1281,6 +1281,9 @@ struct backref_ctx { /* may be truncated in case it's the last extent in a file */ u64 extent_len; + + /* The bytenr the file extent item we are processing refers to. */ + u64 bytenr; }; static int __clone_root_cmp_bsearch(const void *key, const void *elt) @@ -1518,6 +1521,43 @@ static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids, sctx->backref_cache.size++; } +static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, + const struct extent_buffer *leaf, void *ctx) +{ + const u64 refs = btrfs_extent_refs(leaf, ei); + const struct backref_ctx *bctx = ctx; + const struct send_ctx *sctx = bctx->sctx; + + if (bytenr == bctx->bytenr) { + const u64 flags = btrfs_extent_flags(leaf, ei); + + if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) + return -EUCLEAN; + + /* + * If we have only one reference and only the send root as a + * clone source - meaning no clone roots were given in the + * struct btrfs_ioctl_send_args passed to the send ioctl - then + * it's our reference and there's no point in doing backref + * walking which is expensive, so exit early. + */ + if (refs == 1 && sctx->clone_roots_cnt == 1) + return -ENOENT; + } + + /* + * Backreference walking (iterate_extent_inodes() below) is currently + * too expensive when an extent has a large number of references, both + * in time spent and used memory. So for now just fallback to write + * operations instead of clone operations when an extent has more than + * a certain amount of references. + */ + if (refs > SEND_MAX_EXTENT_REFS) + return -ENOENT; + + return 0; +} + /* * Given an inode, offset and extent item, it finds a good clone for a clone * instruction. Returns -ENOENT when none could be found. The function makes @@ -1539,16 +1579,11 @@ static int find_extent_clone(struct send_ctx *sctx, u64 logical; u64 disk_byte; u64 num_bytes; - u64 extent_refs; - u64 flags = 0; struct btrfs_file_extent_item *fi; struct extent_buffer *eb = path->nodes[0]; struct backref_ctx backref_ctx = { 0 }; struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 }; struct clone_root *cur_clone_root; - struct btrfs_key found_key; - struct btrfs_path *tmp_path; - struct btrfs_extent_item *ei; int compressed; u32 i; @@ -1574,48 +1609,6 @@ static int find_extent_clone(struct send_ctx *sctx, num_bytes = btrfs_file_extent_num_bytes(eb, fi); logical = disk_byte + btrfs_file_extent_offset(eb, fi); - tmp_path = alloc_path_for_send(); - if (!tmp_path) - return -ENOMEM; - - /* We only use this path under the commit sem */ - tmp_path->need_commit_sem = 0; - - down_read(&fs_info->commit_root_sem); - ret = extent_from_logical(fs_info, disk_byte, tmp_path, - &found_key, &flags); - up_read(&fs_info->commit_root_sem); - - if (ret < 0) - goto out; - if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - ret = -EIO; - goto out; - } - - ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0], - struct btrfs_extent_item); - extent_refs = btrfs_extent_refs(tmp_path->nodes[0], ei); - /* - * Backreference walking (iterate_extent_inodes() below) is currently - * too expensive when an extent has a large number of references, both - * in time spent and used memory. So for now just fallback to write - * operations instead of clone operations when an extent has more than - * a certain amount of references. 
- * - * Also, if we have only one reference and only the send root as a clone - * source - meaning no clone roots were given in the struct - * btrfs_ioctl_send_args passed to the send ioctl - then it's our - * reference and there's no point in doing backref walking which is - * expensive, so exit early. - */ - if ((extent_refs == 1 && sctx->clone_roots_cnt == 1) || - extent_refs > SEND_MAX_EXTENT_REFS) { - ret = -ENOENT; - goto out; - } - btrfs_release_path(tmp_path); - /* * Setup the clone roots. */ @@ -1630,6 +1623,7 @@ static int find_extent_clone(struct send_ctx *sctx, backref_ctx.sctx = sctx; backref_ctx.cur_objectid = ino; backref_ctx.cur_offset = data_offset; + backref_ctx.bytenr = disk_byte; /* * The last extent of a file may be too large due to page alignment. @@ -1644,19 +1638,20 @@ static int find_extent_clone(struct send_ctx *sctx, /* * Now collect all backrefs. */ - backref_walk_ctx.bytenr = found_key.objectid; + backref_walk_ctx.bytenr = disk_byte; if (compressed == BTRFS_COMPRESS_NONE) - backref_walk_ctx.extent_item_pos = logical - found_key.objectid; + backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi); backref_walk_ctx.fs_info = fs_info; backref_walk_ctx.cache_lookup = lookup_backref_cache; backref_walk_ctx.cache_store = store_backref_cache; backref_walk_ctx.indirect_ref_iterator = iterate_backrefs; + backref_walk_ctx.check_extent_item = check_extent_item; backref_walk_ctx.user_ctx = &backref_ctx; ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs, &backref_ctx); if (ret < 0) - goto out; + return ret; down_read(&fs_info->commit_root_sem); if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { @@ -1673,8 +1668,7 @@ static int find_extent_clone(struct send_ctx *sctx, * was already reallocated after the relocation. */ up_read(&fs_info->commit_root_sem); - ret = -ENOENT; - goto out; + return -ENOENT; } up_read(&fs_info->commit_root_sem); @@ -1684,8 +1678,7 @@ static int find_extent_clone(struct send_ctx *sctx, if (!backref_ctx.found) { btrfs_debug(fs_info, "no clones found"); - ret = -ENOENT; - goto out; + return -ENOENT; } cur_clone_root = NULL; @@ -1720,8 +1713,6 @@ static int find_extent_clone(struct send_ctx *sctx, ret = -ENOENT; } -out: - btrfs_free_path(tmp_path); return ret; }

From adf0241868bd46c7be8012ef99cb88c7f47c16ce Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 1 Nov 2022 16:15:53 +0000
Subject: [PATCH 158/249] btrfs: send: skip resolution of our own backref when finding clone source

When doing backref walking to determine a source range to clone from, it is
worthless to collect and resolve our own data backref, as we obviously can't
use it as a clone source and it represents the range we want to clone into.
Collecting the backref implies doing the extra work to resolve it, searching
for a file extent item in a subvolume tree, etc. Skipping the data backref
is valid as long as we only have the send root as the single clone root,
otherwise the leaf with the file extent item may be accessible from another
clone root due to shared subtrees created by snapshots, and therefore we
have to collect the backref and resolve it.

So add a callback to the backref walking code to guide it to skip data
backrefs.
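The callback is a simple predicate over a data backref's (root, inode,
offset) triple; a minimal sketch of its shape (the actual implementation
added by this patch, skip_self_data_ref(), is in the diff below):

  /* Return true to make the backref walk ignore this data backref. */
  static bool example_skip_data_ref(u64 root, u64 ino, u64 offset, void *ctx)
  {
          const struct backref_ctx *bctx = ctx;

          return ino == bctx->cur_objectid &&
                 root == bctx->backref_owner &&
                 offset == bctx->backref_offset;
  }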
This change is part of a patchset comprised of the following patches: 01/17 btrfs: fix inode list leak during backref walking at resolve_indirect_refs() 02/17 btrfs: fix inode list leak during backref walking at find_parent_nodes() 03/17 btrfs: fix ulist leaks in error paths of qgroup self tests 04/17 btrfs: remove pointless and double ulist frees in error paths of qgroup tests 05/17 btrfs: send: avoid unnecessary path allocations when finding extent clone 06/17 btrfs: send: update comment at find_extent_clone() 07/17 btrfs: send: drop unnecessary backref context field initializations 08/17 btrfs: send: avoid unnecessary backref lookups when finding clone source 09/17 btrfs: send: optimize clone detection to increase extent sharing 10/17 btrfs: use a single argument for extent offset in backref walking functions 11/17 btrfs: use a structure to pass arguments to backref walking functions 12/17 btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes() 13/17 btrfs: constify ulist parameter of ulist_next() 14/17 btrfs: send: cache leaf to roots mapping during backref walking 15/17 btrfs: send: skip unnecessary backref iterations 16/17 btrfs: send: avoid double extent tree search when finding clone source 17/17 btrfs: send: skip resolution of our own backref when finding clone source The following test was run on non-debug kernel (Debian's default kernel config) before and after applying the patchset: $ cat test-send-many-shared-extents.sh #!/bin/bash DEV=/dev/sdh MNT=/mnt/sdh umount $DEV &> /dev/null mkfs.btrfs -f $DEV mount $DEV $MNT num_files=50000 num_clones_per_file=50 for ((i = 1; i <= $num_files; i++)); do xfs_io -f -c "pwrite 0 64K" $MNT/file_$i > /dev/null echo -ne "\r$i files created..." done echo btrfs subvolume snapshot -r $MNT $MNT/snap1 cloned=0 for ((i = 1; i <= $num_clones_per_file; i++)); do for ((j = 1; j <= $num_files; j++)); do cp --reflink=always $MNT/file_$j $MNT/file_${j}_clone_${i} cloned=$((cloned + 1)) echo -ne "\r$cloned / $((num_files * num_clones_per_file)) clone operations" done done echo btrfs subvolume snapshot -r $MNT $MNT/snap2 # Unmount and mount again to clear all cached metadata (and data). umount $DEV mount $DEV $MNT start=$(date +%s%N) btrfs send $MNT/snap2 > /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000000 )) echo -e "\nFull send took $dur seconds" # Unmount and mount again to clear all cached metadata (and data). umount $DEV mount $DEV $MNT start=$(date +%s%N) btrfs send -p $MNT/snap1 $MNT/snap2 > /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000000 )) echo -e "\nIncremental send took $dur seconds" umount $MNT Before applying the patchset: (...) Full send took 1108 seconds (...) Incremental send took 1135 seconds After applying the whole patchset: (...) Full send took 268 seconds (-75.8%) (...) 
Incremental send took 316 seconds (-72.2%) Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/backref.c | 35 +++++++++++++++++++++-------------- fs/btrfs/backref.h | 9 +++++++++ fs/btrfs/send.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 33056c4c0528..430974cf3b96 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1111,10 +1111,12 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(ctx->fs_info, preftrees, root, - &key, 0, ctx->bytenr, count, - sc, GFP_NOFS); - + if (!ctx->skip_data_ref || + !ctx->skip_data_ref(root, key.objectid, key.offset, + ctx->user_ctx)) + ret = add_indirect_ref(ctx->fs_info, preftrees, + root, &key, 0, ctx->bytenr, + count, sc, GFP_NOFS); break; } default: @@ -1133,8 +1135,9 @@ static int add_inline_refs(struct btrfs_backref_walk_ctx *ctx, * * Returns 0 on success, <0 on error, or BACKREF_FOUND_SHARED. */ -static int add_keyed_refs(struct btrfs_root *extent_root, - struct btrfs_path *path, u64 bytenr, +static int add_keyed_refs(struct btrfs_backref_walk_ctx *ctx, + struct btrfs_root *extent_root, + struct btrfs_path *path, int info_level, struct preftrees *preftrees, struct share_check *sc) { @@ -1157,7 +1160,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root, leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, slot); - if (key.objectid != bytenr) + if (key.objectid != ctx->bytenr) break; if (key.type < BTRFS_TREE_BLOCK_REF_KEY) continue; @@ -1169,7 +1172,7 @@ static int add_keyed_refs(struct btrfs_root *extent_root, /* SHARED DIRECT METADATA backref */ ret = add_direct_ref(fs_info, preftrees, info_level + 1, key.offset, - bytenr, 1, NULL, GFP_NOFS); + ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_SHARED_DATA_REF_KEY: { /* SHARED DIRECT FULL backref */ @@ -1180,14 +1183,14 @@ static int add_keyed_refs(struct btrfs_root *extent_root, struct btrfs_shared_data_ref); count = btrfs_shared_data_ref_count(leaf, sdref); ret = add_direct_ref(fs_info, preftrees, 0, - key.offset, bytenr, count, + key.offset, ctx->bytenr, count, sc, GFP_NOFS); break; } case BTRFS_TREE_BLOCK_REF_KEY: /* NORMAL INDIRECT METADATA backref */ ret = add_indirect_ref(fs_info, preftrees, key.offset, - NULL, info_level + 1, bytenr, + NULL, info_level + 1, ctx->bytenr, 1, NULL, GFP_NOFS); break; case BTRFS_EXTENT_DATA_REF_KEY: { @@ -1211,9 +1214,13 @@ static int add_keyed_refs(struct btrfs_root *extent_root, } root = btrfs_extent_data_ref_root(leaf, dref); - ret = add_indirect_ref(fs_info, preftrees, root, - &key, 0, bytenr, count, - sc, GFP_NOFS); + + if (!ctx->skip_data_ref || + !ctx->skip_data_ref(root, key.objectid, key.offset, + ctx->user_ctx)) + ret = add_indirect_ref(fs_info, preftrees, root, + &key, 0, ctx->bytenr, + count, sc, GFP_NOFS); break; } default: @@ -1466,7 +1473,7 @@ again: &preftrees, sc); if (ret) goto out; - ret = add_keyed_refs(root, path, ctx->bytenr, info_level, + ret = add_keyed_refs(ctx, root, path, info_level, &preftrees, sc); if (ret) goto out; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 1bd5a15c7f9e..ef6bbea3f456 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -116,6 +116,15 @@ struct btrfs_backref_walk_ctx { */ int (*check_extent_item)(u64 bytenr, const struct btrfs_extent_item *ei, const struct extent_buffer *leaf, void *user_ctx); + /* + * If this is not NULL, then the backref walking code will call this for 
+ * each extent data ref it finds (BTRFS_EXTENT_DATA_REF_KEY keys) before + * processing that data ref. If this callback returns true, then the backref + * walking code will ignore this data ref and will never resolve the indirect + * data ref, saving time searching for leaves in a fs tree with file extent + * items matching the data ref. + */ + bool (*skip_data_ref)(u64 root, u64 ino, u64 offset, void *user_ctx); /* Context object to pass to the callbacks defined above. */ void *user_ctx; };

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 516b80637bfb..383bc8a5cb6c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1284,6 +1284,10 @@ struct backref_ctx { /* The bytenr the file extent item we are processing refers to. */ u64 bytenr; + /* The owner (root id) of the data backref for the current extent. */ + u64 backref_owner; + /* The offset of the data backref for the current extent. */ + u64 backref_offset; }; static int __clone_root_cmp_bsearch(const void *key, const void *elt) @@ -1558,6 +1562,18 @@ static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei, return 0; } +static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx) +{ + const struct backref_ctx *bctx = ctx; + + if (ino == bctx->cur_objectid && + root == bctx->backref_owner && + offset == bctx->backref_offset) + return true; + + return false; +} + /* * Given an inode, offset and extent item, it finds a good clone for a clone * instruction. Returns -ENOENT when none could be found. The function makes @@ -1624,6 +1640,12 @@ static int find_extent_clone(struct send_ctx *sctx, backref_ctx.cur_objectid = ino; backref_ctx.cur_offset = data_offset; backref_ctx.bytenr = disk_byte; + /* + * Use the header owner and not the send root's id, because in case of a + * snapshot we can have shared subtrees. + */ + backref_ctx.backref_owner = btrfs_header_owner(eb); + backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi); /* * The last extent of a file may be too large due to page alignment. @@ -1648,6 +1670,17 @@ static int find_extent_clone(struct send_ctx *sctx, backref_walk_ctx.check_extent_item = check_extent_item; backref_walk_ctx.user_ctx = &backref_ctx; + /* + * If we have a single clone root, then it's the send root and we can tell + * the backref walking code to skip our own backref and not resolve it, + * since we can not use it for cloning - the source and destination + * ranges can't overlap and in case the leaf is shared through a subtree + * due to snapshots, we can't use those other roots since they are not + * in the list of clone roots.
+ */ + if (sctx->clone_roots_cnt == 1) + backref_walk_ctx.skip_data_ref = skip_self_data_ref; + ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs, &backref_ctx); if (ret < 0)

From e2a041657774102b184cc41369a53be3d024e8ee Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Tue, 1 Nov 2022 16:15:54 +0000
Subject: [PATCH 159/249] btrfs: send: bump the extent reference count limit for backref walking

After the previous patchset, which is comprised of the following patches:

01/17 btrfs: fix inode list leak during backref walking at resolve_indirect_refs()
02/17 btrfs: fix inode list leak during backref walking at find_parent_nodes()
03/17 btrfs: fix ulist leaks in error paths of qgroup self tests
04/17 btrfs: remove pointless and double ulist frees in error paths of qgroup tests
05/17 btrfs: send: avoid unnecessary path allocations when finding extent clone
06/17 btrfs: send: update comment at find_extent_clone()
07/17 btrfs: send: drop unnecessary backref context field initializations
08/17 btrfs: send: avoid unnecessary backref lookups when finding clone source
09/17 btrfs: send: optimize clone detection to increase extent sharing
10/17 btrfs: use a single argument for extent offset in backref walking functions
11/17 btrfs: use a structure to pass arguments to backref walking functions
12/17 btrfs: reuse roots ulist on each leaf iteration for iterate_extent_inodes()
13/17 btrfs: constify ulist parameter of ulist_next()
14/17 btrfs: send: cache leaf to roots mapping during backref walking
15/17 btrfs: send: skip unnecessary backref iterations
16/17 btrfs: send: avoid double extent tree search when finding clone source
17/17 btrfs: send: skip resolution of our own backref when finding clone source

we now have much better performance when doing backref walking in the send
code, so we can increase the current limit from 64 to 1024 references. This
limit is still a bit conservative because there are still edge cases where
backref walking will be too slow and spend a lot of cpu time, do some IO to
read b+tree nodes/leaves, and use a lot of memory. The goal is to eventually
get rid of any limit, but for now bump it as it benefits users with extents
shared more than 64 times and up to 1024 times, allowing for more
deduplication at the destination without having to run a dedupe tool after a
receive.

Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/send.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 383bc8a5cb6c..67f7c698ade3 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -39,7 +39,7 @@ * avoid hitting limitations of the backreference walking code (taking a lot of * time and using too much memory for extents with large number of references). */ -#define SEND_MAX_EXTENT_REFS 64 +#define SEND_MAX_EXTENT_REFS 1024 /* * A fs_path is a helper to dynamically build path names with unknown size.

From 9e5e6d4e2e5e37e28bf2be85fa08761e33aa5efb Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Mon, 31 Oct 2022 20:33:40 +0100
Subject: [PATCH 160/249] btrfs: zlib: use copy_page for full page copy

The copy_page helper may use an optimized version for a full page copy
(e.g. on s390 there's a special instruction for that); there's one more
place left to convert.
Reviewed-by: Anand Jain
Signed-off-by: David Sterba
---
 fs/btrfs/zlib.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index b4f44662cda7..c5275cb23837 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -155,8 +155,8 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
 				in_page = find_get_page(mapping, start >> PAGE_SHIFT);
 				data_in = kmap_local_page(in_page);
-				memcpy(workspace->buf + i * PAGE_SIZE,
-				       data_in, PAGE_SIZE);
+				copy_page(workspace->buf + i * PAGE_SIZE,
+					  data_in);
 				start += PAGE_SIZE;
 			}
 			workspace->strm.next_in = workspace->buf;

From fd463ac4616e69c7306c680dabfe857a2f50fc69 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Mon, 31 Oct 2022 20:33:42 +0100
Subject: [PATCH 161/249] btrfs: zoned: use helper to check a power of two zone size

We have a 64bit compatible helper to check if a value is a power of
two; use it instead of open coding the check.

Reviewed-by: Anand Jain
Signed-off-by: David Sterba
---
 fs/btrfs/zoned.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 13034c3bbb65..95dec93374ea 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -394,8 +394,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
 		zone_sectors = bdev_zone_sectors(bdev);
 	}

-	/* Check if it's power of 2 (see is_power_of_2) */
-	ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
+	ASSERT(is_power_of_two_u64(zone_sectors));
 	zone_info->zone_size = zone_sectors << SECTOR_SHIFT;

 	/* We reject devices with a zone size larger than 8GB */

From 0d7764ff58b4b45c39eb03f2c74a819c1a88fa7b Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Mon, 31 Oct 2022 20:33:44 +0100
Subject: [PATCH 162/249] btrfs: convert btrfs_block_group::needs_free_space to runtime flag

We already have runtime flags in the block group to track various
status bits; convert needs_free_space as well and reduce the size of
btrfs_block_group.

Reviewed-by: Anand Jain
Signed-off-by: David Sterba
---
 fs/btrfs/block-group.c                 |  2 +-
 fs/btrfs/block-group.h                 |  8 ++------
 fs/btrfs/free-space-tree.c             | 10 +++++-----
 fs/btrfs/tests/free-space-tree-tests.c |  2 +-
 4 files changed, 9 insertions(+), 13 deletions(-)

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index ea5fcb665daa..708d843daa72 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2553,7 +2553,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
 	cache->global_root_id = calculate_global_root_id(fs_info, cache->start);

 	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
-		cache->needs_free_space = 1;
+		set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);

 	ret = btrfs_load_block_group_zone_info(cache, true);
 	if (ret) {
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 6c970a486b68..4cee23c11938 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -55,6 +55,8 @@ enum btrfs_block_group_flags {
 	BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
 	BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
 	BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
+	/* Does the block group need to be added to the free space tree? */
+	BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
 };

 enum btrfs_caching_type {
@@ -208,12 +210,6 @@ struct btrfs_block_group {
 	/* Lock for free space tree operations. */
 	struct mutex free_space_lock;

-	/*
-	 * Does the block group need to be added to the free space tree?
-	 * Protected by free_space_lock.
-	 */
-	int needs_free_space;
-
 	/* Flag indicating this block group is placed on a sequential zone */
 	bool seq_zone;

diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
index 869d062d6765..c667e878ef1a 100644
--- a/fs/btrfs/free-space-tree.c
+++ b/fs/btrfs/free-space-tree.c
@@ -808,7 +808,7 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
 	u32 flags;
 	int ret;

-	if (block_group->needs_free_space) {
+	if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
 		ret = __add_block_group_free_space(trans, block_group, path);
 		if (ret)
 			return ret;
@@ -1001,7 +1001,7 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
 	u32 flags;
 	int ret;

-	if (block_group->needs_free_space) {
+	if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
 		ret = __add_block_group_free_space(trans, block_group, path);
 		if (ret)
 			return ret;
@@ -1304,7 +1304,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
 {
 	int ret;

-	block_group->needs_free_space = 0;
+	clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags);

 	ret = add_new_free_space_info(trans, block_group, path);
 	if (ret)
@@ -1326,7 +1326,7 @@ int add_block_group_free_space(struct btrfs_trans_handle *trans,
 		return 0;

 	mutex_lock(&block_group->free_space_lock);
-	if (!block_group->needs_free_space)
+	if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags))
 		goto out;

 	path = btrfs_alloc_path();
@@ -1359,7 +1359,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
 	if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
 		return 0;

-	if (block_group->needs_free_space) {
+	if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
 		/* We never added this block group to the free space tree. */
 		return 0;
 	}
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index 53a17b1d1744..b61972046feb 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -471,7 +471,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
 	}
 	cache->bitmap_low_thresh = 0;
 	cache->bitmap_high_thresh = (u32)-1;
-	cache->needs_free_space = 1;
+	set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
 	cache->fs_info = root->fs_info;

 	btrfs_init_dummy_trans(&trans, root->fs_info);

From 961f5b8bf48a463ac5fe5b13143426d79eb41817 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Mon, 31 Oct 2022 20:33:46 +0100
Subject: [PATCH 163/249] btrfs: convert btrfs_block_group::seq_zone to runtime flag

In zoned mode, whether a block group is placed on a sequential zone can
also be tracked in the block group's runtime flags.

Reviewed-by: Anand Jain
Signed-off-by: David Sterba
---
 fs/btrfs/block-group.h | 5 ++---
 fs/btrfs/zoned.c       | 7 ++++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 4cee23c11938..a02ea76fd6cf 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -57,6 +57,8 @@ enum btrfs_block_group_flags {
 	BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
 	/* Does the block group need to be added to the free space tree? */
 	BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
+	/* Indicate that the block group is placed on a sequential zone */
+	BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
 };

 enum btrfs_caching_type {
@@ -210,9 +212,6 @@ struct btrfs_block_group {
 	/* Lock for free space tree operations. */
 	struct mutex free_space_lock;

-	/* Flag indicating this block group is placed on a sequential zone */
-	bool seq_zone;
-
 	/*
 	 * Number of extents in this block group used for swap files.
 	 * All accesses protected by the spinlock 'lock'.
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 95dec93374ea..a759668477bb 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -1437,7 +1437,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
 	}

 	if (num_sequential > 0)
-		cache->seq_zone = true;
+		set_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);

 	if (num_conventional > 0) {
 		/* Zone capacity is always zone size in emulation */
@@ -1649,7 +1649,7 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
 	if (!cache)
 		return false;

-	ret = cache->seq_zone;
+	ret = !!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &cache->runtime_flags);
 	btrfs_put_block_group(cache);

 	return ret;
@@ -2154,7 +2154,8 @@ static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
 void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
 				   struct extent_buffer *eb)
 {
-	if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
+	if (!test_bit(BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, &bg->runtime_flags) ||
+	    eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
 		return;

 	if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {

From 19af6a7d345acc885f970d57577fa80ca4ad3d98 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Thu, 27 Oct 2022 02:02:32 +0200
Subject: [PATCH 164/249] btrfs: change how repair action is passed to btrfs_repair_one_sector

There's a function pointer passed to btrfs_repair_one_sector that will
submit the right bio for repair. However, there are only two callbacks,
one for buffered and one for direct IO. This can be simplified to a
bool-based switch that calls either function directly; an indirect call
is an unnecessary abstraction here. This also allows us to remove the
submit_bio_hook_t typedef.
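[ Editor's note: a minimal self-contained sketch of this kind of
  simplification; all names below are stand-ins, not the btrfs
  functions. With a closed set of exactly two implementations, a bool
  plus direct calls is simpler to follow and cheaper than an indirect
  call, notably when retpoline-style mitigations are enabled. ]

#include <stdbool.h>
#include <stdio.h>

static void submit_buffered_read(int mirror_num)
{
	printf("buffered read repair, mirror %d\n", mirror_num);
}

static void submit_direct_read(int mirror_num)
{
	printf("direct IO read repair, mirror %d\n", mirror_num);
}

/* After the change: a bool selects the path and both calls are direct. */
static void repair_one_sector(int mirror_num, bool submit_buffered)
{
	if (submit_buffered)
		submit_buffered_read(mirror_num);
	else
		submit_direct_read(mirror_num);
}

int main(void)
{
	repair_one_sector(1, true);
	repair_one_sector(2, false);
	return 0;
}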
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 +++ fs/btrfs/compression.c | 2 +- fs/btrfs/extent_io.c | 14 +++++++++----- fs/btrfs/extent_io.h | 6 +----- fs/btrfs/inode.c | 9 ++++----- 5 files changed, 18 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index d21c30bf7053..fa0c72cabd8f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -414,6 +414,9 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); +void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int mirror_num, + enum btrfs_compression_type compress_type); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index e66fa18dbcf7..cf3dc7e501ec 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -196,7 +196,7 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) refcount_inc(&cb->pending_ios); ret = btrfs_repair_one_sector(inode, bbio, offset, bv.bv_page, bv.bv_offset, - btrfs_submit_data_read_bio); + true); if (ret) { refcount_dec(&cb->pending_ios); status = errno_to_blk_status(ret); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2ec989b83f54..44ff41304247 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -797,7 +797,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, u32 bio_offset, struct page *page, unsigned int pgoff, - submit_bio_hook_t *submit_bio_hook) + bool submit_buffered) { u64 start = failed_bbio->file_offset + bio_offset; struct io_failure_record *failrec; @@ -856,11 +856,15 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, failrec->this_mirror); /* - * At this point we have a bio, so any errors from submit_bio_hook() - * will be handled by the endio on the repair_bio, so we can't return an + * At this point we have a bio, so any errors from bio submission will + * be handled by the endio on the repair_bio, so we can't return an * error here. 
	 */
-	submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0);
+	if (submit_buffered)
+		btrfs_submit_data_read_bio(inode, repair_bio, failrec->this_mirror, 0);
+	else
+		btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror, 0);
+
 	return BLK_STS_OK;
 }

@@ -951,7 +955,7 @@ static void submit_data_read_repair(struct inode *inode,

 		ret = btrfs_repair_one_sector(inode, failed_bbio,
 					      bio_offset + offset, page,
 					      pgoff + offset,
-					      btrfs_submit_data_read_bio);
+					      true);
 		if (!ret) {
 			/*
 			 * We have submitted the read repair, the page release
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a5ec1475988f..321680f229c6 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -70,10 +70,6 @@ struct extent_io_tree;
 int __init extent_buffer_init_cachep(void);
 void __cold extent_buffer_free_cachep(void);

-typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
-				 int mirror_num,
-				 enum btrfs_compression_type compress_type);
-
 typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode,
 		struct bio *bio, u64 dio_file_offset);

@@ -277,7 +273,7 @@ struct io_failure_record {

 int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
 			    u32 bio_offset, struct page *page, unsigned int pgoff,
-			    submit_bio_hook_t *submit_bio_hook);
+			    bool submit_buffered);
 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end);
 int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
 			   struct page *page, unsigned int pg_offset);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1422f072a7b8..8cc1283fb347 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7923,9 +7923,9 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
 	bio_endio(&dip->bio);
 }

-static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
-				  int mirror_num,
-				  enum btrfs_compression_type compress_type)
+void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio,
+				 int mirror_num,
+				 enum btrfs_compression_type compress_type)
 {
 	struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
@@ -7960,8 +7960,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
 			int ret;

 			ret = btrfs_repair_one_sector(inode, bbio, offset,
-					bv.bv_page, bv.bv_offset,
-					submit_dio_repair_bio);
+					bv.bv_page, bv.bv_offset, false);
 			if (ret)
 				err = errno_to_blk_status(ret);
 		}

From 7920b773bd8a458c05b2ba4581fe19c5704fffd7 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Thu, 27 Oct 2022 02:07:10 +0200
Subject: [PATCH 165/249] btrfs: drop parameter compression_type from btrfs_submit_dio_repair_bio

Compression and direct IO don't work together, so the compression
parameter can be dropped after the previous patch, which changed the
indirect call to a direct one.
Reviewed-by: Anand Jain
Signed-off-by: David Sterba
---
 fs/btrfs/btrfs_inode.h | 4 +---
 fs/btrfs/extent_io.c   | 2 +-
 fs/btrfs/inode.c       | 4 +---
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index fa0c72cabd8f..62019d7c1cbd 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -414,9 +414,7 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags,
 void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num);
 void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
 			int mirror_num, enum btrfs_compression_type compress_type);
-void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio,
-				 int mirror_num,
-				 enum btrfs_compression_type compress_type);
+void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num);
 int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
 			    u32 pgoff, u8 *csum, const u8 * const csum_expected);
 int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 44ff41304247..cc05ae772fa5 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -863,7 +863,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
 	if (submit_buffered)
 		btrfs_submit_data_read_bio(inode, repair_bio, failrec->this_mirror, 0);
 	else
-		btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror, 0);
+		btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror);

 	return BLK_STS_OK;
 }
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8cc1283fb347..74ed0afe4c2f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7923,9 +7923,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
 	bio_endio(&dip->bio);
 }

-void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio,
-				 int mirror_num,
-				 enum btrfs_compression_type compress_type)
+void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num)
 {
 	struct btrfs_dio_private *dip = btrfs_bio(bio)->private;
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

From ab2072b2921eced166dbdec1d65b0284b6eba2b9 Mon Sep 17 00:00:00 2001
From: David Sterba
Date: Thu, 27 Oct 2022 02:22:19 +0200
Subject: [PATCH 166/249] btrfs: change how submit bio callback is passed to btrfs_wq_submit_bio

There's a callback function parameter for btrfs_wq_submit_bio that can
be one of: metadata, buffered data, direct io data. The callback
abstraction is unnecessary as we have all the functions available.

Replace the parameter with a command that leads to a direct call in
run_one_async_start. The called functions can then be simplified and we
can also remove the extent_submit_bio_start_t typedef.
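[ Editor's note: the same idea as the earlier bool-based switch,
  extended to three cases, as a hedged sketch. The enum values mirror
  the patch's enum btrfs_wq_submit_cmd; the handlers are stand-ins for
  the real checksum functions. ]

#include <stdio.h>

enum wq_submit_cmd {
	WQ_SUBMIT_METADATA,
	WQ_SUBMIT_DATA,
	WQ_SUBMIT_DATA_DIO,
};

static void csum_metadata_bio(void) { puts("checksum a metadata bio"); }
static void csum_data_bio(void)     { puts("checksum a buffered data bio"); }
static void csum_data_dio_bio(void) { puts("checksum a direct IO data bio"); }

/* Worker-side dispatch: a switch on the stored command replaces the
 * stored function pointer. */
static void run_one_async_start(enum wq_submit_cmd cmd)
{
	switch (cmd) {
	case WQ_SUBMIT_METADATA:
		csum_metadata_bio();
		break;
	case WQ_SUBMIT_DATA:
		csum_data_bio();
		break;
	case WQ_SUBMIT_DATA_DIO:
		csum_data_dio_bio();
		break;
	}
}

int main(void)
{
	run_one_async_start(WQ_SUBMIT_DATA);
	return 0;
}

This works because the set of submitters is closed: a small enum keeps
every call target visible to the compiler, at the cost of having to
extend the switch if a fourth case ever appears.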
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 5 +++++ fs/btrfs/disk-io.c | 33 +++++++++++++++++++++++---------- fs/btrfs/disk-io.h | 12 ++++++++++-- fs/btrfs/extent_io.h | 3 --- fs/btrfs/inode.c | 15 +++++++-------- 5 files changed, 45 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 62019d7c1cbd..72cf235b7beb 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -415,6 +415,11 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); +blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, + u64 dio_file_offset); +blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, + struct bio *bio, + u64 dio_file_offset); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e5687bdff150..394dfe5ff567 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -86,10 +86,10 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) struct async_submit_bio { struct inode *inode; struct bio *bio; - extent_submit_bio_start_t *submit_bio_start; + enum btrfs_wq_submit_cmd submit_cmd; int mirror_num; - /* Optional parameter for submit_bio_start used by direct io */ + /* Optional parameter for used by direct io */ u64 dio_file_offset; struct btrfs_work work; blk_status_t status; @@ -637,8 +637,22 @@ static void run_one_async_start(struct btrfs_work *work) blk_status_t ret; async = container_of(work, struct async_submit_bio, work); - ret = async->submit_bio_start(async->inode, async->bio, - async->dio_file_offset); + switch (async->submit_cmd) { + case WQ_SUBMIT_METADATA: + ret = btree_submit_bio_start(async->inode, async->bio, + async->dio_file_offset); + break; + case WQ_SUBMIT_DATA: + ret = btrfs_submit_bio_start(async->inode, async->bio, + async->dio_file_offset); + + break; + case WQ_SUBMIT_DATA_DIO: + ret = btrfs_submit_bio_start_direct_io(async->inode, async->bio, + async->dio_file_offset); + + break; + } if (ret) async->status = ret; } @@ -689,8 +703,7 @@ static void run_one_async_free(struct btrfs_work *work) * - false in case of error */ bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start) + u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct async_submit_bio *async; @@ -702,7 +715,7 @@ bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, async->inode = inode; async->bio = bio; async->mirror_num = mirror_num; - async->submit_bio_start = submit_bio_start; + async->submit_cmd = cmd; btrfs_init_work(&async->work, run_one_async_start, run_one_async_done, run_one_async_free); @@ -736,8 +749,8 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) return errno_to_blk_status(ret); } -static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset) +blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, + u64 dio_file_offset) { /* * when we're called for a write, we're already in 
the async @@ -776,7 +789,7 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ * happen in parallel across all CPUs. */ if (should_async_write(fs_info, BTRFS_I(inode)) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start)) + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) return; ret = btree_csum_one_bio(bio); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 6edc66b4b4d3..5998b2589830 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -113,9 +113,17 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, int level, struct btrfs_key *first_key); + +enum btrfs_wq_submit_cmd { + WQ_SUBMIT_METADATA, + WQ_SUBMIT_DATA, + WQ_SUBMIT_DATA_DIO, +}; + bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, - u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start); + u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); +blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, + u64 dio_file_offset); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 321680f229c6..b3d4b568fe33 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -70,9 +70,6 @@ struct extent_io_tree; int __init extent_buffer_init_cachep(void); void __cold extent_buffer_free_cachep(void); -typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, - struct bio *bio, u64 dio_file_offset); - #define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE) struct extent_buffer { u64 start; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 74ed0afe4c2f..06931c20ebfa 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2550,8 +2550,8 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -static blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset) +blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, + u64 dio_file_offset) { return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); } @@ -2758,8 +2758,7 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && !btrfs_is_data_reloc_root(bi->root)) { if (!atomic_read(&bi->sync_writers) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, - btrfs_submit_bio_start)) + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) return; ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); @@ -7967,9 +7966,9 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, return err; } -static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, - struct bio *bio, - u64 dio_file_offset) +blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, + struct bio *bio, + u64 dio_file_offset) { return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false); } @@ -8017,7 +8016,7 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, /* Check btrfs_submit_data_write_bio() for async submit rules */ if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && btrfs_wq_submit_bio(inode, bio, 0, file_offset, - 
btrfs_submit_bio_start_direct_io)) + WQ_SUBMIT_DATA_DIO)) return; /* From ad65ecf30b037549a17d2e402fac8d39242073d1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:30:45 +0200 Subject: [PATCH 167/249] btrfs: simplify btree_submit_bio_start and btrfs_submit_bio_start parameters After previous patches the unused parameters can be removed from btree_submit_bio_start and btrfs_submit_bio_start as they don't need to conform to the extent_submit_bio_start_t typedef. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 3 +-- fs/btrfs/disk-io.c | 11 +++-------- fs/btrfs/disk-io.h | 3 +-- fs/btrfs/inode.c | 3 +-- 4 files changed, 6 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 72cf235b7beb..54bf002e0013 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -415,8 +415,7 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); -blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset); +blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio); blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, u64 dio_file_offset); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 394dfe5ff567..7bc0fee4da13 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -639,18 +639,14 @@ static void run_one_async_start(struct btrfs_work *work) async = container_of(work, struct async_submit_bio, work); switch (async->submit_cmd) { case WQ_SUBMIT_METADATA: - ret = btree_submit_bio_start(async->inode, async->bio, - async->dio_file_offset); + ret = btree_submit_bio_start(async->bio); break; case WQ_SUBMIT_DATA: - ret = btrfs_submit_bio_start(async->inode, async->bio, - async->dio_file_offset); - + ret = btrfs_submit_bio_start(async->inode, async->bio); break; case WQ_SUBMIT_DATA_DIO: ret = btrfs_submit_bio_start_direct_io(async->inode, async->bio, async->dio_file_offset); - break; } if (ret) @@ -749,8 +745,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio) return errno_to_blk_status(ret); } -blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset) +blk_status_t btree_submit_bio_start(struct bio *bio) { /* * when we're called for a write, we're already in the async diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 5998b2589830..d5b25fa8037b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -122,8 +122,7 @@ enum btrfs_wq_submit_cmd { bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); -blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset); +blk_status_t btree_submit_bio_start(struct bio *bio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06931c20ebfa..cd9325f51ffd 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2550,8 +2550,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -blk_status_t 
btrfs_submit_bio_start(struct inode *inode, struct bio *bio, - u64 dio_file_offset) +blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio) { return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); } From da67daab8dd679b9a10dee86520c530011342a8d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:40:36 +0200 Subject: [PATCH 168/249] btrfs: switch async_submit_bio::inode to btrfs_inode The async bio submit is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 7bc0fee4da13..44806ccad43c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -84,7 +84,7 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) * just before they are sent down the IO stack. */ struct async_submit_bio { - struct inode *inode; + struct btrfs_inode *inode; struct bio *bio; enum btrfs_wq_submit_cmd submit_cmd; int mirror_num; @@ -642,11 +642,11 @@ static void run_one_async_start(struct btrfs_work *work) ret = btree_submit_bio_start(async->bio); break; case WQ_SUBMIT_DATA: - ret = btrfs_submit_bio_start(async->inode, async->bio); + ret = btrfs_submit_bio_start(&async->inode->vfs_inode, async->bio); break; case WQ_SUBMIT_DATA_DIO: - ret = btrfs_submit_bio_start_direct_io(async->inode, async->bio, - async->dio_file_offset); + ret = btrfs_submit_bio_start_direct_io(&async->inode->vfs_inode, + async->bio, async->dio_file_offset); break; } if (ret) @@ -665,7 +665,7 @@ static void run_one_async_done(struct btrfs_work *work) { struct async_submit_bio *async = container_of(work, struct async_submit_bio, work); - struct inode *inode = async->inode; + struct btrfs_inode *inode = async->inode; struct btrfs_bio *bbio = btrfs_bio(async->bio); /* If an error occurred we just want to clean up the bio and move on */ @@ -680,7 +680,7 @@ static void run_one_async_done(struct btrfs_work *work) * This changes nothing when cgroups aren't in use. */ async->bio->bi_opf |= REQ_CGROUP_PUNT; - btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); + btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num); } static void run_one_async_free(struct btrfs_work *work) @@ -708,7 +708,7 @@ bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, if (!async) return false; - async->inode = inode; + async->inode = BTRFS_I(inode); async->bio = bio; async->mirror_num = mirror_num; async->submit_cmd = cmd; From 882681ac98837aa7683a59020c0c98d05479805f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 169/249] btrfs: pass btrfs_inode to btrfs_submit_bio_start The function is for internal interfaces so we should use the btrfs_inode. 
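[ Editor's note: the long run of "pass btrfs_inode to ..." patches that
  follows all applies one pattern: internal helpers take the
  btrfs-specific inode, and only VFS boundary code converts between the
  two. BTRFS_I() below matches its real definition in
  fs/btrfs/btrfs_inode.h; the structure is trimmed to the relevant
  part for illustration. ]

#include <linux/fs.h>

struct btrfs_root;

struct btrfs_inode {
	/* Which root this inode belongs to. */
	struct btrfs_root *root;
	/* ... many btrfs-specific fields ... */
	struct inode vfs_inode;	/* the embedded VFS inode */
};

/* Recover the containing btrfs_inode from an embedded VFS inode. */
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
	return container_of(inode, struct btrfs_inode, vfs_inode);
}

Taking struct btrfs_inode directly lets a helper reach
inode->root->fs_info without the btrfs_sb(inode->i_sb) detour and
removes repeated BTRFS_I() conversions on hot paths, as the diffs in
these patches show.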
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/inode.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 54bf002e0013..4ec6a74dd6ba 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -415,7 +415,7 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); -blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio); +blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, struct bio *bio, u64 dio_file_offset); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 44806ccad43c..3e42fab48e6b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -642,7 +642,7 @@ static void run_one_async_start(struct btrfs_work *work) ret = btree_submit_bio_start(async->bio); break; case WQ_SUBMIT_DATA: - ret = btrfs_submit_bio_start(&async->inode->vfs_inode, async->bio); + ret = btrfs_submit_bio_start(async->inode, async->bio); break; case WQ_SUBMIT_DATA_DIO: ret = btrfs_submit_bio_start_direct_io(&async->inode->vfs_inode, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cd9325f51ffd..aa55cf6e763e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2550,9 +2550,9 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * At IO completion time the cums attached on the ordered extent record * are inserted into the btree */ -blk_status_t btrfs_submit_bio_start(struct inode *inode, struct bio *bio) +blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); + return btrfs_csum_one_bio(inode, bio, (u64)-1, false); } /* From bfa17066822cead4fb8a6a5d4a214a2c527e4614 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 170/249] btrfs: pass btrfs_inode to btrfs_submit_bio_start_direct_io The function is for internal interfaces so we should use the btrfs_inode. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/inode.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4ec6a74dd6ba..a41d4f953bfa 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -416,7 +416,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); -blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, +blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, struct bio *bio, u64 dio_file_offset); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3e42fab48e6b..5aacb88069a7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -645,7 +645,7 @@ static void run_one_async_start(struct btrfs_work *work) ret = btrfs_submit_bio_start(async->inode, async->bio); break; case WQ_SUBMIT_DATA_DIO: - ret = btrfs_submit_bio_start_direct_io(&async->inode->vfs_inode, + ret = btrfs_submit_bio_start_direct_io(async->inode, async->bio, async->dio_file_offset); break; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aa55cf6e763e..06ec84be4b51 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7965,11 +7965,11 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, return err; } -blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, +blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, struct bio *bio, u64 dio_file_offset) { - return btrfs_csum_one_bio(BTRFS_I(inode), bio, dio_file_offset, false); + return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); } static void btrfs_end_dio_bio(struct btrfs_bio *bbio) From 5fcdadc2704534ef3a7b99aef65d54183c723493 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 171/249] btrfs: pass btrfs_inode to btrfs_wq_submit_bio The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/disk-io.h | 2 +- fs/btrfs/inode.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5aacb88069a7..65c168d6ed77 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -698,17 +698,17 @@ static void run_one_async_free(struct btrfs_work *work) * - true if the work has been succesfuly submitted * - false in case of error */ -bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, +bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) return false; - async->inode = BTRFS_I(inode); + async->inode = inode; async->bio = bio; async->mirror_num = mirror_num; async->submit_cmd = cmd; @@ -784,7 +784,7 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ * happen in parallel across all CPUs. 
*/ if (should_async_write(fs_info, BTRFS_I(inode)) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) + btrfs_wq_submit_bio(BTRFS_I(inode), bio, mirror_num, 0, WQ_SUBMIT_METADATA)) return; ret = btree_csum_one_bio(bio); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index d5b25fa8037b..65cf976b74e8 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -120,7 +120,7 @@ enum btrfs_wq_submit_cmd { WQ_SUBMIT_DATA_DIO, }; -bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, +bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd); blk_status_t btree_submit_bio_start(struct bio *bio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06ec84be4b51..a7cbbfc22329 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2757,7 +2757,7 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && !btrfs_is_data_reloc_root(bi->root)) { if (!atomic_read(&bi->sync_writers) && - btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) + btrfs_wq_submit_bio(bi, bio, mirror_num, 0, WQ_SUBMIT_DATA)) return; ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); @@ -8014,7 +8014,7 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (btrfs_op(bio) == BTRFS_MAP_WRITE) { /* Check btrfs_submit_data_write_bio() for async submit rules */ if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && - btrfs_wq_submit_bio(inode, bio, 0, file_offset, + btrfs_wq_submit_bio(BTRFS_I(inode), bio, 0, file_offset, WQ_SUBMIT_DATA_DIO)) return; From 644094fd285499b484e50a8e986965b550a0924e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 172/249] btrfs: pass btrfs_inode to btrfs_submit_metadata_bio The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/disk-io.h | 2 +- fs/btrfs/extent_io.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 65c168d6ed77..72e3446d9840 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -766,9 +766,9 @@ static bool should_async_write(struct btrfs_fs_info *fs_info, return true; } -void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num) +void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_bio *bbio = btrfs_bio(bio); blk_status_t ret; @@ -783,8 +783,8 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_ * Kthread helpers are used to submit writes so that checksumming can * happen in parallel across all CPUs. 
*/ - if (should_async_write(fs_info, BTRFS_I(inode)) && - btrfs_wq_submit_bio(BTRFS_I(inode), bio, mirror_num, 0, WQ_SUBMIT_METADATA)) + if (should_async_write(fs_info, inode) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA)) return; ret = btree_csum_one_bio(bio); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 65cf976b74e8..f2c507fd0e04 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -86,7 +86,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); -void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index cc05ae772fa5..e770cbc5cb6a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -134,7 +134,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; if (!is_data_inode(inode)) - btrfs_submit_metadata_bio(inode, bio, mirror_num); + btrfs_submit_metadata_bio(BTRFS_I(inode), bio, mirror_num); else if (btrfs_op(bio) == BTRFS_MAP_WRITE) btrfs_submit_data_write_bio(inode, bio, mirror_num); else From 535a7e5d6b7e04da0336c632c834a575e82b5082 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 173/249] btrfs: pass btrfs_inode to btrfs_submit_data_write_bio The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/inode.c | 17 ++++++++--------- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a41d4f953bfa..01fc62d39ed2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -411,7 +411,7 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e770cbc5cb6a..13fba51be32d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -136,7 +136,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) if (!is_data_inode(inode)) btrfs_submit_metadata_bio(BTRFS_I(inode), bio, mirror_num); else if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_submit_data_write_bio(inode, bio, mirror_num); + btrfs_submit_data_write_bio(BTRFS_I(inode), bio, mirror_num); else btrfs_submit_data_read_bio(inode, bio, mirror_num, bio_ctrl->compress_type); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a7cbbfc22329..538f55088bdf 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2730,14 +2730,13 @@ out: return errno_to_blk_status(ret); } -void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num) +void 
btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *bi = BTRFS_I(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - ret = extract_ordered_extent(bi, bio, + ret = extract_ordered_extent(inode, bio, page_offset(bio_first_bvec_all(bio)->bv_page)); if (ret) { btrfs_bio_end_io(btrfs_bio(bio), ret); @@ -2753,14 +2752,14 @@ void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirro * Csum items for reloc roots have already been cloned at this point, * so they are handled as part of the no-checksum case. */ - if (!(bi->flags & BTRFS_INODE_NODATASUM) && + if (!(inode->flags & BTRFS_INODE_NODATASUM) && !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && - !btrfs_is_data_reloc_root(bi->root)) { - if (!atomic_read(&bi->sync_writers) && - btrfs_wq_submit_bio(bi, bio, mirror_num, 0, WQ_SUBMIT_DATA)) + !btrfs_is_data_reloc_root(inode->root)) { + if (!atomic_read(&inode->sync_writers) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_DATA)) return; - ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); + ret = btrfs_csum_one_bio(inode, bio, (u64)-1, false); if (ret) { btrfs_bio_end_io(btrfs_bio(bio), ret); return; From b762041629e73d8e24a10aae4bf256774fbbd082 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 174/249] btrfs: pass btrfs_inode to btrfs_submit_data_read_bio The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 5 +++-- fs/btrfs/inode.c | 8 ++++---- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 01fc62d39ed2..5406a2f817b2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -412,7 +412,7 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, #define CSUM_FMT_VALUE(size, bytes) size, bytes void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); -void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, +void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 13fba51be32d..ac69e5f1605c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -138,7 +138,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) else if (btrfs_op(bio) == BTRFS_MAP_WRITE) btrfs_submit_data_write_bio(BTRFS_I(inode), bio, mirror_num); else - btrfs_submit_data_read_bio(inode, bio, mirror_num, + btrfs_submit_data_read_bio(BTRFS_I(inode), bio, mirror_num, bio_ctrl->compress_type); /* The bio is owned by the end_io handler now */ @@ -861,7 +861,8 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, * error here. 
*/ if (submit_buffered) - btrfs_submit_data_read_bio(inode, repair_bio, failrec->this_mirror, 0); + btrfs_submit_data_read_bio(BTRFS_I(inode), repair_bio, + failrec->this_mirror, 0); else btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 538f55088bdf..e3bbd19edea3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2768,10 +2768,10 @@ void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int btrfs_submit_bio(fs_info, bio, mirror_num); } -void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, +void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; if (compress_type != BTRFS_COMPRESS_NONE) { @@ -2779,7 +2779,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, * btrfs_submit_compressed_read will handle completing the bio * if there were any errors, so just return here. */ - btrfs_submit_compressed_read(inode, bio, mirror_num); + btrfs_submit_compressed_read(&inode->vfs_inode, bio, mirror_num); return; } @@ -2790,7 +2790,7 @@ void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, * Lookup bio sums does extra checks around whether we need to csum or * not, which is why we ignore skip_sum here. */ - ret = btrfs_lookup_bio_sums(inode, bio, NULL); + ret = btrfs_lookup_bio_sums(&inode->vfs_inode, bio, NULL); if (ret) { btrfs_bio_end_io(btrfs_bio(bio), ret); return; From d781c1c315ce649002a9fd0387edc3ee93271743 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 175/249] btrfs: pass btrfs_inode to btrfs_submit_dio_repair_bio The function is for internal interfaces so we should use the btrfs_inode. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 3 ++- fs/btrfs/inode.c | 5 ++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 5406a2f817b2..9fe1a11a2eb3 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -414,7 +414,7 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags, void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num, enum btrfs_compression_type compress_type); -void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num); blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio); blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, struct bio *bio, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index ac69e5f1605c..bddda397e438 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -864,7 +864,8 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, btrfs_submit_data_read_bio(BTRFS_I(inode), repair_bio, failrec->this_mirror, 0); else - btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); + btrfs_submit_dio_repair_bio(BTRFS_I(inode), repair_bio, + failrec->this_mirror); return BLK_STS_OK; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e3bbd19edea3..f0b4f3a9245f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7920,15 +7920,14 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) bio_endio(&dip->bio); } -void btrfs_submit_dio_repair_bio(struct inode *inode, struct bio *bio, int mirror_num) +void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { struct btrfs_dio_private *dip = btrfs_bio(bio)->private; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); BUG_ON(bio_op(bio) == REQ_OP_WRITE); refcount_inc(&dip->refs); - btrfs_submit_bio(fs_info, bio, mirror_num); + btrfs_submit_bio(inode->root->fs_info, bio, mirror_num); } static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, From c5ca391b0dd82d4c0457159286bd19f2364a70be Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 176/249] btrfs: pass btrfs_inode to submit_one_bio The function is for internal interfaces so we should use the btrfs_inode. 
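[ Editor's note: a small aside on the conversion in submit_one_bio. All
  pages in one of these bios belong to the same file, so the inode can
  be recovered once from the first segment. A sketch of the idiom,
  assuming BTRFS_I() from fs/btrfs/btrfs_inode.h is in scope;
  bio_to_btrfs_inode() is a hypothetical name. ]

#include <linux/bio.h>
#include <linux/pagemap.h>

/* Recover the btrfs inode that owns the pages of a data bio. */
static struct btrfs_inode *bio_to_btrfs_inode(struct bio *bio)
{
	struct bio_vec *bv = bio_first_bvec_all(bio);

	return BTRFS_I(bv->bv_page->mapping->host);
}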
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bddda397e438..192f604ac7ea 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -117,7 +117,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { struct bio *bio; struct bio_vec *bv; - struct inode *inode; + struct btrfs_inode *inode; int mirror_num; if (!bio_ctrl->bio) @@ -125,7 +125,7 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) bio = bio_ctrl->bio; bv = bio_first_bvec_all(bio); - inode = bv->bv_page->mapping->host; + inode = BTRFS_I(bv->bv_page->mapping->host); mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ @@ -133,12 +133,12 @@ static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; - if (!is_data_inode(inode)) - btrfs_submit_metadata_bio(BTRFS_I(inode), bio, mirror_num); + if (!is_data_inode(&inode->vfs_inode)) + btrfs_submit_metadata_bio(inode, bio, mirror_num); else if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_submit_data_write_bio(BTRFS_I(inode), bio, mirror_num); + btrfs_submit_data_write_bio(inode, bio, mirror_num); else - btrfs_submit_data_read_bio(BTRFS_I(inode), bio, mirror_num, + btrfs_submit_data_read_bio(inode, bio, mirror_num, bio_ctrl->compress_type); /* The bio is owned by the end_io handler now */ From d8f9268ece91e6fe887b444780787828890a8051 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 177/249] btrfs: pass btrfs_inode to btrfs_repair_one_sector The function is for internal interfaces so we should use the btrfs_inode. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/compression.c | 2 +- fs/btrfs/extent_io.c | 17 ++++++++--------- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index cf3dc7e501ec..6f5ad0d6c409 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -194,7 +194,7 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) int ret; refcount_inc(&cb->pending_ios); - ret = btrfs_repair_one_sector(inode, bbio, offset, + ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, bv.bv_page, bv.bv_offset, true); if (ret) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 192f604ac7ea..05768f7f7872 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -795,13 +795,13 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return failrec; } -int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, +int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, u32 bio_offset, struct page *page, unsigned int pgoff, bool submit_buffered) { u64 start = failed_bbio->file_offset + bio_offset; struct io_failure_record *failrec; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *failed_bio = &failed_bbio->bio; const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; @@ -812,7 +812,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset); + failrec = btrfs_get_io_failure_record(&inode->vfs_inode, failed_bbio, bio_offset); if (IS_ERR(failrec)) return PTR_ERR(failrec); @@ -830,7 +830,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, btrfs_debug(fs_info, "failed to repair num_copies %d this_mirror %d failed_mirror %d", failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); - free_io_failure(BTRFS_I(inode), failrec); + free_io_failure(inode, failrec); return -EIO; } @@ -851,7 +851,7 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, bio_add_page(repair_bio, page, failrec->len, pgoff); repair_bbio->iter = repair_bio->bi_iter; - btrfs_debug(btrfs_sb(inode->i_sb), + btrfs_debug(fs_info, "repair read error: submitting new read to mirror %d", failrec->this_mirror); @@ -861,11 +861,10 @@ int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, * error here. 
*/ if (submit_buffered) - btrfs_submit_data_read_bio(BTRFS_I(inode), repair_bio, + btrfs_submit_data_read_bio(inode, repair_bio, failrec->this_mirror, 0); else - btrfs_submit_dio_repair_bio(BTRFS_I(inode), repair_bio, - failrec->this_mirror); + btrfs_submit_dio_repair_bio(inode, repair_bio, failrec->this_mirror); return BLK_STS_OK; } @@ -955,7 +954,7 @@ static void submit_data_read_repair(struct inode *inode, goto next; } - ret = btrfs_repair_one_sector(inode, failed_bbio, + ret = btrfs_repair_one_sector(BTRFS_I(inode), failed_bbio, bio_offset + offset, page, pgoff + offset, true); if (!ret) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index b3d4b568fe33..805e262811b4 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -268,7 +268,7 @@ struct io_failure_record { int num_copies; }; -int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, +int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio, u32 bio_offset, struct page *page, unsigned int pgoff, bool submit_buffered); void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f0b4f3a9245f..4a2fec637714 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7953,7 +7953,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, } else { int ret; - ret = btrfs_repair_one_sector(inode, bbio, offset, + ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset, bv.bv_page, bv.bv_offset, false); if (ret) err = errno_to_blk_status(ret); From e2884c3d445669c231fd8ca4c6fd160e729ec862 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 178/249] btrfs: switch btrfs_dio_private::inode to btrfs_inode The btrfs_dio_private structure is for internal interfaces so we should use the btrfs_inode. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4a2fec637714..02100bb9c2a2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -84,7 +84,7 @@ struct btrfs_dio_data { }; struct btrfs_dio_private { - struct inode *inode; + struct btrfs_inode *inode; /* * Since DIO can use anonymous page, we cannot use page_offset() to @@ -7907,11 +7907,11 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) return; if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { - btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL, + btrfs_mark_ordered_io_finished(dip->inode, NULL, dip->file_offset, dip->bytes, !dip->bio.bi_status); } else { - unlock_extent(&BTRFS_I(dip->inode)->io_tree, + unlock_extent(&dip->inode->io_tree, dip->file_offset, dip->file_offset + dip->bytes - 1, NULL); } @@ -7934,7 +7934,7 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, struct btrfs_bio *bbio, const bool uptodate) { - struct inode *inode = dip->inode; + struct inode *inode = &dip->inode->vfs_inode; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); blk_status_t err = BLK_STS_OK; @@ -7977,9 +7977,9 @@ static void btrfs_end_dio_bio(struct btrfs_bio *bbio) blk_status_t err = bio->bi_status; if (err) - btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, + btrfs_warn(dip->inode->root->fs_info, "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d", - btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio), + btrfs_ino(dip->inode), bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, err); @@ -7989,7 +7989,7 @@ static void btrfs_end_dio_bio(struct btrfs_bio *bbio) if (err) dip->bio.bi_status = err; - btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio); + btrfs_record_physical_zoned(&dip->inode->vfs_inode, bbio->file_offset, bio); bio_put(bio); btrfs_dio_private_put(dip); @@ -8056,7 +8056,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, struct btrfs_dio_data *dio_data = iter->private; struct extent_map *em = NULL; - dip->inode = inode; + dip->inode = BTRFS_I(inode); dip->file_offset = file_offset; dip->bytes = dio_bio->bi_iter.bi_size; refcount_set(&dip->refs, 1); From bb41632ea7d252855f7a12f269afefc18a84e07b Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 179/249] btrfs: pass btrfs_inode to btrfs_submit_dio_bio The function is for internal interfaces so we should use the btrfs_inode. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 02100bb9c2a2..b1369f645f1b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7995,10 +7995,10 @@ static void btrfs_end_dio_bio(struct btrfs_bio *bbio) btrfs_dio_private_put(dip); } -static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, +static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, u64 file_offset, int async_submit) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_dio_private *dip = btrfs_bio(bio)->private; blk_status_t ret; @@ -8006,13 +8006,13 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, if (btrfs_op(bio) == BTRFS_MAP_READ) btrfs_bio(bio)->iter = bio->bi_iter; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (inode->flags & BTRFS_INODE_NODATASUM) goto map; if (btrfs_op(bio) == BTRFS_MAP_WRITE) { /* Check btrfs_submit_data_write_bio() for async submit rules */ - if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && - btrfs_wq_submit_bio(BTRFS_I(inode), bio, 0, file_offset, + if (async_submit && !atomic_read(&inode->sync_writers) && + btrfs_wq_submit_bio(inode, bio, 0, file_offset, WQ_SUBMIT_DATA_DIO)) return; @@ -8020,7 +8020,7 @@ static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, * If we aren't doing async submit, calculate the csum of the * bio now. */ - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); + ret = btrfs_csum_one_bio(inode, bio, file_offset, false); if (ret) { btrfs_bio_end_io(btrfs_bio(bio), ret); return; @@ -8142,7 +8142,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, async_submit = 1; } - btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); + btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); dio_data->submitted += clone_len; clone_offset += clone_len; From d9dcae67b7fe1f8c7d974a7a002c81d9b02b7281 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 180/249] btrfs: pass btrfs_inode to btrfs_truncate The function is for internal interfaces so we should use the btrfs_inode. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b1369f645f1b..f612c8b9d4bc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -124,7 +124,7 @@ static const struct file_operations btrfs_dir_file_operations; static struct kmem_cache *btrfs_inode_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); -static int btrfs_truncate(struct inode *inode, bool skip_writeback); +static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback); static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, @@ -5270,7 +5270,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) inode_dio_wait(inode); - ret = btrfs_truncate(inode, newsize == oldsize); + ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize); if (ret && inode->i_nlink) { int err; @@ -8632,16 +8632,16 @@ out_noreserve: return ret; } -static int btrfs_truncate(struct inode *inode, bool skip_writeback) +static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) { struct btrfs_truncate_control control = { - .inode = BTRFS_I(inode), - .ino = btrfs_ino(BTRFS_I(inode)), + .inode = inode, + .ino = btrfs_ino(inode), .min_type = BTRFS_EXTENT_DATA_KEY, .clear_extent_range = true, }; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *rsv; int ret; struct btrfs_trans_handle *trans; @@ -8649,7 +8649,8 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) u64 min_size = btrfs_calc_metadata_size(fs_info, 1); if (!skip_writeback) { - ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask), + ret = btrfs_wait_ordered_range(&inode->vfs_inode, + inode->vfs_inode.i_size & (~mask), (u64)-1); if (ret) return ret; @@ -8708,34 +8709,32 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) while (1) { struct extent_state *cached_state = NULL; - const u64 new_size = inode->i_size; + const u64 new_size = inode->vfs_inode.i_size; const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); control.new_size = new_size; - lock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, - &cached_state); + lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last * block of the extent just the way it is. 
*/ - btrfs_drop_extent_map_range(BTRFS_I(inode), + btrfs_drop_extent_map_range(inode, ALIGN(new_size, fs_info->sectorsize), (u64)-1, false); ret = btrfs_truncate_inode_items(trans, root, &control); - inode_sub_bytes(inode, control.sub_bytes); - btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), control.last_size); + inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); + btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1, - &cached_state); + unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) break; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); if (ret) break; @@ -8766,7 +8765,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); + ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0); if (ret) goto out; trans = btrfs_start_transaction(root, 1); @@ -8774,14 +8773,14 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) ret = PTR_ERR(trans); goto out; } - btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0); + btrfs_inode_safe_disk_i_size_write(inode, 0); } if (trans) { int ret2; trans->block_rsv = &fs_info->trans_block_rsv; - ret2 = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret2 = btrfs_update_inode(trans, root, inode); if (ret2 && !ret) ret = ret2; @@ -8807,7 +8806,7 @@ out: * extents beyond i_size to drop. */ if (control.extents_found > 0) - btrfs_set_inode_full_sync(BTRFS_I(inode)); + btrfs_set_inode_full_sync(inode); return ret; } From 29b6352b1494bd7ec6a14bda087f8eb858d2fc1f Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 181/249] btrfs: pass btrfs_inode to btrfs_inode_lock The function is for internal interfaces so we should use the btrfs_inode. 
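Since btrfs_inode_lock() is the function being retyped here, a compilable userspace analogue of its flag dispatch may help readers without the tree at hand. A pthread rwlock stands in for the VFS inode lock and the flag values are illustrative, but the control flow mirrors the kernel function, including the -EAGAIN fast failure for the trylock variants:

    #include <errno.h>
    #include <pthread.h>

    #define ILOCK_SHARED  (1U << 0)   /* illustrative flag values */
    #define ILOCK_TRY     (1U << 1)

    static pthread_rwlock_t inode_lk = PTHREAD_RWLOCK_INITIALIZER;

    static int sketch_inode_lock(unsigned int flags)
    {
            if (flags & ILOCK_SHARED) {
                    if (flags & ILOCK_TRY)
                            return pthread_rwlock_tryrdlock(&inode_lk) ? -EAGAIN : 0;
                    pthread_rwlock_rdlock(&inode_lk);
                    return 0;
            }
            if (flags & ILOCK_TRY)
                    return pthread_rwlock_trywrlock(&inode_lk) ? -EAGAIN : 0;
            pthread_rwlock_wrlock(&inode_lk);
            return 0;
    }

The nowait I/O paths earlier in the series rely on exactly this shape: pass BTRFS_ILOCK_TRY and treat -EAGAIN as "fall back to the slow path".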
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/defrag.c | 4 ++-- fs/btrfs/delayed-inode.c | 2 +- fs/btrfs/file.c | 16 ++++++++-------- fs/btrfs/inode.c | 14 +++++++------- fs/btrfs/ioctl.c | 2 +- fs/btrfs/reflink.c | 2 +- fs/btrfs/relocation.c | 2 +- 8 files changed, 22 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 9fe1a11a2eb3..a06d1c0a0cc2 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -545,7 +545,7 @@ enum btrfs_ilock_type { ENUM_BIT(BTRFS_ILOCK_MMAP), }; -int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); +int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags); void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 919dfe0f7e50..6aade4838927 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1295,7 +1295,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; cluster_end = min(cluster_end, last_byte); - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; btrfs_inode_unlock(inode, 0); @@ -1351,7 +1351,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, ret = sectors_defragged; } if (do_compress) { - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; btrfs_inode_unlock(inode, 0); } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index c024f97de9e0..4edf44d8cd9e 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1647,7 +1647,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * item->readdir_list. 
*/ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f8be9d629e75..b3b7b276cce0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1193,7 +1193,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, if (nowait) ilock_flags |= BTRFS_ILOCK_TRY; - ret = btrfs_inode_lock(inode, ilock_flags); + ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); if (ret < 0) return ret; @@ -1468,7 +1468,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) ilock_flags |= BTRFS_ILOCK_SHARED; relock: - err = btrfs_inode_lock(inode, ilock_flags); + err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); if (err < 0) return err; @@ -1616,7 +1616,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, loff_t count; ssize_t ret; - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); count = encoded->len; ret = generic_write_checks_count(iocb, &count); if (ret == 0 && count != encoded->len) { @@ -1806,7 +1806,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); atomic_inc(&root->log_batch); @@ -2596,7 +2596,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) bool truncated_block = false; bool updated_inode = false; - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) @@ -3054,7 +3054,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & FALLOC_FL_PUNCH_HOLE) return btrfs_punch_hole(file, offset, len); - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { ret = inode_newsize_ok(inode, offset + len); @@ -3691,7 +3691,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) return generic_file_llseek(file, offset, whence); case SEEK_DATA: case SEEK_HOLE: - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); offset = find_desired_extent(BTRFS_I(inode), offset, whence); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); break; @@ -3748,7 +3748,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to) if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos)) return 0; - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); again: /* * This is similar to what we do for direct IO writes, see the comment diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f612c8b9d4bc..9b0f84c85bef 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -172,27 +172,27 @@ static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode, * return -EAGAIN * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock */ -int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) +int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_SHARED) { if (ilock_flags & BTRFS_ILOCK_TRY) { - if (!inode_trylock_shared(inode)) + if (!inode_trylock_shared(&inode->vfs_inode)) return -EAGAIN; else return 0; } - inode_lock_shared(inode); + 
inode_lock_shared(&inode->vfs_inode); } else { if (ilock_flags & BTRFS_ILOCK_TRY) { - if (!inode_trylock(inode)) + if (!inode_trylock(&inode->vfs_inode)) return -EAGAIN; else return 0; } - inode_lock(inode); + inode_lock(&inode->vfs_inode); } if (ilock_flags & BTRFS_ILOCK_MMAP) - down_write(&BTRFS_I(inode)->i_mmap_lock); + down_write(&inode->i_mmap_lock); return 0; } @@ -10529,7 +10529,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, file_accessed(iocb->ki_filp); - btrfs_inode_lock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); if (iocb->ki_pos >= inode->vfs_inode.i_size) { btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6f9f8e8dfea6..a912e1cb283d 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2528,7 +2528,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(BTRFS_I(inode), 0); err = btrfs_delete_subvolume(dir, dentry); btrfs_inode_unlock(inode, 0); if (!err) diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 9d728107536e..c62c7fdd55d9 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -893,7 +893,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, return -EINVAL; if (same_inode) { - btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP); + btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); } else { lock_two_nondirectories(src_inode, dst_inode); btrfs_double_mmap_lock(src_inode, dst_inode); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 2ecca24e1001..f34cb92f3151 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2874,7 +2874,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) return ret; - btrfs_inode_lock(&inode->vfs_inode, 0); + btrfs_inode_lock(inode, 0); for (nr = 0; nr < cluster->nr; nr++) { struct extent_state *cached_state = NULL; From e5d4d75bd3241d0a5990060ef9601bf17adf9bb5 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 182/249] btrfs: pass btrfs_inode to btrfs_inode_unlock The function is for internal interfaces so we should use the btrfs_inode. 
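The unlock counterpart converted in the next patch takes the same ilock_flags so it can release exactly what was taken: i_mmap_lock first (the reverse of acquisition order), then the shared or exclusive inode lock. Extending the userspace sketch above, with the caveat that pthread rwlocks have a single unlock call where the kernel distinguishes the two:

    #define ILOCK_MMAP    (1U << 2)   /* illustrative, as above */

    static pthread_rwlock_t mmap_lk = PTHREAD_RWLOCK_INITIALIZER;

    /* MMAP layering: taken last on the lock side ... */
    static void sketch_lock_mmap_part(unsigned int flags)
    {
            if (flags & ILOCK_MMAP)
                    pthread_rwlock_wrlock(&mmap_lk);    /* down_write(i_mmap_lock) */
    }

    /* ... and released first on unlock, before the inode lock itself.
     * Callers must pass the same flags they locked with. */
    static void sketch_inode_unlock(unsigned int flags)
    {
            if (flags & ILOCK_MMAP)
                    pthread_rwlock_unlock(&mmap_lk);    /* up_write(i_mmap_lock) */
            pthread_rwlock_unlock(&inode_lk);           /* shared or exclusive */
    }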
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/defrag.c | 8 ++++---- fs/btrfs/delayed-inode.c | 2 +- fs/btrfs/file.c | 32 ++++++++++++++++---------------- fs/btrfs/inode.c | 18 +++++++++--------- fs/btrfs/ioctl.c | 6 +++--- fs/btrfs/reflink.c | 2 +- fs/btrfs/relocation.c | 2 +- 8 files changed, 36 insertions(+), 36 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a06d1c0a0cc2..279bbbc59dc6 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -546,7 +546,7 @@ enum btrfs_ilock_type { }; int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags); -void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); +void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags); void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes); void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 6aade4838927..0a3c261b69c9 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -1298,11 +1298,11 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, btrfs_inode_lock(BTRFS_I(inode), 0); if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); break; } if (!(inode->i_sb->s_flags & SB_ACTIVE)) { - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); break; } if (do_compress) @@ -1315,7 +1315,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, if (sectors_defragged > prev_sectors_defragged) balance_dirty_pages_ratelimited(inode->i_mapping); - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); if (ret < 0) break; cur = max(cluster_end + 1, last_scanned); @@ -1353,7 +1353,7 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, if (do_compress) { btrfs_inode_lock(BTRFS_I(inode), 0); BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); } return ret; } diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 4edf44d8cd9e..0095c6e4c3d1 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1646,7 +1646,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * We can only do one readdir with delayed items at a time because of * item->readdir_list. */ - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); btrfs_inode_lock(BTRFS_I(inode), 0); mutex_lock(&delayed_node->mutex); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b3b7b276cce0..e6c93be91a06 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1428,7 +1428,7 @@ again: iocb->ki_pos += num_written; } out: - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); return num_written ? 
num_written : ret; } @@ -1474,13 +1474,13 @@ relock: err = generic_write_checks(iocb, from); if (err <= 0) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); return err; } err = btrfs_write_check(iocb, from, err); if (err < 0) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto out; } @@ -1491,13 +1491,13 @@ relock: */ if ((ilock_flags & BTRFS_ILOCK_SHARED) && pos + iov_iter_count(from) > i_size_read(inode)) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); ilock_flags &= ~BTRFS_ILOCK_SHARED; goto relock; } if (check_direct_IO(fs_info, from, pos)) { - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto buffered; } @@ -1528,7 +1528,7 @@ relock: * iocb, and that needs to lock the inode. So unlock it before calling * iomap_dio_complete() to avoid a deadlock. */ - btrfs_inode_unlock(inode, ilock_flags); + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); if (IS_ERR_OR_NULL(dio)) err = PTR_ERR_OR_ZERO(dio); @@ -1635,7 +1635,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = btrfs_do_encoded_write(iocb, from, encoded); out: - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); return ret; } @@ -1830,7 +1830,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); goto out; } @@ -1933,7 +1933,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. 
*/ - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); if (ret == BTRFS_NO_LOG_SYNC) { ret = btrfs_end_transaction(trans); @@ -2001,7 +2001,7 @@ out: out_release_extents: btrfs_release_log_ctx_extents(&ctx); - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); goto out; } @@ -2644,7 +2644,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); if (ret) { - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } } @@ -2743,7 +2743,7 @@ out_only_mutex: ret = ret2; } } - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } @@ -3104,7 +3104,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & FALLOC_FL_ZERO_RANGE) { ret = btrfs_zero_range(inode, offset, len, mode); - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; } @@ -3202,7 +3202,7 @@ out_unlock: unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); out: - btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); extent_changeset_free(data_reserved); return ret; } @@ -3693,7 +3693,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_HOLE: btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); offset = find_desired_extent(BTRFS_I(inode), offset, whence); - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); break; } @@ -3797,7 +3797,7 @@ again: goto again; } } - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED); return ret < 0 ? ret : read; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9b0f84c85bef..60cc23997216 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -202,14 +202,14 @@ int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags) * ilock_flags should contain the same bits set as passed to btrfs_inode_lock() * to decide whether the lock acquired is shared or exclusive. 
*/ -void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags) +void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags) { if (ilock_flags & BTRFS_ILOCK_MMAP) - up_write(&BTRFS_I(inode)->i_mmap_lock); + up_write(&inode->i_mmap_lock); if (ilock_flags & BTRFS_ILOCK_SHARED) - inode_unlock_shared(inode); + inode_unlock_shared(&inode->vfs_inode); else - inode_unlock(inode); + inode_unlock(&inode->vfs_inode); } /* @@ -10277,7 +10277,7 @@ static ssize_t btrfs_encoded_read_inline( read_extent_buffer(leaf, tmp, ptr, count); btrfs_release_path(path); unlock_extent(io_tree, start, lockend, cached_state); - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; ret = copy_to_iter(tmp, count, iter); @@ -10480,7 +10480,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, goto out; unlock_extent(io_tree, start, lockend, cached_state); - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; if (compressed) { @@ -10532,7 +10532,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); if (iocb->ki_pos >= inode->vfs_inode.i_size) { - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return 0; } start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize); @@ -10630,7 +10630,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, if (disk_bytenr == EXTENT_MAP_HOLE) { unlock_extent(io_tree, start, lockend, &cached_state); - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); if (ret != count) @@ -10653,7 +10653,7 @@ out_unlock_extent: unlock_extent(io_tree, start, lockend, &cached_state); out_unlock_inode: if (!unlocked) - btrfs_inode_unlock(&inode->vfs_inode, BTRFS_ILOCK_SHARED); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a912e1cb283d..1832d3011d12 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1002,7 +1002,7 @@ out_up_read: out_dput: dput(dentry); out_unlock: - btrfs_inode_unlock(dir, 0); + btrfs_inode_unlock(BTRFS_I(dir), 0); return error; } @@ -2530,14 +2530,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, btrfs_inode_lock(BTRFS_I(inode), 0); err = btrfs_delete_subvolume(dir, dentry); - btrfs_inode_unlock(inode, 0); + btrfs_inode_unlock(BTRFS_I(inode), 0); if (!err) d_delete_notify(dir, dentry); out_dput: dput(dentry); out_unlock_dir: - btrfs_inode_unlock(dir, 0); + btrfs_inode_unlock(BTRFS_I(dir), 0); free_subvol_name: kfree(subvol_name_ptr); free_parent: diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index c62c7fdd55d9..0474bbe39da7 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -911,7 +911,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, out_unlock: if (same_inode) { - btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); } else { btrfs_double_mmap_unlock(src_inode, dst_inode); unlock_two_nondirectories(src_inode, dst_inode); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f34cb92f3151..d75b18e84285 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2894,7 +2894,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) break; } - 
btrfs_inode_unlock(&inode->vfs_inode, 0); + btrfs_inode_unlock(inode, 0); if (cur_offset < prealloc_end) btrfs_free_reserved_data_space_noquota(inode->root->fs_info, From 7152b425da54b6aa6ddfb0cbc0603614cc2400bd Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 183/249] btrfs: pass btrfs_inode to btrfs_dirty_inode The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 60cc23997216..dc06511c359a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -286,7 +286,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } -static int btrfs_dirty_inode(struct inode *inode); +static int btrfs_dirty_inode(struct btrfs_inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct btrfs_new_inode_args *args) @@ -5313,7 +5313,7 @@ static int btrfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentr if (attr->ia_valid) { setattr_copy(mnt_userns, inode, attr); inode_inc_iversion(inode); - err = btrfs_dirty_inode(inode); + err = btrfs_dirty_inode(BTRFS_I(inode)); if (!err && attr->ia_valid & ATTR_MODE) err = posix_acl_chmod(mnt_userns, inode, inode->i_mode); @@ -6144,21 +6144,21 @@ err: * FIXME, needs more benchmarking...there are no reasons other than performance * to keep or drop this code. */ -static int btrfs_dirty_inode(struct inode *inode) +static int btrfs_dirty_inode(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; int ret; - if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags)) + if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags)) return 0; trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); if (ret && (ret == -ENOSPC || ret == -EDQUOT)) { /* whoops, lets try again with the full transaction */ btrfs_end_transaction(trans); @@ -6166,10 +6166,10 @@ static int btrfs_dirty_inode(struct inode *inode) if (IS_ERR(trans)) return PTR_ERR(trans); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); } btrfs_end_transaction(trans); - if (BTRFS_I(inode)->delayed_node) + if (inode->delayed_node) btrfs_balance_delayed_items(fs_info); return ret; @@ -6196,7 +6196,7 @@ static int btrfs_update_time(struct inode *inode, struct timespec64 *now, inode->i_mtime = *now; if (flags & S_ATIME) inode->i_atime = *now; - return dirty ? btrfs_dirty_inode(inode) : 0; + return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0; } /* From 82ca5a04f03fa1e74a5034ec61ac86d9c817a2b9 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 184/249] btrfs: pass btrfs_inode to btrfs_add_delalloc_inodes The function is for internal interfaces so we should use the btrfs_inode. 
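The add side converted here uses a pattern worth spelling out: the per-root delalloc list is guarded by root->delalloc_lock, and list_empty() on the inode's own list node is the membership test, so an inode can be queued at most once. A compilable userspace reduction (a mutex stands in for the spinlock, and the list helpers are simplified from include/linux/list.h):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct list_node { struct list_node *prev, *next; };

    /* An unlinked node points at itself, so emptiness doubles as a
     * cheap "am I on a list?" membership test. */
    static bool list_empty(const struct list_node *n) { return n->next == n; }

    static void list_add_tail(struct list_node *n, struct list_node *head)
    {
            n->prev = head->prev;
            n->next = head;
            head->prev->next = n;
            head->prev = n;
    }

    static pthread_mutex_t delalloc_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct list_node delalloc_inodes = { &delalloc_inodes, &delalloc_inodes };

    /* add-once, as in btrfs_add_delalloc_inodes() */
    static void add_delalloc_inode(struct list_node *inode_node)
    {
            pthread_mutex_lock(&delalloc_lock);
            if (list_empty(inode_node))
                    list_add_tail(inode_node, &delalloc_inodes);
            pthread_mutex_unlock(&delalloc_lock);
    }

    int main(void)
    {
            struct list_node n = { &n, &n };    /* starts unlinked */

            add_delalloc_inode(&n);
            add_delalloc_inode(&n);             /* second call is a no-op */
            printf("%d\n", delalloc_inodes.next == &n && n.next == &delalloc_inodes);
            return 0;
    }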
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dc06511c359a..d9d7e11d8d9a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2369,16 +2369,14 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, } static void btrfs_add_delalloc_inodes(struct btrfs_root *root, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; spin_lock(&root->delalloc_lock); - if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { - list_add_tail(&BTRFS_I(inode)->delalloc_inodes, - &root->delalloc_inodes); - set_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags); + if (list_empty(&inode->delalloc_inodes)) { + list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes); + set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags); root->nr_delalloc_inodes++; if (root->nr_delalloc_inodes == 1) { spin_lock(&fs_info->delalloc_root_lock); @@ -2391,7 +2389,6 @@ static void btrfs_add_delalloc_inodes(struct btrfs_root *root, spin_unlock(&root->delalloc_lock); } - void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode) { @@ -2458,7 +2455,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, BTRFS_I(inode)->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &BTRFS_I(inode)->runtime_flags)) - btrfs_add_delalloc_inodes(root, inode); + btrfs_add_delalloc_inodes(root, BTRFS_I(inode)); spin_unlock(&BTRFS_I(inode)->lock); } From 36eeaef5595dc09f1dc4c859d96ffb3cd69dee55 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 02:22:15 +0200 Subject: [PATCH 185/249] btrfs: switch btrfs_writepage_fixup::inode to btrfs_inode The btrfs_writepage_fixup structure is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d9d7e11d8d9a..06ea43460e90 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2890,7 +2890,7 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_work work; }; @@ -2909,7 +2909,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; - inode = BTRFS_I(fixup->inode); + inode = fixup->inode; page_start = page_offset(page); page_end = page_offset(page) + PAGE_SIZE - 1; @@ -3068,7 +3068,7 @@ int btrfs_writepage_cow_fixup(struct page *page) get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; - fixup->inode = inode; + fixup->inode = BTRFS_I(inode); btrfs_queue_work(fs_info->fixup_workers, &fixup->work); return -EAGAIN; From 621af94af3342c9a8c3df34b4231d7707accd00e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 186/249] btrfs: pass btrfs_inode to btrfs_check_data_csum The function is for internal interfaces so we should use the btrfs_inode. 
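For context on the function being retyped: btrfs_check_data_csum() checksums one sector, compares it with the expected bytes from the csum tree, and on mismatch reports the error and zeroes the range so corrupt data is never exposed to the reader. A reduced sketch of that contract (a toy XOR checksum stands in for the real crc32c/xxhash/sha256/blake2b):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define SECTORSIZE 4096u

    /* toy stand-in for the configured checksum algorithm */
    static uint8_t toy_csum(const uint8_t *data, size_t len)
    {
            uint8_t c = 0;

            while (len--)
                    c ^= *data++;
            return c;
    }

    /* 0 on match; on mismatch report and zero the sector, mirroring
     * the "fill the corrupted range with zero" behaviour above */
    static int check_sector_csum(uint8_t *sector, uint8_t expected)
    {
            if (toy_csum(sector, SECTORSIZE) == expected)
                    return 0;
            fprintf(stderr, "csum mismatch, zeroing sector\n");
            memset(sector, 0, SECTORSIZE);
            return -1;
    }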
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 4 +--- fs/btrfs/compression.c | 2 +- fs/btrfs/inode.c | 15 +++++++-------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 279bbbc59dc6..ba0dbdc91ec5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -421,13 +421,11 @@ blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, u64 dio_file_offset); int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, u32 pgoff, u8 *csum, const u8 * const csum_expected); -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, +int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u32 pgoff); unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end); -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 6f5ad0d6c409..42e6dde2ad59 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -186,7 +186,7 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio) u64 start = bbio->file_offset + offset; if (!status && - (!csum || !btrfs_check_data_csum(inode, bbio, offset, + (!csum || !btrfs_check_data_csum(bi, bbio, offset, bv.bv_page, bv.bv_offset))) { btrfs_clean_io_failure(bi, start, bv.bv_page, bv.bv_offset); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 06ea43460e90..76e2bea2270a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3490,10 +3490,10 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of * When csum mismatch is detected, we will also report the error and fill the * corrupted range with zero. 
(Thus it needs the extra parameters) */ -int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, +int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u32 pgoff) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u32 len = fs_info->sectorsize; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; @@ -3507,8 +3507,7 @@ int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, return 0; zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), - bbio->file_offset + bio_offset, + btrfs_print_data_csum_error(inode, bbio->file_offset + bio_offset, csum, csum_expected, bbio->mirror_num); if (bbio->device) btrfs_dev_stat_inc_and_print(bbio->device, @@ -3573,7 +3572,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, EXTENT_NODATASUM); continue; } - ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); + ret = btrfs_check_data_csum(BTRFS_I(inode), bbio, bio_offset, page, pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> root->fs_info->sectorsize_bits; @@ -7943,8 +7942,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, u64 start = bbio->file_offset + offset; if (uptodate && - (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page, - bv.bv_offset))) { + (!csum || !btrfs_check_data_csum(BTRFS_I(inode), bbio, offset, + bv.bv_page, bv.bv_offset))) { btrfs_clean_io_failure(BTRFS_I(inode), start, bv.bv_page, bv.bv_offset); } else { @@ -10334,7 +10333,7 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) pgoff = bvec->bv_offset; for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); - if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset, + if (btrfs_check_data_csum(inode, bbio, bio_offset, bvec->bv_page, pgoff)) return BLK_STS_IOERR; bio_offset += sectorsize; From e569b1d545511492d5fc1a062ef8b63fd1b78d84 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 187/249] btrfs: pass btrfs_inode to __unlink_start_trans The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 76e2bea2270a..ffb42560acd5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4415,9 +4415,9 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, * plenty of slack room in the global reserve to migrate, otherwise we cannot * allow the unlink to occur. 
*/ -static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) +static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; /* * 1 for the possible orphan item @@ -4443,7 +4443,7 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) /* This needs to handle no-key deletions later on */ - trans = __unlink_start_trans(dir); + trans = __unlink_start_trans(BTRFS_I(dir)); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto fscrypt_free; @@ -4857,7 +4857,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) /* This needs to handle no-key deletions later on */ - trans = __unlink_start_trans(dir); + trans = __unlink_start_trans(BTRFS_I(dir)); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out_notrans; From 3c4f91e23a87a486426db5e481ed315b1b2640f1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 188/249] btrfs: pass btrfs_inode to btrfs_delete_subvolume The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/inode.c | 10 +++++----- fs/btrfs/ioctl.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ba0dbdc91ec5..481c75c47fc4 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -439,7 +439,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans, int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const struct fscrypt_str *name, int add_backref, u64 index); -int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry); +int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry); int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, int front); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ffb42560acd5..5e2fd24b6951 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4698,10 +4698,10 @@ again: spin_unlock(&root->inode_lock); } -int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) +int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb); - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; struct inode *inode = d_inode(dentry); struct btrfs_root *dest = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; @@ -4758,9 +4758,9 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; - btrfs_record_snapshot_destroy(trans, BTRFS_I(dir)); + btrfs_record_snapshot_destroy(trans, dir); - ret = btrfs_unlink_subvol(trans, dir, dentry); + ret = btrfs_unlink_subvol(trans, &dir->vfs_inode, dentry); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4848,7 +4848,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) "extent tree v2 doesn't support snapshot deletion yet"); return -EOPNOTSUPP; } - return btrfs_delete_subvolume(dir, dentry); + return btrfs_delete_subvolume(BTRFS_I(dir), dentry); } err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 1832d3011d12..7a9e697d9931 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2529,7 +2529,7 @@ static noinline int 
btrfs_ioctl_snap_destroy(struct file *file, } btrfs_inode_lock(BTRFS_I(inode), 0); - err = btrfs_delete_subvolume(dir, dentry); + err = btrfs_delete_subvolume(BTRFS_I(dir), dentry); btrfs_inode_unlock(BTRFS_I(inode), 0); if (!err) d_delete_notify(dir, dentry); From 35da5a7edec32935135737608c9d9da24a419bab Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 02:47:06 +0200 Subject: [PATCH 189/249] btrfs: drop private_data parameter from extent_io_tree_init All callers except one pass NULL, so the parameter can be dropped and the inode::io_tree initialization can be open coded. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/extent-io-tree.c | 5 ++--- fs/btrfs/extent-io-tree.h | 3 +-- fs/btrfs/inode.c | 5 +++-- fs/btrfs/relocation.c | 3 +-- fs/btrfs/tests/btrfs-tests.c | 2 +- fs/btrfs/tests/extent-io-tests.c | 4 ++-- fs/btrfs/transaction.c | 4 ++-- fs/btrfs/volumes.c | 3 +-- 9 files changed, 17 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 72e3446d9840..e602e0b4c25d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1044,9 +1044,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->anon_dev = 0; if (!dummy) { extent_io_tree_init(fs_info, &root->dirty_log_pages, - IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL); + IO_TREE_ROOT_DIRTY_LOG_PAGES); extent_io_tree_init(fs_info, &root->log_csum_range, - IO_TREE_LOG_CSUM_RANGE, NULL); + IO_TREE_LOG_CSUM_RANGE); } spin_lock_init(&root->root_item_lock); @@ -2250,7 +2250,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_BTREE_INODE_IO, NULL); + IO_TREE_BTREE_INODE_IO); extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); @@ -3074,7 +3074,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->block_group_cache_tree = RB_ROOT_CACHED; extent_io_tree_init(fs_info, &fs_info->excluded_extents, - IO_TREE_FS_EXCLUDED_EXTENTS, NULL); + IO_TREE_FS_EXCLUDED_EXTENTS); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 2cdff74ff033..81365870576f 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -94,13 +94,12 @@ struct tree_entry { }; void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner, - void *private_data) + struct extent_io_tree *tree, unsigned int owner) { tree->fs_info = fs_info; tree->state = RB_ROOT; spin_lock_init(&tree->lock); - tree->private_data = private_data; + tree->private_data = NULL; tree->owner = owner; if (owner == IO_TREE_INODE_FILE_EXTENT) lockdep_set_class(&tree->lock, &file_extent_tree_class); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index d73ef24bad2e..0e16642c28a3 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -104,8 +104,7 @@ struct extent_state { }; void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner, - void *private_data); + struct extent_io_tree *tree, unsigned int owner); void extent_io_tree_release(struct extent_io_tree *tree); int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5e2fd24b6951..29c6c0ec0016 100644 --- a/fs/btrfs/inode.c +++ 
b/fs/btrfs/inode.c @@ -8871,9 +8871,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); - extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO, inode); + extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); + ei->io_tree.private_data = inode; extent_io_tree_init(fs_info, &ei->file_extent_tree, - IO_TREE_INODE_FILE_EXTENT, NULL); + IO_TREE_INODE_FILE_EXTENT); ei->io_failure_tree = RB_ROOT; atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d75b18e84285..1440cb332a5a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3928,8 +3928,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&rc->dirty_subvol_roots); btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1); mapping_tree_init(&rc->reloc_root_tree); - extent_io_tree_init(fs_info, &rc->processed_blocks, - IO_TREE_RELOC_BLOCKS, NULL); + extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); return rc; } diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 669fa7133a2f..181469fc0bb3 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -102,7 +102,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) if (!dev) return ERR_PTR(-ENOMEM); - extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL); + extent_io_tree_init(NULL, &dev->alloc_state, 0); INIT_LIST_HEAD(&dev->dev_list); list_add(&dev->dev_list, &fs_info->fs_devices->devices); diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 350da449db08..dfc5c7fa6038 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -132,7 +132,7 @@ static int test_find_delalloc(u32 sectorsize) * Passing NULL as we don't have fs_info but tracepoints are not used * at this point */ - extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL); + extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST); /* * First go through and create and mark all of our pages dirty, we pin @@ -489,7 +489,7 @@ static int test_find_first_clear_extent_bit(void) test_msg("running find_first_clear_extent_bit test"); - extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); + extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST); /* Test correct handling of empty tree */ find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2e2dd2ea109b..b8c52e89688c 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -378,9 +378,9 @@ loop: spin_lock_init(&cur_trans->releasing_ebs_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, - IO_TREE_TRANS_DIRTY_PAGES, NULL); + IO_TREE_TRANS_DIRTY_PAGES); extent_io_tree_init(fs_info, &cur_trans->pinned_extents, - IO_TREE_FS_PINNED_EXTENTS, NULL); + IO_TREE_FS_PINNED_EXTENTS); fs_info->generation++; cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7751ab620761..acb97494bb52 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7048,8 +7048,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); - extent_io_tree_init(fs_info, &dev->alloc_state, - IO_TREE_DEVICE_ALLOC_STATE, NULL); + 
extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); if (devid) tmp = *devid; From 0988fc7bda79feff046056f032fd149ab08e03d1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 02:55:51 +0200 Subject: [PATCH 190/249] btrfs: switch extent_io_tree::private_data to btrfs_inode and rename The extent_io_tree::private_data was meant to be a preparatory work for the metadata inode rework but that never materialized. Now it's used only for an inode so it's better to change the appropriate type and rename it. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 32 ++++++++++++++++---------------- fs/btrfs/extent-io-tree.h | 3 ++- fs/btrfs/inode.c | 2 +- include/trace/events/btrfs.h | 27 ++++++++++++--------------- 4 files changed, 31 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 81365870576f..bbcc65593d1d 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -58,17 +58,17 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - struct inode *inode = tree->private_data; + struct btrfs_inode *inode = tree->inode; u64 isize; if (!inode) return; - isize = i_size_read(inode); + isize = i_size_read(&inode->vfs_inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { - btrfs_debug_rl(BTRFS_I(inode)->root->fs_info, + btrfs_debug_rl(inode->root->fs_info, "%s: ino %llu isize %llu odd range [%llu,%llu]", - caller, btrfs_ino(BTRFS_I(inode)), isize, start, end); + caller, btrfs_ino(inode), isize, start, end); } } #else @@ -99,7 +99,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, tree->fs_info = fs_info; tree->state = RB_ROOT; spin_lock_init(&tree->lock); - tree->private_data = NULL; + tree->inode = NULL; tree->owner = owner; if (owner == IO_TREE_INODE_FILE_EXTENT) lockdep_set_class(&tree->lock, &file_extent_tree_class); @@ -346,9 +346,9 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) other = prev_state(state); if (other && other->end == state->start - 1 && other->state == state->state) { - if (tree->private_data) - btrfs_merge_delalloc_extent(tree->private_data, - state, other); + if (tree->inode) + btrfs_merge_delalloc_extent(&tree->inode->vfs_inode, state, + other); state->start = other->start; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -357,8 +357,8 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) other = next_state(state); if (other && other->start == state->end + 1 && other->state == state->state) { - if (tree->private_data) - btrfs_merge_delalloc_extent(tree->private_data, state, + if (tree->inode) + btrfs_merge_delalloc_extent(&tree->inode->vfs_inode, state, other); state->end = other->end; rb_erase(&other->rb_node, &tree->state); @@ -374,8 +374,8 @@ static void set_state_bits(struct extent_io_tree *tree, u32 bits_to_set = bits & ~EXTENT_CTLBITS; int ret; - if (tree->private_data) - btrfs_set_delalloc_extent(tree->private_data, state, bits); + if (tree->inode) + btrfs_set_delalloc_extent(&tree->inode->vfs_inode, state, bits); ret = add_extent_changeset(state, bits_to_set, changeset, 1); BUG_ON(ret < 0); @@ -462,8 +462,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct rb_node *parent = NULL; struct rb_node **node; - if (tree->private_data) - btrfs_split_delalloc_extent(tree->private_data, orig, split); + if (tree->inode) 
+ btrfs_split_delalloc_extent(&tree->inode->vfs_inode, orig, split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -510,8 +510,8 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, u32 bits_to_clear = bits & ~EXTENT_CTLBITS; int ret; - if (tree->private_data) - btrfs_clear_delalloc_extent(tree->private_data, state, bits); + if (tree->inode) + btrfs_clear_delalloc_extent(&tree->inode->vfs_inode, state, bits); ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 0e16642c28a3..cdee8c0854c8 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -80,7 +80,8 @@ enum { struct extent_io_tree { struct rb_root state; struct btrfs_fs_info *fs_info; - void *private_data; + /* Inode associated with this tree, or NULL. */ + struct btrfs_inode *inode; /* Who owns this io tree, should be one of IO_TREE_* */ u8 owner; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 29c6c0ec0016..23ec33ac86e0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8872,7 +8872,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); - ei->io_tree.private_data = inode; + ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); ei->io_failure_tree = RB_ROOT; diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index ed50e81174bf..0bce0b4ff2fa 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1993,12 +1993,11 @@ TRACE_EVENT(btrfs_set_extent_bit, TP_fast_assign_btrfs(tree->fs_info, __entry->owner = tree->owner; - if (tree->private_data) { - const struct inode *inode = tree->private_data; + if (tree->inode) { + const struct btrfs_inode *inode = tree->inode; - __entry->ino = btrfs_ino(BTRFS_I(inode)); - __entry->rootid = - BTRFS_I(inode)->root->root_key.objectid; + __entry->ino = btrfs_ino(inode); + __entry->rootid = inode->root->root_key.objectid; } else { __entry->ino = 0; __entry->rootid = 0; @@ -2032,12 +2031,11 @@ TRACE_EVENT(btrfs_clear_extent_bit, TP_fast_assign_btrfs(tree->fs_info, __entry->owner = tree->owner; - if (tree->private_data) { - const struct inode *inode = tree->private_data; + if (tree->inode) { + const struct btrfs_inode *inode = tree->inode; - __entry->ino = btrfs_ino(BTRFS_I(inode)); - __entry->rootid = - BTRFS_I(inode)->root->root_key.objectid; + __entry->ino = btrfs_ino(inode); + __entry->rootid = inode->root->root_key.objectid; } else { __entry->ino = 0; __entry->rootid = 0; @@ -2072,12 +2070,11 @@ TRACE_EVENT(btrfs_convert_extent_bit, TP_fast_assign_btrfs(tree->fs_info, __entry->owner = tree->owner; - if (tree->private_data) { - const struct inode *inode = tree->private_data; + if (tree->inode) { + const struct btrfs_inode *inode = tree->inode; - __entry->ino = btrfs_ino(BTRFS_I(inode)); - __entry->rootid = - BTRFS_I(inode)->root->root_key.objectid; + __entry->ino = btrfs_ino(inode); + __entry->rootid = inode->root->root_key.objectid; } else { __entry->ino = 0; __entry->rootid = 0; From 2454151cdede9f6f3a2213ae84b07d9a9edb485e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 191/249] btrfs: pass btrfs_inode to btrfs_merge_delalloc_extent The function is for internal interfaces so we should use the btrfs_inode. 
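Before the type change, it is worth restating what the merge hook actually does: it is pure reservation accounting. Outstanding extents are counted by dividing the delalloc length by the maximum extent size, rounding up, so merging two adjacent ranges can only reduce the count. A sketch of the arithmetic (128M is the usual BTRFS_MAX_EXTENT_SIZE; fs_info->max_extent_size may be smaller, e.g. on zoned devices):

    #include <stdint.h>

    #define MAX_EXTENT_SIZE (128ULL << 20)  /* illustrative default */

    /* round-up division, as count_max_extents() does */
    static uint64_t count_max_extents(uint64_t len)
    {
            return (len + MAX_EXTENT_SIZE - 1) / MAX_EXTENT_SIZE;
    }

    /* reservations freed when two adjacent delalloc ranges merge;
     * always 0 or 1, hence the single -1 adjustment in the hook */
    static uint64_t merge_releases(uint64_t len_a, uint64_t len_b)
    {
            return count_max_extents(len_a) + count_max_extents(len_b) -
                   count_max_extents(len_a + len_b);
    }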
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent-io-tree.c | 6 ++---- fs/btrfs/inode.c | 16 ++++++++-------- 3 files changed, 11 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 481c75c47fc4..130c95c6f7df 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -475,7 +475,7 @@ struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, u32 bits); void btrfs_clear_delalloc_extent(struct inode *inode, struct extent_state *state, u32 bits); -void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, +void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other); void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index bbcc65593d1d..942212e1dbaf 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -347,8 +347,7 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) if (other && other->end == state->start - 1 && other->state == state->state) { if (tree->inode) - btrfs_merge_delalloc_extent(&tree->inode->vfs_inode, state, - other); + btrfs_merge_delalloc_extent(tree->inode, state, other); state->start = other->start; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); @@ -358,8 +357,7 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state) if (other && other->start == state->end + 1 && other->state == state->state) { if (tree->inode) - btrfs_merge_delalloc_extent(&tree->inode->vfs_inode, state, - other); + btrfs_merge_delalloc_extent(tree->inode, state, other); state->end = other->end; rb_erase(&other->rb_node, &tree->state); RB_CLEAR_NODE(&other->rb_node); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 23ec33ac86e0..a2de63d38b1a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2314,10 +2314,10 @@ void btrfs_split_delalloc_extent(struct inode *inode, * that are just merged onto old extents, such as when we are doing sequential * writes, so we can properly account for the metadata space we'll need. 
*/ -void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, +void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 new_size, old_size; u32 num_extents; @@ -2332,9 +2332,9 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, /* we're not bigger than the max, unreserve the space and go */ if (new_size <= fs_info->max_extent_size) { - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -1); + spin_unlock(&inode->lock); return; } @@ -2363,9 +2363,9 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, if (count_max_extents(fs_info, new_size) >= num_extents) return; - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, -1); + spin_unlock(&inode->lock); } static void btrfs_add_delalloc_inodes(struct btrfs_root *root, From 4c5d166f6b3674473905ff133daea4de33f00b91 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 192/249] btrfs: pass btrfs_inode to btrfs_set_delalloc_extent The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent-io-tree.c | 2 +- fs/btrfs/inode.c | 33 ++++++++++++++++----------------- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 130c95c6f7df..8a8280233199 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -471,7 +471,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, struct inode *dir); - void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, + void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits); void btrfs_clear_delalloc_extent(struct inode *inode, struct extent_state *state, u32 bits); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 942212e1dbaf..1b4c52d7201d 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -373,7 +373,7 @@ static void set_state_bits(struct extent_io_tree *tree, int ret; if (tree->inode) - btrfs_set_delalloc_extent(&tree->inode->vfs_inode, state, bits); + btrfs_set_delalloc_extent(tree->inode, state, bits); ret = add_extent_changeset(state, bits_to_set, changeset, 1); BUG_ON(ret < 0); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a2de63d38b1a..fae41b954373 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2421,10 +2421,10 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * Properly track delayed allocation bytes in the inode and to maintain the * list of inodes that have pending delalloc work to be done. 
*/ -void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, +void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); @@ -2434,14 +2434,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, * bit, which is only set or cleared with irqs on */ if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); - bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); + bool do_list = !btrfs_is_free_space_inode(inode); - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, num_extents); + spin_unlock(&inode->lock); /* For sanity tests */ if (btrfs_is_testing(fs_info)) @@ -2449,22 +2449,21 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, percpu_counter_add_batch(&fs_info->delalloc_bytes, len, fs_info->delalloc_batch); - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->delalloc_bytes += len; + spin_lock(&inode->lock); + inode->delalloc_bytes += len; if (bits & EXTENT_DEFRAG) - BTRFS_I(inode)->defrag_bytes += len; + inode->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, - &BTRFS_I(inode)->runtime_flags)) - btrfs_add_delalloc_inodes(root, BTRFS_I(inode)); - spin_unlock(&BTRFS_I(inode)->lock); + &inode->runtime_flags)) + btrfs_add_delalloc_inodes(root, inode); + spin_unlock(&inode->lock); } if (!(state->state & EXTENT_DELALLOC_NEW) && (bits & EXTENT_DELALLOC_NEW)) { - spin_lock(&BTRFS_I(inode)->lock); - BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - - state->start; - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + inode->new_delalloc_bytes += state->end + 1 - state->start; + spin_unlock(&inode->lock); } } From 62798a491561e1e1bbbd08873561532f19525fbf Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 193/249] btrfs: pass btrfs_inode to btrfs_split_delalloc_extent The function is for internal interfaces so we should use the btrfs_inode. 
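For reference, the pattern behind this whole run of conversions, as a rough sketch (simplified from fs/btrfs/btrfs_inode.h, member list trimmed, not the literal definition):

  /* The btrfs inode embeds the VFS inode, so the two convert for free: */
  struct btrfs_inode {
          struct btrfs_root *root;
          /* ... btrfs-specific members trimmed ... */
          struct inode vfs_inode;
  };

  static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
  {
          return container_of(inode, struct btrfs_inode, vfs_inode);
  }

Passing struct btrfs_inode directly lets the internal helpers drop the repeated BTRFS_I() conversions and reach the fs_info via inode->root->fs_info instead of btrfs_sb(inode->i_sb), as the hunks below do.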
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent-io-tree.c | 2 +- fs/btrfs/inode.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 8a8280233199..ddf1867ba6d5 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -477,7 +477,7 @@ void btrfs_clear_delalloc_extent(struct inode *inode, struct extent_state *state, u32 bits); void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other); -void btrfs_split_delalloc_extent(struct inode *inode, +void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split); void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 1b4c52d7201d..124aede5e492 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -461,7 +461,7 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct rb_node **node; if (tree->inode) - btrfs_split_delalloc_extent(&tree->inode->vfs_inode, orig, split); + btrfs_split_delalloc_extent(tree->inode, orig, split); prealloc->start = orig->start; prealloc->end = split - 1; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index fae41b954373..a43d7a063807 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2277,10 +2277,10 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page return ret; } -void btrfs_split_delalloc_extent(struct inode *inode, +void btrfs_split_delalloc_extent(struct btrfs_inode *inode, struct extent_state *orig, u64 split) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 size; /* not delalloc, ignore it */ @@ -2304,9 +2304,9 @@ void btrfs_split_delalloc_extent(struct inode *inode, return; } - spin_lock(&BTRFS_I(inode)->lock); - btrfs_mod_outstanding_extents(BTRFS_I(inode), 1); - spin_unlock(&BTRFS_I(inode)->lock); + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, 1); + spin_unlock(&inode->lock); } /* From bd54766e40df1300564babbbc31866d36c0c4bb6 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 194/249] btrfs: pass btrfs_inode to btrfs_clear_delalloc_extent The function is for internal interfaces so we should use the btrfs_inode. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent-io-tree.c | 2 +- fs/btrfs/inode.c | 5 ++--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index ddf1867ba6d5..9e31dc8b0285 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -473,7 +473,7 @@ struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, struct inode *dir); void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits); -void btrfs_clear_delalloc_extent(struct inode *inode, +void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits); void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new, struct extent_state *other); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 124aede5e492..285b0ff6e953 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -509,7 +509,7 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, int ret; if (tree->inode) - btrfs_clear_delalloc_extent(&tree->inode->vfs_inode, state, bits); + btrfs_clear_delalloc_extent(tree->inode, state, bits); ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a43d7a063807..59ec9b2ef508 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2471,11 +2471,10 @@ void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *s * Once a range is no longer delalloc this function ensures that proper * accounting happens. */ -void btrfs_clear_delalloc_extent(struct inode *vfs_inode, +void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state, u32 bits) { - struct btrfs_inode *inode = BTRFS_I(vfs_inode); - struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 len = state->end + 1 - state->start; u32 num_extents = count_max_extents(fs_info, len); From 5b7544cb06fef21e4f8c189082a7bf02fd9832a1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 195/249] btrfs: pass btrfs_inode to btrfs_unlink_subvol The function is for internal interfaces so we should use the btrfs_inode. 
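When a converted function still needs VFS-level state it reaches it through the embedded vfs_inode, as the btrfs_unlink_subvol() hunks below do for i_size and the timestamps. A small sketch of both directions, using only calls that appear in the hunks (dir is a struct btrfs_inode *):

  /* btrfs-internal state hangs directly off the btrfs_inode: */
  u64 dir_ino = btrfs_ino(dir);
  struct btrfs_root *root = dir->root;

  /* VFS fields are reached through the embedded inode: */
  inode_inc_iversion(&dir->vfs_inode);
  dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode);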
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 59ec9b2ef508..3577ee142888 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4470,9 +4470,9 @@ fscrypt_free: } static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, - struct inode *dir, struct dentry *dentry) + struct btrfs_inode *dir, struct dentry *dentry) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); struct btrfs_path *path; struct extent_buffer *leaf; @@ -4481,10 +4481,10 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, u64 index; int ret; u64 objectid; - u64 dir_ino = btrfs_ino(BTRFS_I(dir)); + u64 dir_ino = btrfs_ino(dir); struct fscrypt_name fname; - ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret) return ret; @@ -4557,17 +4557,17 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, } } - ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index); + ret = btrfs_delete_delayed_dir_index(trans, dir, index); if (ret) { btrfs_abort_transaction(trans, ret); goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size - fname.disk_name.len * 2); - inode_inc_iversion(dir); - dir->i_mtime = current_time(dir); - dir->i_ctime = dir->i_mtime; - ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); + btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2); + inode_inc_iversion(&dir->vfs_inode); + dir->vfs_inode.i_mtime = current_time(&dir->vfs_inode); + dir->vfs_inode.i_ctime = dir->vfs_inode.i_mtime; + ret = btrfs_update_inode_fallback(trans, root, dir); if (ret) btrfs_abort_transaction(trans, ret); out: @@ -4758,7 +4758,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry) btrfs_record_snapshot_destroy(trans, dir); - ret = btrfs_unlink_subvol(trans, &dir->vfs_inode, dentry); + ret = btrfs_unlink_subvol(trans, dir, dentry); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -4862,7 +4862,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) } if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - err = btrfs_unlink_subvol(trans, dir, dentry); + err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry); goto out; } @@ -9208,7 +9208,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* src is a subvolume */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry); } else { /* src is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(old_dentry->d_inode), @@ -9223,7 +9223,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, /* dest is a subvolume */ if (new_ino == BTRFS_FIRST_FREE_OBJECTID) { - ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); } else { /* dest is an inode */ ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir), BTRFS_I(new_dentry->d_inode), @@ -9470,7 +9470,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, BTRFS_I(old_inode), 1); if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) { - ret = btrfs_unlink_subvol(trans, old_dir, old_dentry); + ret = btrfs_unlink_subvol(trans, 
BTRFS_I(old_dir), old_dentry); } else { ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir), BTRFS_I(d_inode(old_dentry)), @@ -9488,7 +9488,7 @@ static int btrfs_rename(struct user_namespace *mnt_userns, new_inode->i_ctime = current_time(new_inode); if (unlikely(btrfs_ino(BTRFS_I(new_inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { - ret = btrfs_unlink_subvol(trans, new_dir, new_dentry); + ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry); BUG_ON(new_inode->i_nlink == 0); } else { ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir), From d1de429bcedac78d047c545d748c7b45fe16d56d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 196/249] btrfs: pass btrfs_inode to btrfs_inode_by_name The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3577ee142888..75a0e2350cdb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5565,12 +5565,12 @@ no_delete: * If no dir entries were found, returns -ENOENT. * If found a corrupted location in dir entry, returns -EUCLEAN. */ -static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, +static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { struct btrfs_dir_item *di; struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; int ret = 0; struct fscrypt_name fname; @@ -5578,13 +5578,13 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, if (!path) return -ENOMEM; - ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname); + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret) goto out; /* This needs to handle no-key deletions later on */ - di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)), + di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), &fname.disk_name, 0); if (IS_ERR_OR_NULL(di)) { ret = di ? PTR_ERR(di) : -ENOENT; @@ -5597,7 +5597,7 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, ret = -EUCLEAN; btrfs_warn(root->fs_info, "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))", - __func__, fname.disk_name.name, btrfs_ino(BTRFS_I(dir)), + __func__, fname.disk_name.name, btrfs_ino(dir), location->objectid, location->type, location->offset); } if (!ret) @@ -5881,7 +5881,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (dentry->d_name.len > BTRFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - ret = btrfs_inode_by_name(dir, dentry, &location, &di_type); + ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type); if (ret < 0) return ERR_PTR(ret); From 3c1b1c4c0e874dc9f2cf2faa231d39c4ca0d5888 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 197/249] btrfs: pass btrfs_inode to fixup_tree_root_location The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 75a0e2350cdb..a14a2b3a928b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5614,7 +5614,7 @@ out: * is kind of like crossing a mount point. 
*/ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, - struct inode *dir, + struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, struct btrfs_root **sub_root) @@ -5628,7 +5628,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, int err = 0; struct fscrypt_name fname; - ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname); + ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname); if (ret) return ret; @@ -5639,7 +5639,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, } err = -ENOENT; - key.objectid = BTRFS_I(dir)->root->root_key.objectid; + key.objectid = dir->root->root_key.objectid; key.type = BTRFS_ROOT_REF_KEY; key.offset = location->objectid; @@ -5652,7 +5652,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, leaf = path->nodes[0]; ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); - if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) || + if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) || btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len) goto out; @@ -5902,7 +5902,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) return inode; } - ret = fixup_tree_root_location(fs_info, dir, dentry, + ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, &location, &sub_root); if (ret < 0) { if (ret != -ENOENT) From 4c45a4f4de1b7083800954afa4821c67df157967 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 198/249] btrfs: pass btrfs_inode to inode_tree_add The function is for internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a14a2b3a928b..b1a4f53cc2c4 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5680,16 +5680,16 @@ out: return err; } -static void inode_tree_add(struct inode *inode) +static void inode_tree_add(struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_inode *entry; struct rb_node **p; struct rb_node *parent; - struct rb_node *new = &BTRFS_I(inode)->rb_node; - u64 ino = btrfs_ino(BTRFS_I(inode)); + struct rb_node *new = &inode->rb_node; + u64 ino = btrfs_ino(inode); - if (inode_unhashed(inode)) + if (inode_unhashed(&inode->vfs_inode)) return; parent = NULL; spin_lock(&root->inode_lock); @@ -5801,7 +5801,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino, ret = btrfs_read_locked_inode(inode, path); if (!ret) { - inode_tree_add(inode); + inode_tree_add(BTRFS_I(inode)); unlock_new_inode(inode); } else { iget_failed(inode); @@ -6568,7 +6568,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } - inode_tree_add(inode); + inode_tree_add(BTRFS_I(inode)); trace_btrfs_inode_new(inode); btrfs_set_inode_last_trans(trans, BTRFS_I(inode)); From 7a0443f031a68188a38e0845686747d4ccd2c16d Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 27 Oct 2022 02:41:32 +0200 Subject: [PATCH 199/249] btrfs: pass btrfs_inode to btrfs_inherit_iflags The function is for internal interfaces so we should use the btrfs_inode. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b1a4f53cc2c4..a242417a7e18 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6345,27 +6345,27 @@ void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) * * Currently only the compression flags and the cow flags are inherited. */ -static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) +static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir) { unsigned int flags; - flags = BTRFS_I(dir)->flags; + flags = dir->flags; if (flags & BTRFS_INODE_NOCOMPRESS) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->flags |= BTRFS_INODE_NOCOMPRESS; } else if (flags & BTRFS_INODE_COMPRESS) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->flags |= BTRFS_INODE_COMPRESS; } if (flags & BTRFS_INODE_NODATACOW) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; - if (S_ISREG(inode->i_mode)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + inode->flags |= BTRFS_INODE_NODATACOW; + if (S_ISREG(inode->vfs_inode.i_mode)) + inode->flags |= BTRFS_INODE_NODATASUM; } - btrfs_sync_inode_flags_to_i_flags(inode); + btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); } int btrfs_create_new_inode(struct btrfs_trans_handle *trans, @@ -6424,7 +6424,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, * change it now without compatibility issues. */ if (!args->subvol) - btrfs_inherit_iflags(inode, dir); + btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir)); if (S_ISREG(inode->i_mode)) { if (btrfs_test_opt(fs_info, NODATASUM)) From 99a81a444448ef6cde906500dbdf26aa5961e291 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 03:37:50 +0200 Subject: [PATCH 200/249] btrfs: switch async_chunk::inode to btrfs_inode The async_chunk::inode structure is for internal interfaces so we should use the btrfs_inode. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a242417a7e18..d2409d8a147a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -520,7 +520,7 @@ struct async_extent { }; struct async_chunk { - struct inode *inode; + struct btrfs_inode *inode; struct page *locked_page; u64 start; u64 end; @@ -648,7 +648,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, */ static noinline int compress_file_range(struct async_chunk *async_chunk) { - struct inode *inode = async_chunk->inode; + struct inode *inode = &async_chunk->inode->vfs_inode; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; @@ -1113,7 +1113,7 @@ out_free: */ static noinline void submit_compressed_extents(struct async_chunk *async_chunk) { - struct btrfs_inode *inode = BTRFS_I(async_chunk->inode); + struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct async_extent *async_extent; u64 alloc_hint = 0; @@ -1494,7 +1494,7 @@ static noinline void async_cow_start(struct btrfs_work *work) compressed_extents = compress_file_range(async_chunk); if (compressed_extents == 0) { - btrfs_add_delayed_iput(async_chunk->inode); + btrfs_add_delayed_iput(&async_chunk->inode->vfs_inode); async_chunk->inode = NULL; } } @@ -1534,7 +1534,7 @@ static noinline void async_cow_free(struct btrfs_work *work) async_chunk = container_of(work, struct async_chunk, work); if (async_chunk->inode) - btrfs_add_delayed_iput(async_chunk->inode); + btrfs_add_delayed_iput(&async_chunk->inode->vfs_inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); @@ -1602,7 +1602,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, */ ihold(&inode->vfs_inode); async_chunk[i].async_cow = ctx; - async_chunk[i].inode = &inode->vfs_inode; + async_chunk[i].inode = inode; async_chunk[i].start = start; async_chunk[i].end = cur_end; async_chunk[i].write_flags = write_flags; From 99a01bd6388e52dabb56015b939c1e9b26b0196e Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 03:47:16 +0200 Subject: [PATCH 201/249] btrfs: use btrfs_inode inside compress_file_range The function is mostly using internal interfaces so we should use the btrfs_inode. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d2409d8a147a..28725dd76db9 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -648,8 +648,8 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, */ static noinline int compress_file_range(struct async_chunk *async_chunk) { - struct inode *inode = &async_chunk->inode->vfs_inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = async_chunk->inode; + struct btrfs_fs_info *fs_info = inode->root->fs_info; u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; @@ -666,8 +666,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) int compressed_extents = 0; int redirty = 0; - inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1, - SZ_16K); + inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); /* * We need to save i_size before now because it could change in between @@ -679,7 +678,7 @@ static noinline int compress_file_range(struct async_chunk *async_chunk) * does that for us. */ barrier(); - i_size = i_size_read(inode); + i_size = i_size_read(&inode->vfs_inode); barrier(); actual_end = min_t(u64, i_size, end + 1); again: @@ -708,7 +707,7 @@ again: * isn't an inline extent, since it doesn't save disk space at all. */ if (total_compressed <= blocksize && - (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) + (start > 0 || end + 1 < inode->disk_i_size)) goto cleanup_and_bail_uncompressed; /* @@ -732,7 +731,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (inode_need_compress(BTRFS_I(inode), start, end)) { + if (inode_need_compress(inode, start, end)) { WARN_ON(pages); pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) { @@ -741,10 +740,10 @@ again: goto cont; } - if (BTRFS_I(inode)->defrag_compress) - compress_type = BTRFS_I(inode)->defrag_compress; - else if (BTRFS_I(inode)->prop_compress) - compress_type = BTRFS_I(inode)->prop_compress; + if (inode->defrag_compress) + compress_type = inode->defrag_compress; + else if (inode->prop_compress) + compress_type = inode->prop_compress; /* * we need to call clear_page_dirty_for_io on each @@ -759,14 +758,14 @@ again: * has moved, the end is the original one. */ if (!redirty) { - extent_range_clear_dirty_for_io(inode, start, end); + extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); redirty = 1; } /* Compression level is applied here and only here */ ret = btrfs_compress_pages( compress_type | (fs_info->compress_level << 4), - inode->i_mapping, start, + inode->vfs_inode.i_mapping, start, pages, &nr_pages, &total_in, @@ -795,12 +794,12 @@ cont: /* we didn't compress the entire range, try * to make an uncompressed inline extent. */ - ret = cow_file_range_inline(BTRFS_I(inode), actual_end, + ret = cow_file_range_inline(inode, actual_end, 0, BTRFS_COMPRESS_NONE, NULL, false); } else { /* try making a compressed inline extent */ - ret = cow_file_range_inline(BTRFS_I(inode), actual_end, + ret = cow_file_range_inline(inode, actual_end, total_compressed, compress_type, pages, false); @@ -823,7 +822,7 @@ cont: * our outstanding extent for clearing delalloc for this * range. 
*/ - extent_clear_unlock_delalloc(BTRFS_I(inode), start, end, + extent_clear_unlock_delalloc(inode, start, end, NULL, clear_flags, PAGE_UNLOCK | @@ -898,8 +897,8 @@ cont: /* flag the file so we don't compress in the future */ if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && - !(BTRFS_I(inode)->prop_compress)) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + !(inode->prop_compress)) { + inode->flags |= BTRFS_INODE_NOCOMPRESS; } } cleanup_and_bail_uncompressed: @@ -917,7 +916,7 @@ cleanup_and_bail_uncompressed: } if (redirty) - extent_range_redirty_for_io(inode, start, end); + extent_range_redirty_for_io(&inode->vfs_inode, start, end); add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0, BTRFS_COMPRESS_NONE); compressed_extents++; From 5fc24314c89491cb2f53583d5e513731d58c7699 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 03:47:16 +0200 Subject: [PATCH 202/249] btrfs: use btrfs_inode inside btrfs_verify_data_csum The function is mostly using internal interfaces so we should use the btrfs_inode. Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 28725dd76db9..d3e5a5ff2d5c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3529,10 +3529,10 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end) { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct extent_io_tree *io_tree = &inode->io_tree; const u32 sectorsize = root->fs_info->sectorsize; u32 pg_off; unsigned int result = 0; @@ -3545,7 +3545,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, if (bbio->csum == NULL) return 0; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) + if (inode->flags & BTRFS_INODE_NODATASUM) return 0; if (unlikely(test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))) @@ -3569,7 +3569,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, EXTENT_NODATASUM); continue; } - ret = btrfs_check_data_csum(BTRFS_I(inode), bbio, bio_offset, page, pg_off); + ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> root->fs_info->sectorsize_bits; From e55cf7ca85e323028774feeb117ad94358a78070 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 28 Oct 2022 03:53:04 +0200 Subject: [PATCH 203/249] btrfs: pass btrfs_inode to btrfs_add_delayed_iput The function is for internal interfaces so we should use the btrfs_inode. 
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/free-space-cache.c | 4 ++-- fs/btrfs/inode.c | 19 +++++++++---------- fs/btrfs/ordered-data.c | 2 +- fs/btrfs/relocation.c | 4 ++-- fs/btrfs/tree-log.c | 24 ++++++++++++------------ 7 files changed, 28 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 9e31dc8b0285..195c09e20609 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -501,7 +501,7 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans, int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); int btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size); -void btrfs_add_delayed_iput(struct inode *inode); +void btrfs_add_delayed_iput(struct btrfs_inode *inode); void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info); int btrfs_prealloc_file_range(struct inode *inode, int mode, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 05768f7f7872..859a41624c31 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3228,7 +3228,7 @@ retry: if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) mapping->writeback_index = done_index; - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); return ret; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 627bd6120368..0d250d052487 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -260,7 +260,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, } ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); goto out; } clear_nlink(inode); @@ -274,7 +274,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, spin_unlock(&block_group->lock); } /* One for the lookup ref */ - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); key.objectid = BTRFS_FREE_SPACE_OBJECTID; key.type = 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d3e5a5ff2d5c..aec1b232a71c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1493,7 +1493,7 @@ static noinline void async_cow_start(struct btrfs_work *work) compressed_extents = compress_file_range(async_chunk); if (compressed_extents == 0) { - btrfs_add_delayed_iput(&async_chunk->inode->vfs_inode); + btrfs_add_delayed_iput(async_chunk->inode); async_chunk->inode = NULL; } } @@ -1533,7 +1533,7 @@ static noinline void async_cow_free(struct btrfs_work *work) async_chunk = container_of(work, struct async_chunk, work); if (async_chunk->inode) - btrfs_add_delayed_iput(&async_chunk->inode->vfs_inode); + btrfs_add_delayed_iput(async_chunk->inode); if (async_chunk->blkcg_css) css_put(async_chunk->blkcg_css); @@ -3016,7 +3016,7 @@ out_page: * that could need flushing space. Recursing back to fixup worker would * deadlock. */ - btrfs_add_delayed_iput(&inode->vfs_inode); + btrfs_add_delayed_iput(inode); } /* @@ -3590,18 +3590,17 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, * the inode to the delayed iput machinery. Delayed iputs are processed at * transaction commit time/superblock commit/cleaner kthread. 
*/ -void btrfs_add_delayed_iput(struct inode *inode) +void btrfs_add_delayed_iput(struct btrfs_inode *inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_inode *binode = BTRFS_I(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; - if (atomic_add_unless(&inode->i_count, -1, 1)) + if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; atomic_inc(&fs_info->nr_delayed_iputs); spin_lock(&fs_info->delayed_iput_lock); - ASSERT(list_empty(&binode->delayed_iput)); - list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); + ASSERT(list_empty(&inode->delayed_iput)); + list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs); spin_unlock(&fs_info->delayed_iput_lock); if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); @@ -9661,7 +9660,7 @@ static int start_delalloc_inodes(struct btrfs_root *root, &work->work); } else { ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret || wbc->nr_to_write <= 0) goto out; } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 8fda1949b71b..4bed0839b640 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -504,7 +504,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) - btrfs_add_delayed_iput(entry->inode); + btrfs_add_delayed_iput(BTRFS_I(entry->inode)); while (!list_empty(&entry->list)) { cur = entry->list.next; sum = list_entry(cur, struct btrfs_ordered_sum, list); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 1440cb332a5a..8914aa920bb7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1117,7 +1117,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, inode = find_next_inode(root, key.objectid); first = 0; } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); inode = find_next_inode(root, key.objectid); } if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) { @@ -1181,7 +1181,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, if (dirty) btrfs_mark_buffer_dirty(leaf); if (inode) - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); return ret; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index b6e99ef99679..f7b1bb9c63e4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5473,7 +5473,7 @@ again: } if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(di_inode); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); break; } @@ -5482,7 +5482,7 @@ again: log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); - btrfs_add_delayed_iput(di_inode); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); if (ret) goto out; if (ctx->log_new_dentries) { @@ -5676,11 +5676,11 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * so that the log ends up with the new name and without the old name. 
 */ if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); return 0; } - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); if (!ino_elem) @@ -5755,7 +5755,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, */ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_ALL, ctx); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret) break; continue; @@ -5772,7 +5772,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * that, we can avoid doing it again. */ if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); continue; } @@ -5784,7 +5784,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * log with the new name before we unpin it. */ ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret) break; } @@ -6294,7 +6294,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, } if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(di_inode); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); continue; } @@ -6307,7 +6307,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, if (!ret && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); - btrfs_add_delayed_iput(di_inode); + btrfs_add_delayed_iput(BTRFS_I(di_inode)); if (ret) break; @@ -6768,7 +6768,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, } if (!need_log_inode(trans, BTRFS_I(dir_inode))) { - btrfs_add_delayed_iput(dir_inode); + btrfs_add_delayed_iput(BTRFS_I(dir_inode)); continue; } @@ -6778,7 +6778,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (!ret && ctx->log_new_dentries) ret = log_new_dir_dentries(trans, BTRFS_I(dir_inode), ctx); - btrfs_add_delayed_iput(dir_inode); + btrfs_add_delayed_iput(BTRFS_I(dir_inode)); if (ret) goto out; } @@ -6823,7 +6823,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, need_log_inode(trans, BTRFS_I(inode))) ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(inode); + btrfs_add_delayed_iput(BTRFS_I(inode)); if (ret) return ret; From 2942a50dea74126bf3395b3060a808fb046136fc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 7 Nov 2022 15:32:29 +0800 Subject: [PATCH 204/249] btrfs: raid56: introduce btrfs_raid_bio::error_bitmap Currently btrfs raid56 uses btrfs_raid_bio::faila and failb to indicate which stripe(s) had IO errors. But that has some problems: - If one sector failed the csum check, the whole stripe containing the corruption will be marked as having an error. This can reduce the chance of a successful recovery, like this: 0 4K 8K Data 1 |XX| | Data 2 | |XX| Parity | | | In the above case, 0~4K in data 1 should be recovered using data 2 and parity, while 4K~8K in data 2 should be recovered using data 1 and parity. Currently if we trigger a read on 0~4K of data 1, we will also recover 4K~8K of data 1 using the corrupted data 2 and parity, caching a wrong result in the rbio cache. - Harder to expand for a future M-N scheme As we're limited to just faila/b, we can track at most two corruptions. - Harder to expand to handle extra csum errors This can be problematic if we start to do csum verification.
This patch introduces an extra @error_bitmap, where one bit represents an error that happened in that sector. The choice to introduce a new error bitmap rather than reusing sector_ptr is to avoid an extra search between rbio::stripe_sectors[] and rbio::bio_sectors[]. Since we can submit bios using sectors from both arrays, doing a proper search on both arrays would be more complex. Although the new bitmap takes extra memory, later we can remove things like @error and faila/b to save some memory. Currently the new error bitmap and the faila/b mechanism coexist; the error bitmap is only updated at endio time and at the recovery entrances. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 99 +++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/raid56.h | 11 ++++++ 2 files changed, 103 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 629237102189..e14968a37c08 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -76,6 +76,7 @@ static void scrub_rbio_work_locked(struct work_struct *work); static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) { + bitmap_free(rbio->error_bitmap); kfree(rbio->stripe_pages); kfree(rbio->bio_sectors); kfree(rbio->stripe_sectors); @@ -950,9 +951,10 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), GFP_NOFS); rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); + rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || - !rbio->finish_pointers) { + !rbio->finish_pointers || !rbio->error_bitmap) { free_raid_bio_pointers(rbio); kfree(rbio); return ERR_PTR(-ENOMEM); @@ -1044,8 +1046,11 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, disk_start = stripe->physical + sector_nr * sectorsize; /* if the device is missing, just fail this stripe */ - if (!stripe->dev->bdev) + if (!stripe->dev->bdev) { + set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr, + rbio->error_bitmap); return fail_rbio_index(rbio, stripe_nr); + } /* see if we can add this page onto our existing bio */ if (last) { @@ -1209,6 +1214,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, * write. */ atomic_set(&rbio->error, 0); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); rbio->faila = -1; rbio->failb = -1; @@ -1332,6 +1338,40 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, return -1; } +static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + rbio->bioc->raid_map[0]; + int total_nr_sector = offset >> fs_info->sectorsize_bits; + + ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); + + bitmap_set(rbio->error_bitmap, total_nr_sector, + bio->bi_iter.bi_size >> fs_info->sectorsize_bits); + + /* + * Special handling for raid56_alloc_missing_rbio() used by + * scrub/replace. Unlike the call path in raid56_parity_recover(), they + * pass an empty bio here. Thus we have to find out the missing device + * and mark the stripe error instead.
+ */ + if (bio->bi_iter.bi_size == 0) { + bool found_missing = false; + int stripe_nr; + + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + if (!rbio->bioc->stripes[stripe_nr].dev->bdev) { + found_missing = true; + bitmap_set(rbio->error_bitmap, + stripe_nr * rbio->stripe_nsectors, + rbio->stripe_nsectors); + } + } + ASSERT(found_missing); + } +} + /* * returns -EIO if we had too many failures */ @@ -1423,14 +1463,49 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) } } +static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + struct bio_vec *bv = bio_first_bvec_all(bio); + int i; + + for (i = 0; i < rbio->nr_sectors; i++) { + struct sector_ptr *sector; + + sector = &rbio->stripe_sectors[i]; + if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + break; + sector = &rbio->bio_sectors[i]; + if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + break; + } + ASSERT(i < rbio->nr_sectors); + return i; +} + +static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + int total_sector_nr = get_bio_sector_nr(rbio, bio); + u32 bio_size = 0; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) + bio_size += bvec->bv_len; + + bitmap_set(rbio->error_bitmap, total_sector_nr, + bio_size >> rbio->bioc->fs_info->sectorsize_bits); +} + static void raid_wait_read_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_status) + if (bio->bi_status) { fail_bio_stripe(rbio, bio); - else + rbio_update_error_bitmap(rbio, bio); + } else { set_bio_pages_uptodate(rbio, bio); + } bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) @@ -1863,10 +1938,10 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, struct sector_ptr *sector; if (rbio->faila == stripe || rbio->failb == stripe) { - atomic_inc(&rbio->error); /* Skip the current stripe. */ ASSERT(sectornr == 0); total_sector_nr += rbio->stripe_nsectors - 1; + atomic_inc(&rbio->error); continue; } sector = rbio_stripe_sector(rbio, stripe, sectornr); @@ -1891,9 +1966,10 @@ static int recover_rbio(struct btrfs_raid_bio *rbio) /* * Either we're doing recover for a read failure or degraded write, - * caller should have set faila/b correctly. + * caller should have set faila/b and error bitmap correctly. 
 */ ASSERT(rbio->faila >= 0 || rbio->failb >= 0); + ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); bio_list_init(&bio_list); /* @@ -1978,6 +2054,8 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio->operation = BTRFS_RBIO_READ_REBUILD; rbio_add_bio(rbio, bio); + set_rbio_range_error(rbio, bio); + rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { btrfs_warn(fs_info, @@ -2038,8 +2116,10 @@ static void raid_wait_write_end_io(struct bio *bio) struct btrfs_raid_bio *rbio = bio->bi_private; blk_status_t err = bio->bi_status; - if (err) + if (err) { fail_bio_stripe(rbio, bio); + rbio_update_error_bitmap(rbio, bio); + } bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) wake_up(&rbio->io_wait); @@ -2117,6 +2197,7 @@ write: spin_unlock_irq(&rbio->bio_list_lock); atomic_set(&rbio->error, 0); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); index_rbio_pages(rbio); @@ -2328,6 +2409,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) } atomic_set(&rbio->error, 0); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ pointers[nr_data] = kmap_local_page(p_sector.page); @@ -2533,6 +2615,8 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) goto cleanup; atomic_set(&rbio->error, 0); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + ret = scrub_assemble_read_bios(rbio, &bio_list); if (ret < 0) goto cleanup; @@ -2612,6 +2696,7 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) */ ASSERT(!bio->bi_iter.bi_size); + set_rbio_range_error(rbio, bio); rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { btrfs_warn_rl(fs_info, diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 9fae97b7a2a5..e38da4fa76d6 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -126,6 +126,17 @@ struct btrfs_raid_bio { /* Allocated with real_stripes-many pointers for finish_*() calls */ void **finish_pointers; + + /* + * The bitmap recording where IO errors happened. + * Each bit corresponds to one sector in either the bio_sectors[] or + * stripe_sectors[] array. + * + * The reason we don't use another bit in sector_ptr is that we have + * two arrays of sectors, and a lot of IO can use sectors in both + * arrays, making it much harder to iterate. + */ + unsigned long *error_bitmap; }; /* From 75b47033296595efb208cc563cbb8cf4fb7c3ebc Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 7 Nov 2022 15:32:30 +0800 Subject: [PATCH 205/249] btrfs: raid56: migrate recovery and scrub recovery path to use error_bitmap Since we have rbio::error_bitmap to indicate exactly where the errors are (including read errors and csum mismatch errors), we can make the recovery path more accurate. For example: 0 32K 64K Data 1 |XXXXXXXX| | Data 2 | |XXXXXXXXX| Parity | | | 1) Get a csum mismatch when reading data 1 [0, 32K) 2) Mark the corresponding range as error The old code will mark the whole data 1 stripe as error, while the new code will only mark data 1 [0, 32K) as error. 3) Recovery path The old code will recover data 1 [0, 64K), all using Data 2 and parity. This means Data 1 [32K, 64K) will be corrupted data, as data 2 [32K, 64K) is already corrupted. The new code will only recover data 1 [0, 32K), as only that range has errors so far. This new behavior avoids populating the rbio cache with incorrect data.
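The per-sector granularity falls out of the bitmap layout introduced in the previous patch: bit (stripe_nr * stripe_nsectors + sector_nr) covers sector sector_nr of stripe stripe_nr, so counting the errors in one vertical stripe is a walk over the same sector_nr in every stripe. A simplified sketch of what get_rbio_veritical_errors() below does (faila/failb tracking trimmed):

  int stripe_nr;
  int found_errors = 0;

  for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
          int bit = stripe_nr * rbio->stripe_nsectors + sector_nr;

          if (test_bit(bit, rbio->error_bitmap))
                  found_errors++;
  }

  /* This vertical stripe is recoverable iff
   * found_errors <= rbio->bioc->max_errors. */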
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 281 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 194 insertions(+), 87 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index e14968a37c08..fd67ba4648c9 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1013,6 +1013,36 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) return 0; } +/* + * Return the total number of errors found in the vertical stripe of @sector_nr. + * + * @faila and @failb will also be updated to the first and second stripe + * numbers of the errors. + */ +static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, + int *faila, int *failb) +{ + int stripe_nr; + int found_errors = 0; + + ASSERT(faila && failb); + *faila = -1; + *failb = -1; + + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr; + + if (test_bit(total_sector_nr, rbio->error_bitmap)) { + found_errors++; + if (*faila < 0) + *faila = stripe_nr; + else if (*failb < 0) + *failb = stripe_nr; + } + } + return found_errors; +} + /* * Add a single sector @sector into our list of bios for IO. * @@ -1740,14 +1770,15 @@ fail: * @*pointers are the pre-allocated pointers by the caller, so we don't * need to allocate/free the pointers again and again. */ -static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, - void **pointers, void **unmap_array) +static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + void **pointers, void **unmap_array) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; struct sector_ptr *sector; const u32 sectorsize = fs_info->sectorsize; - const int faila = rbio->faila; - const int failb = rbio->failb; + int found_errors; + int faila; + int failb; int stripe_nr; /* @@ -1756,7 +1787,19 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, */ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && !test_bit(sector_nr, &rbio->dbitmap)) - return; + return 0; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, + &failb); + /* + * No errors in the vertical stripe, skip it. Can happen for a recovery + * where only part of a stripe failed the csum check. + */ + if (!found_errors) + return 0; + + if (found_errors > rbio->bioc->max_errors) + return -EIO; /* * Setup our array of pointers with sectors from each stripe */ for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { /* - * If we're rebuilding a read, we have to use - * pages from the bio list + * If we're rebuilding a read, we have to use pages from the + * bio list if possible. */ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && - (stripe_nr == faila || stripe_nr == failb)) { + rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); @@ -1859,18 +1901,19 @@ pstripe: * Especially if we determine to cache the rbio, we need to * have at least all data sectors uptodate.
 */ - if (rbio->faila >= 0) { - sector = rbio_stripe_sector(rbio, rbio->faila, sector_nr); + if (faila >= 0) { + sector = rbio_stripe_sector(rbio, faila, sector_nr); sector->uptodate = 1; } - if (rbio->failb >= 0) { - sector = rbio_stripe_sector(rbio, rbio->failb, sector_nr); + if (failb >= 0) { + sector = rbio_stripe_sector(rbio, failb, sector_nr); sector->uptodate = 1; } cleanup: for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) kunmap_local(unmap_array[stripe_nr]); + return 0; } static int recover_sectors(struct btrfs_raid_bio *rbio) @@ -1893,10 +1936,6 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) goto out; } - /* Make sure faila and fail b are in order. */ - if (rbio->faila >= 0 && rbio->failb >= 0 && rbio->faila > rbio->failb) - swap(rbio->faila, rbio->failb); - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { spin_lock_irq(&rbio->bio_list_lock); @@ -1906,8 +1945,11 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) - recover_vertical(rbio, sectornr, pointers, unmap_array); + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + ret = recover_vertical(rbio, sectornr, pointers, unmap_array); + if (ret < 0) + break; + } out: kfree(pointers); @@ -1937,13 +1979,21 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, int sectornr = total_sector_nr % rbio->stripe_nsectors; struct sector_ptr *sector; - if (rbio->faila == stripe || rbio->failb == stripe) { - /* Skip the current stripe. */ - ASSERT(sectornr == 0); - total_sector_nr += rbio->stripe_nsectors - 1; - atomic_inc(&rbio->error); + /* + * Skip the range which has an error. It can be a range which is + * marked as error (for a csum mismatch), or it can be a missing + * device. + */ + if (!rbio->bioc->stripes[stripe].dev->bdev || + test_bit(total_sector_nr, rbio->error_bitmap)) { + /* + * Also set the error bit for a missing device, which + * may not yet have its error bit set. + */ + set_bit(total_sector_nr, rbio->error_bitmap); continue; } + sector = rbio_stripe_sector(rbio, stripe, sectornr); ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_READ); @@ -1966,9 +2016,8 @@ static int recover_rbio(struct btrfs_raid_bio *rbio) /* * Either we're doing recover for a read failure or degraded write, - * caller should have set faila/b and error bitmap correctly. + * caller should have set error bitmap correctly. */ - ASSERT(rbio->faila >= 0 || rbio->failb >= 0); ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); bio_list_init(&bio_list); /* @@ -1992,12 +2041,6 @@ static int recover_rbio(struct btrfs_raid_bio *rbio) submit_read_bios(rbio, &bio_list); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); - /* We have more errors than our tolerance during the read. */ - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { - ret = -EIO; - goto out; - } - ret = recover_sectors(rbio); out: @@ -2032,6 +2075,51 @@ static void recover_rbio_work_locked(struct work_struct *work) rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } +static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) +{ + bool found = false; + int sector_nr; + + /* + * This is for RAID6 extra recovery tries, thus the mirror number should + * be larger than 2. + * Mirror 1 means read from data stripes. Mirror 2 means rebuild using + * RAID5 methods.
+ */ + ASSERT(mirror_num > 2); + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int found_errors; + int faila; + int failb; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + &faila, &failb); + /* This vertical stripe doesn't have errors. */ + if (!found_errors) + continue; + + /* + * If we found errors, there should be only one error marked + * by previous set_rbio_range_error(). + */ + ASSERT(found_errors == 1); + found = true; + + /* Now select another stripe to mark as error. */ + failb = rbio->real_stripes - (mirror_num - 1); + if (failb <= faila) + failb--; + + /* Set the extra bit in error bitmap. */ + if (failb >= 0) + set_bit(failb * rbio->stripe_nsectors + sector_nr, + rbio->error_bitmap); + } + + /* We should have found at least one vertical stripe with error. */ + ASSERT(found); +} + /* * the main entry point for reads from the higher layers. This * is really only called when the normal read path had a failure, * so we assume the bio they send down corresponds to a failed part * of the drive. */ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, @@ -2074,11 +2162,7 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, * for 'mirror_num > 2', select a stripe to fail on every retry. */ if (mirror_num > 2) { - /* - * 'mirror == 3' is to fail the p stripe and - * reconstruct from the q stripe. 'mirror > 3' is to - * fail a data stripe and reconstruct from p+q stripe. - */ + set_rbio_raid6_extra_error(rbio, mirror_num); rbio->failb = rbio->real_stripes - (mirror_num - 1); ASSERT(rbio->failb > 0); if (rbio->failb <= rbio->faila) @@ -2507,48 +2591,85 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) { - int dfail = 0, failp = -1; + void **pointers = NULL; + void **unmap_array = NULL; + int sector_nr; int ret; - /* No error case should be already handled by the caller. */ - ASSERT(rbio->faila >= 0 || rbio->failb >= 0); - - if (is_data_stripe(rbio, rbio->faila)) - dfail++; - else if (is_parity_stripe(rbio->faila)) - failp = rbio->faila; - - if (is_data_stripe(rbio, rbio->failb)) - dfail++; - else if (is_parity_stripe(rbio->failb)) - failp = rbio->failb; - /* - * Because we can not use a scrubbing parity to repair - * the data, so the capability of the repair is declined. - * (In the case of RAID5, we can not repair anything) + * @pointers array stores the pointer for each sector. + * + * @unmap_array stores a copy of the pointers that does not get reordered + * during reconstruction so that kunmap_local works. */ - if (dfail > rbio->bioc->max_errors - 1) - return -EIO; + pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + if (!pointers || !unmap_array) { + ret = -ENOMEM; + goto out; + } - /* - * If all data is good, only parity is correctly, just - * repair the parity. - */ - if (dfail == 0) - return 0; + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int dfail = 0, failp = -1; + int faila; + int failb; + int found_errors; - /* - * Here means we got one corrupted data stripe and one - * corrupted parity on RAID6, if the corrupted parity - * is scrubbing parity, luckily, use the other one to repair - * the data, or we can not repair the data stripe. - */ - if (failp != rbio->scrubp) - return -EIO; + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + &faila, &failb); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + goto out; + } + if (found_errors == 0) + continue; - /* We have some corrupted sectors, need to repair them.
*/ - ret = recover_sectors(rbio); + /* We should have at least one error here. */ + ASSERT(faila >= 0 || failb >= 0); + + if (is_data_stripe(rbio, faila)) + dfail++; + else if (is_parity_stripe(faila)) + failp = faila; + + if (is_data_stripe(rbio, failb)) + dfail++; + else if (is_parity_stripe(failb)) + failp = failb; + /* + * Because we can not use a scrubbing parity to repair the + * data, so the capability of the repair is declined. (In the + * case of RAID5, we can not repair anything.) + */ + if (dfail > rbio->bioc->max_errors - 1) { + ret = -EIO; + goto out; + } + /* + * If all data is good, only parity is correctly, just repair + * the parity, no need to recover data stripes. + */ + if (dfail == 0) + continue; + + /* + * Here means we got one corrupted data stripe and one + * corrupted parity on RAID6, if the corrupted parity is + * scrubbing parity, luckily, use the other one to repair the + * data, or we can not repair the data stripe. + */ + if (failp != rbio->scrubp) { + ret = -EIO; + goto out; + } + + ret = recover_vertical(rbio, sector_nr, pointers, unmap_array); + if (ret < 0) + goto out; + } +out: + kfree(pointers); + kfree(unmap_array); return ret; } @@ -2624,25 +2745,11 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) submit_read_bios(rbio, &bio_list); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { - ret = -EIO; - goto cleanup; - } - /* - * No error during read, can finish the scrub and need to verify the - * P/Q sectors; - */ - if (atomic_read(&rbio->error) == 0) { - need_check = true; - goto finish; - } - - /* We have some failures, need to recover the failed sectors first. */ + /* We may have some failures, recover the failed sectors first. */ ret = recover_scrub_rbio(rbio); if (ret < 0) goto cleanup; -finish: /* * We have every sector properly prepared. Can finish the scrub * and writeback the good content. From ad3daf1c3f5bcbba49a302d8d0e46467556bf6f3 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 7 Nov 2022 15:32:31 +0800 Subject: [PATCH 206/249] btrfs: raid56: remove the old error tracking system Since all the recovery paths have been migrated to the new error bitmap based system, we can remove the old stripe number based system. This cleanup involves one behavior change: - Rebuild rbio can no longer be merged Previously a rebuild rbio (caused by retry after data csum mismatch) can be merged, if the error happens in the same stripe. But with the new error bitmap based solution, it's much harder to compare error bitmaps. So here we just don't merge rebuild rbio at all. This may introduce some performance impact at extreme corner cases, but we're willing to take it. Other than that, this patch will cleanup the following members: - rbio::faila - rbio::failb They will be replaced by per-vertical stripe check, which is more accurate. - rbio::error It will be replace by per-vertical stripe error bitmap check. - Allow get_rbio_vertical_errors() to accept NULL pointers for @faila and @failb Some call sites only want to check if we have errors beyond the tolerance. 
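To make the new scheme concrete, the per-vertical-stripe lookup described
above can be sketched as standalone C. This is an illustrative toy, not the
kernel code: the stripe counts, the flat byte array standing in for the
bitmap, and vertical_errors() itself are all assumptions made for the
example. It shows the two call styles the change enables: full faila/failb
reporting, and the NULL/NULL form for callers that only care whether the
error count exceeds the tolerance.

#include <assert.h>
#include <stdio.h>

#define REAL_STRIPES	4	/* e.g. RAID6: two data stripes + P + Q */
#define STRIPE_NSECTORS	8

static unsigned char error_bitmap[REAL_STRIPES * STRIPE_NSECTORS];

/* Count the errors in one vertical stripe; @faila/@failb may be NULL. */
static int vertical_errors(int sector_nr, int *faila, int *failb)
{
	int found = 0;

	if (faila || failb) {
		/* Both pointers must be valid if either is given. */
		assert(faila && failb);
		*faila = -1;
		*failb = -1;
	}
	for (int stripe = 0; stripe < REAL_STRIPES; stripe++) {
		if (!error_bitmap[stripe * STRIPE_NSECTORS + sector_nr])
			continue;
		found++;
		if (faila && *faila < 0)
			*faila = stripe;
		else if (failb && *failb < 0)
			*failb = stripe;
	}
	return found;
}

int main(void)
{
	int fa, fb;

	error_bitmap[0 * STRIPE_NSECTORS + 3] = 1;	/* data stripe 0, sector 3 */
	error_bitmap[2 * STRIPE_NSECTORS + 3] = 1;	/* P stripe, sector 3 */

	printf("sector 3: %d errors\n", vertical_errors(3, &fa, &fb));
	printf("faila=%d failb=%d\n", fa, fb);
	/* Tolerance-only check, the NULL/NULL style: */
	printf("sector 0: %d errors\n", vertical_errors(0, NULL, NULL));
	return 0;
}

Tracking errors per vertical stripe rather than per rbio is what lets each
sector be judged against max_errors independently, instead of one global
counter condemning the whole rbio.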
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 234 +++++++++++----------------------------------- fs/btrfs/raid56.h | 8 -- 2 files changed, 53 insertions(+), 189 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index fd67ba4648c9..11be5d0a7eab 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -66,8 +66,6 @@ struct sector_ptr { static void rmw_rbio_work(struct work_struct *work); static void rmw_rbio_work_locked(struct work_struct *work); -static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); -static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); @@ -588,28 +586,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - if (last->operation == BTRFS_RBIO_REBUILD_MISSING) + if (last->operation == BTRFS_RBIO_REBUILD_MISSING || + last->operation == BTRFS_RBIO_READ_REBUILD) return 0; - if (last->operation == BTRFS_RBIO_READ_REBUILD) { - int fa = last->faila; - int fb = last->failb; - int cur_fa = cur->faila; - int cur_fb = cur->failb; - - if (last->faila >= last->failb) { - fa = last->failb; - fb = last->faila; - } - - if (cur->faila >= cur->failb) { - cur_fa = cur->failb; - cur_fb = cur->faila; - } - - if (fa != cur_fa || fb != cur_fb) - return 0; - } return 1; } @@ -973,10 +953,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->real_stripes = real_stripes; rbio->stripe_npages = stripe_npages; rbio->stripe_nsectors = stripe_nsectors; - rbio->faila = -1; - rbio->failb = -1; refcount_set(&rbio->refs, 1); - atomic_set(&rbio->error, 0); atomic_set(&rbio->stripes_pending, 0); ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); @@ -1025,19 +1002,28 @@ static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, int stripe_nr; int found_errors = 0; - ASSERT(faila && failb); - *faila = -1; - *failb = -1; + if (faila || failb) { + /* + * Both @faila and @failb should be valid pointers if any of + * them is specified. + */ + ASSERT(faila && failb); + *faila = -1; + *failb = -1; + } for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr; if (test_bit(total_sector_nr, rbio->error_bitmap)) { found_errors++; - if (*faila < 0) - *faila = stripe_nr; - else if (*failb < 0) - *failb = stripe_nr; + if (faila) { + /* Update faila and failb. */ + if (*faila < 0) + *faila = stripe_nr; + else if (*failb < 0) + *failb = stripe_nr; + } } } return found_errors; @@ -1077,9 +1063,17 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, /* if the device is missing, just fail this stripe */ if (!stripe->dev->bdev) { + int found_errors; + set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr, rbio->error_bitmap); - return fail_rbio_index(rbio, stripe_nr); + + /* Check if we have reached tolerance early. */ + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + NULL, NULL); + if (found_errors > rbio->bioc->max_errors) + return -EIO; + return 0; } /* see if we can add this page onto our existing bio */ @@ -1243,10 +1237,7 @@ static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, * Reset errors, as we may have errors inherited from from degraded * write. */ - atomic_set(&rbio->error, 0); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); - rbio->faila = -1; - rbio->failb = -1; /* * Start assembly. 
Make bios for everything from the higher layers (the @@ -1324,50 +1315,6 @@ error: return -EIO; } -/* - * helper to find the stripe number for a given bio. Used to figure out which - * stripe has failed. This expects the bio to correspond to a physical disk, - * so it looks up based on physical sector numbers. - */ -static int find_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - u64 physical = bio->bi_iter.bi_sector; - int i; - struct btrfs_io_stripe *stripe; - - physical <<= 9; - - for (i = 0; i < rbio->bioc->num_stripes; i++) { - stripe = &rbio->bioc->stripes[i]; - if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) && - stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { - return i; - } - } - return -1; -} - -/* - * helper to find the stripe number for a given - * bio (before mapping). Used to figure out which stripe has - * failed. This looks up based on logical block numbers. - */ -static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - u64 logical = bio->bi_iter.bi_sector << 9; - int i; - - for (i = 0; i < rbio->nr_data; i++) { - u64 stripe_start = rbio->bioc->raid_map[i]; - - if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN)) - return i; - } - return -1; -} - static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) { struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; @@ -1402,52 +1349,6 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) } } -/* - * returns -EIO if we had too many failures - */ -static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) -{ - unsigned long flags; - int ret = 0; - - spin_lock_irqsave(&rbio->bio_list_lock, flags); - - /* we already know this stripe is bad, move on */ - if (rbio->faila == failed || rbio->failb == failed) - goto out; - - if (rbio->faila == -1) { - /* first failure on this rbio */ - rbio->faila = failed; - atomic_inc(&rbio->error); - } else if (rbio->failb == -1) { - /* second failure on this rbio */ - rbio->failb = failed; - atomic_inc(&rbio->error); - } else { - ret = -EIO; - } -out: - spin_unlock_irqrestore(&rbio->bio_list_lock, flags); - - return ret; -} - -/* - * helper to fail a stripe based on a physical disk - * bio. - */ -static int fail_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - int failed = find_bio_stripe(rbio, bio); - - if (failed < 0) - return -EIO; - - return fail_rbio_index(rbio, failed); -} - /* * For subpage case, we can no longer set page Uptodate directly for * stripe_pages[], thus we need to locate the sector. @@ -1530,12 +1431,10 @@ static void raid_wait_read_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - if (bio->bi_status) { - fail_bio_stripe(rbio, bio); + if (bio->bi_status) rbio_update_error_bitmap(rbio, bio); - } else { + else set_bio_pages_uptodate(rbio, bio); - } bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) @@ -2021,12 +1920,6 @@ static int recover_rbio(struct btrfs_raid_bio *rbio) ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); bio_list_init(&bio_list); - /* - * Reset error to 0, as we will later increase error for missing - * devices. - */ - atomic_set(&rbio->error, 0); - /* For recovery, we need to read all sectors including P/Q. 
*/ ret = alloc_rbio_pages(rbio); if (ret < 0) @@ -2144,30 +2037,13 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, set_rbio_range_error(rbio, bio); - rbio->faila = find_logical_bio_stripe(rbio, bio); - if (rbio->faila == -1) { - btrfs_warn(fs_info, -"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", - __func__, bio->bi_iter.bi_sector << 9, - (u64)bio->bi_iter.bi_size, bioc->map_type); - free_raid_bio(rbio); - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); - return; - } - /* * Loop retry: * for 'mirror == 2', reconstruct from all other stripes. * for 'mirror_num > 2', select a stripe to fail on every retry. */ - if (mirror_num > 2) { + if (mirror_num > 2) set_rbio_raid6_extra_error(rbio, mirror_num); - rbio->failb = rbio->real_stripes - (mirror_num - 1); - ASSERT(rbio->failb > 0); - if (rbio->failb <= rbio->faila) - rbio->failb--; - } start_async_work(rbio, recover_rbio_work); } @@ -2179,7 +2055,6 @@ static int rmw_read_and_wait(struct btrfs_raid_bio *rbio) int ret; bio_list_init(&bio_list); - atomic_set(&rbio->error, 0); ret = rmw_assemble_read_bios(rbio, &bio_list); if (ret < 0) @@ -2200,10 +2075,8 @@ static void raid_wait_write_end_io(struct bio *bio) struct btrfs_raid_bio *rbio = bio->bi_private; blk_status_t err = bio->bi_status; - if (err) { - fail_bio_stripe(rbio, bio); + if (err) rbio_update_error_bitmap(rbio, bio); - } bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) wake_up(&rbio->io_wait); @@ -2253,19 +2126,14 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio) if (ret < 0) return ret; - atomic_set(&rbio->error, 0); index_rbio_pages(rbio); ret = rmw_read_and_wait(rbio); if (ret < 0) return ret; - /* Too many read errors, beyond our tolerance. */ - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - return ret; - - /* Have read failures but under tolerance, needs recovery. */ - if (rbio->faila >= 0 || rbio->failb >= 0) { + /* We have read errors, try recovery path. */ + if (!bitmap_empty(rbio->error_bitmap, rbio->nr_sectors)) { ret = recover_rbio(rbio); if (ret < 0) return ret; @@ -2280,7 +2148,6 @@ write: set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); spin_unlock_irq(&rbio->bio_list_lock); - atomic_set(&rbio->error, 0); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); index_rbio_pages(rbio); @@ -2309,9 +2176,16 @@ write: submit_write_bios(rbio, &bio_list); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); - /* We have more errors than our tolerance during the read. */ - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - ret = -EIO; + /* We may have more errors than our tolerance during the read. 
*/ + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + int found_errors; + + found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + break; + } + } return ret; } @@ -2492,7 +2366,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); } - atomic_set(&rbio->error, 0); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ @@ -2726,6 +2599,7 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) { bool need_check = false; struct bio_list bio_list; + int sector_nr; int ret; struct bio *bio; @@ -2735,7 +2609,6 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) if (ret) goto cleanup; - atomic_set(&rbio->error, 0); bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); ret = scrub_assemble_read_bios(rbio, &bio_list); @@ -2756,8 +2629,15 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio) */ ret = finish_parity_scrub(rbio, need_check); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - ret = -EIO; + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int found_errors; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + break; + } + } return ret; cleanup: @@ -2804,14 +2684,6 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) ASSERT(!bio->bi_iter.bi_size); set_rbio_range_error(rbio, bio); - rbio->faila = find_logical_bio_stripe(rbio, bio); - if (rbio->faila == -1) { - btrfs_warn_rl(fs_info, - "can not determine the failed stripe number for full stripe %llu", - bioc->raid_map[0]); - free_raid_bio(rbio); - return NULL; - } return rbio; } diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index e38da4fa76d6..a2e653e93fd8 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -74,12 +74,6 @@ struct btrfs_raid_bio { /* How many sectors there are for each stripe */ u8 stripe_nsectors; - /* First bad stripe, -1 means no corruption */ - s8 faila; - - /* Second bad stripe (for RAID6 use) */ - s8 failb; - /* Stripe number that we're scrubbing */ u8 scrubp; @@ -93,8 +87,6 @@ struct btrfs_raid_bio { atomic_t stripes_pending; - atomic_t error; - wait_queue_head_t io_wait; /* Bitmap to record which horizontal stripe has data */ From 3e09b5b2293f21e6e28929b6bbb73833678bfdd1 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 7 Nov 2022 17:30:21 +0100 Subject: [PATCH 207/249] btrfs: constify input buffer parameter in compression code The input buffers passed down to compression must never be changed, switch type to u8 as it's a raw byte buffer and use const. 
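A minimal standalone demonstration of what the constification buys;
demo_decompress() is a made-up name for the example, not a btrfs function.
The compiler now rejects any accidental store through the input pointer,
while reads are unaffected:

#include <stdint.h>
#include <string.h>

static int demo_decompress(const uint8_t *data_in, uint8_t *dest,
			   size_t srclen, size_t destlen)
{
	size_t n = srclen < destlen ? srclen : destlen;

	/* data_in[0] = 0; would no longer compile: read-only location */
	memcpy(dest, data_in, n);	/* reading the buffer is fine */
	return 0;
}

int main(void)
{
	const uint8_t src[4] = { 1, 2, 3, 4 };
	uint8_t dst[4];

	return demo_decompress(src, dst, sizeof(src), sizeof(dst));
}

The u8/const combination also documents at the prototype level that the
buffer is raw bytes rather than text, which is why the switch away from
unsigned char is bundled into the same change.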
Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/compression.c | 4 ++-- fs/btrfs/compression.h | 8 ++++---- fs/btrfs/lzo.c | 2 +- fs/btrfs/zlib.c | 2 +- fs/btrfs/zstd.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 42e6dde2ad59..30adf430241e 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -119,7 +119,7 @@ static int compression_decompress_bio(struct list_head *ws, } static int compression_decompress(int type, struct list_head *ws, - unsigned char *data_in, struct page *dest_page, + const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { switch (type) { @@ -1232,7 +1232,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb) * single page, and we want to read a single page out of it. * start_byte tells us the offset into the compressed data we're interested in */ -int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen) { struct list_head *workspace; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index b961462399ae..6209d40a1e08 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -86,7 +86,7 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); -int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page, +int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); int btrfs_decompress_buf2page(const char *buf, u32 buf_len, struct compressed_bio *cb, u32 decompressed); @@ -150,7 +150,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zlib_decompress(struct list_head *ws, unsigned char *data_in, +int zlib_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); struct list_head *zlib_alloc_workspace(unsigned int level); @@ -161,7 +161,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int lzo_decompress(struct list_head *ws, unsigned char *data_in, +int lzo_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); struct list_head *lzo_alloc_workspace(unsigned int level); @@ -171,7 +171,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, u64 start, struct page **pages, unsigned long *out_pages, unsigned long *total_in, unsigned long *total_out); int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zstd_decompress(struct list_head *ws, unsigned char *data_in, +int zstd_decompress(struct list_head *ws, const u8 *data_in, struct page *dest_page, unsigned long start_byte, size_t srclen, size_t destlen); void zstd_init_workspace_manager(void); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index e7b1ceffcd33..d5e78cbc8fbc 100644 
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -427,7 +427,7 @@ out:
 	return ret;
 }
 
-int lzo_decompress(struct list_head *ws, unsigned char *data_in,
+int lzo_decompress(struct list_head *ws, const u8 *data_in,
 		struct page *dest_page, unsigned long start_byte, size_t srclen,
 		size_t destlen)
 {
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index c5275cb23837..01a13de11832 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -355,7 +355,7 @@ done:
 	return ret;
 }
 
-int zlib_decompress(struct list_head *ws, unsigned char *data_in,
+int zlib_decompress(struct list_head *ws, const u8 *data_in,
 		struct page *dest_page, unsigned long start_byte, size_t srclen,
 		size_t destlen)
 {
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 4575b3703e74..e34f1ab99d56 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -616,7 +616,7 @@ done:
 	return ret;
 }
 
-int zstd_decompress(struct list_head *ws, unsigned char *data_in,
+int zstd_decompress(struct list_head *ws, const u8 *data_in,
 		struct page *dest_page, unsigned long start_byte, size_t srclen,
 		size_t destlen)
 {

From bb21e30260a672172a26ee1626dc1463215cf18c Mon Sep 17 00:00:00 2001
From: Anand Jain
Date: Mon, 7 Nov 2022 23:07:17 +0800
Subject: [PATCH 208/249] btrfs: move device->name RCU allocation and assign to
 btrfs_alloc_device()

There is a repeating code section in the parent functions after calling
btrfs_alloc_device(), as below:

	name = rcu_string_strdup(path, GFP_...);
	if (!name) {
		btrfs_free_device(device);
		return ERR_PTR(-ENOMEM);
	}
	rcu_assign_pointer(device->name, name);

Except in add_missing_dev(), for obvious reasons.

This patch consolidates that repeating code into btrfs_alloc_device()
itself, so that the parent functions don't have to duplicate it. The
consolidation also makes it easier to review issues regarding RCU lock
violations with device->name.

device_list_add() and add_missing_dev() use GFP_NOFS for the allocation,
whereas the rest of the parent functions use GFP_KERNEL, so bring in the
NOFS allocation context using memalloc_nofs_save() in device_list_add();
add_missing_dev() is already doing it.
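Condensed from the device_list_add() hunk that follows, the caller-side
pattern is a scoped NOFS section around the one consolidated call (a sketch
of the shape, not the full function):

	unsigned int nofs_flag;

	/*
	 * btrfs_alloc_device() now duplicates the name internally with
	 * GFP_KERNEL, so a caller that must not recurse into the
	 * filesystem brackets the call with a NOFS scope.
	 */
	nofs_flag = memalloc_nofs_save();
	device = btrfs_alloc_device(NULL, &devid,
				    disk_super->dev_item.uuid, path);
	memalloc_nofs_restore(nofs_flag);

The scoped-context API is generally preferred over threading a gfp_t
argument through every helper, which is what lets btrfs_alloc_device() stay
on plain GFP_KERNEL internally.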
Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 10 +------ fs/btrfs/volumes.c | 68 +++++++++++++++++++----------------------- fs/btrfs/volumes.h | 4 +-- 3 files changed, 34 insertions(+), 48 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 84af2010fae2..9c4a8649a0f4 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -249,7 +249,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; struct block_device *bdev; - struct rcu_string *name; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -293,19 +292,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, } - device = btrfs_alloc_device(NULL, &devid, NULL); + device = btrfs_alloc_device(NULL, &devid, NULL, device_path); if (IS_ERR(device)) { ret = PTR_ERR(device); goto error; } - name = rcu_string_strdup(device_path, GFP_KERNEL); - if (!name) { - btrfs_free_device(device); - ret = -ENOMEM; - goto error; - } - rcu_assign_pointer(device->name, name); ret = lookup_bdev(device_path, &device->devt); if (ret) goto error; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index acb97494bb52..49d6e79d4343 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -845,26 +845,23 @@ static noinline struct btrfs_device *device_list_add(const char *path, } if (!device) { + unsigned int nofs_flag; + if (fs_devices->opened) { mutex_unlock(&fs_devices->device_list_mutex); return ERR_PTR(-EBUSY); } + nofs_flag = memalloc_nofs_save(); device = btrfs_alloc_device(NULL, &devid, - disk_super->dev_item.uuid); + disk_super->dev_item.uuid, path); + memalloc_nofs_restore(nofs_flag); if (IS_ERR(device)) { mutex_unlock(&fs_devices->device_list_mutex); /* we can safely leave the fs_devices entry around */ return device; } - name = rcu_string_strdup(path, GFP_NOFS); - if (!name) { - btrfs_free_device(device); - mutex_unlock(&fs_devices->device_list_mutex); - return ERR_PTR(-ENOMEM); - } - rcu_assign_pointer(device->name, name); device->devt = path_devt; list_add_rcu(&device->dev_list, &fs_devices->devices); @@ -997,30 +994,22 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) fs_devices->total_devices = orig->total_devices; list_for_each_entry(orig_dev, &orig->devices, dev_list) { - struct rcu_string *name; + const char *dev_path = NULL; + + /* + * This is ok to do without RCU read locked because we hold the + * uuid mutex so nothing we touch in here is going to disappear. + */ + if (orig_dev->name) + dev_path = orig_dev->name->str; device = btrfs_alloc_device(NULL, &orig_dev->devid, - orig_dev->uuid); + orig_dev->uuid, dev_path); if (IS_ERR(device)) { ret = PTR_ERR(device); goto error; } - /* - * This is ok to do without rcu read locked because we hold the - * uuid mutex so nothing we touch in here is going to disappear. 
- */ - if (orig_dev->name) { - name = rcu_string_strdup(orig_dev->name->str, - GFP_KERNEL); - if (!name) { - btrfs_free_device(device); - ret = -ENOMEM; - goto error; - } - rcu_assign_pointer(device->name, name); - } - if (orig_dev->zone_info) { struct btrfs_zoned_device_info *zone_info; @@ -2604,7 +2593,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path struct btrfs_device *device; struct block_device *bdev; struct super_block *sb = fs_info->sb; - struct rcu_string *name; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_fs_devices *seed_devices; u64 orig_super_total_bytes; @@ -2645,20 +2633,13 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path } rcu_read_unlock(); - device = btrfs_alloc_device(fs_info, NULL, NULL); + device = btrfs_alloc_device(fs_info, NULL, NULL, device_path); if (IS_ERR(device)) { /* we can safely leave the fs_devices entry around */ ret = PTR_ERR(device); goto error; } - name = rcu_string_strdup(device_path, GFP_KERNEL); - if (!name) { - ret = -ENOMEM; - goto error_free_device; - } - rcu_assign_pointer(device->name, name); - device->fs_info = fs_info; device->bdev = bdev; ret = lookup_bdev(device_path, &device->devt); @@ -6998,8 +6979,9 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, * always do NOFS because we use it in a lot of other GFP_KERNEL safe * places. */ + nofs_flag = memalloc_nofs_save(); - device = btrfs_alloc_device(NULL, &devid, dev_uuid); + device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL); memalloc_nofs_restore(nofs_flag); if (IS_ERR(device)) return device; @@ -7023,14 +7005,15 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices, * is generated. * @uuid: a pointer to UUID for this device. If NULL a new UUID * is generated. + * @path: a pointer to device path if available, NULL otherwise. * * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR() * on error. Returned struct is not linked onto any lists and must be * destroyed with btrfs_free_device. 
*/ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, - const u64 *devid, - const u8 *uuid) + const u64 *devid, const u8 *uuid, + const char *path) { struct btrfs_device *dev; u64 tmp; @@ -7068,6 +7051,17 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, else generate_random_uuid(dev->uuid); + if (path) { + struct rcu_string *name; + + name = rcu_string_strdup(path, GFP_KERNEL); + if (!name) { + btrfs_free_device(dev); + return ERR_PTR(-ENOMEM); + } + rcu_assign_pointer(dev->name, name); + } + return dev; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index b05124e62412..f2a152937cd4 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -649,8 +649,8 @@ int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info, struct btrfs_dev_lookup_args *args, const char *path); struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, - const u64 *devid, - const u8 *uuid); + const u64 *devid, const u8 *uuid, + const char *path); void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args); void btrfs_free_device(struct btrfs_device *device); int btrfs_rm_device(struct btrfs_fs_info *fs_info, From 789d6a3a876e32c23fc9633d5b372d02a5188f0e Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 14 Sep 2022 13:32:50 +0800 Subject: [PATCH 209/249] btrfs: concentrate all tree block parentness check parameters into one structure There are several different tree block parentness check parameters used across several helpers: - level Mandatory - transid Under most cases it's mandatory, but there are several backref cases which skips this check. - owner_root - first_key Utilized by most top-down tree search routine. Otherwise can be skipped. Those four members are not always mandatory checks, and some of them are the same u64, which means if some arguments got swapped compiler will not catch it. Furthermore if we're going to further expand the parentness check, we need to modify quite some helpers just to add one more parameter. 
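The swapped-argument hazard is easy to reproduce with a standalone toy; the
struct below only models the idea, and read_block_v1()/read_block_v2() are
invented for the example:

#include <stdint.h>
#include <stdio.h>

struct parent_check {		/* toy stand-in, not the kernel struct */
	uint64_t owner_root;
	uint64_t transid;
	uint8_t level;
};

/* Positional u64s: swapping owner_root and transid still compiles. */
static void read_block_v1(uint64_t owner_root, uint64_t transid)
{
	printf("v1: owner=%llu transid=%llu\n",
	       (unsigned long long)owner_root,
	       (unsigned long long)transid);
}

/* Named fields: every value is labeled at the call site. */
static void read_block_v2(const struct parent_check *check)
{
	printf("v2: owner=%llu transid=%llu level=%d\n",
	       (unsigned long long)check->owner_root,
	       (unsigned long long)check->transid, check->level);
}

int main(void)
{
	read_block_v1(1024, 5);	/* arguments swapped -- no diagnostic */
	read_block_v2(&(struct parent_check){
		.owner_root = 5,
		.transid = 1024,
		.level = 1,
	});
	return 0;
}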
This patch will concentrate all these members into a structure called btrfs_tree_parent_check, and pass that structure for the following helpers: - btrfs_read_extent_buffer() - read_tree_block() Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/backref.c | 15 +++++++--- fs/btrfs/ctree.c | 28 +++++++++++------- fs/btrfs/disk-io.c | 63 +++++++++++++++++++++++------------------ fs/btrfs/disk-io.h | 36 ++++++++++++++++++++--- fs/btrfs/extent-tree.c | 12 +++++--- fs/btrfs/print-tree.c | 14 +++++---- fs/btrfs/qgroup.c | 18 +++++++++--- fs/btrfs/relocation.c | 11 +++++-- fs/btrfs/tree-log.c | 25 +++++++++++----- fs/btrfs/tree-mod-log.c | 9 ++++-- 10 files changed, 159 insertions(+), 72 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 430974cf3b96..55c072ba6747 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -840,6 +840,8 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, struct rb_node *node; while ((node = rb_first_cached(&tree->root))) { + struct btrfs_tree_parent_check check = { 0 }; + ref = rb_entry(node, struct prelim_ref, rbnode); rb_erase_cached(node, &tree->root); @@ -847,8 +849,10 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(ref->key_for_search.type); BUG_ON(!ref->wanted_disk_byte); - eb = read_tree_block(fs_info, ref->wanted_disk_byte, - ref->root_id, 0, ref->level - 1, NULL); + check.level = ref->level - 1; + check.owner_root = ref->root_id; + + eb = read_tree_block(fs_info, ref->wanted_disk_byte, &check); if (IS_ERR(eb)) { free_pref(ref); return PTR_ERR(eb); @@ -1591,10 +1595,13 @@ again: if (ref->count && ref->parent) { if (!ctx->ignore_extent_item_pos && !ref->inode_list && ref->level == 0) { + struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; - eb = read_tree_block(ctx->fs_info, ref->parent, 0, - 0, ref->level, NULL); + check.level = ref->level; + + eb = read_tree_block(ctx->fs_info, ref->parent, + &check); if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0acd85111cdf..f75e398d7b71 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -857,19 +857,22 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, int slot) { int level = btrfs_header_level(parent); + struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; - struct btrfs_key first_key; if (slot < 0 || slot >= btrfs_header_nritems(parent)) return ERR_PTR(-ENOENT); BUG_ON(level == 0); - btrfs_node_key_to_cpu(parent, &first_key, slot); + check.level = level - 1; + check.transid = btrfs_node_ptr_generation(parent, slot); + check.owner_root = btrfs_header_owner(parent); + check.has_first_key = true; + btrfs_node_key_to_cpu(parent, &check.first_key, slot); + eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), - btrfs_header_owner(parent), - btrfs_node_ptr_generation(parent, slot), - level - 1, &first_key); + &check); if (IS_ERR(eb)) return eb; if (!extent_buffer_uptodate(eb)) { @@ -1428,10 +1431,10 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, const struct btrfs_key *key) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_tree_parent_check check = { 0 }; u64 blocknr; u64 gen; struct extent_buffer *tmp; - struct btrfs_key first_key; int ret; int parent_level; bool unlock_up; @@ -1440,7 +1443,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, blocknr = btrfs_node_blockptr(*eb_ret, slot); gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = 
btrfs_header_level(*eb_ret); - btrfs_node_key_to_cpu(*eb_ret, &first_key, slot); + btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot); + check.has_first_key = true; + check.level = parent_level - 1; + check.transid = gen; + check.owner_root = root->root_key.objectid; /* * If we need to read an extent buffer from disk and we are holding locks @@ -1462,7 +1469,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, * parents (shared tree blocks). */ if (btrfs_verify_level_key(tmp, - parent_level - 1, &first_key, gen)) { + parent_level - 1, &check.first_key, gen)) { free_extent_buffer(tmp); return -EUCLEAN; } @@ -1479,7 +1486,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, btrfs_unlock_up_safe(p, level + 1); /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key); + ret = btrfs_read_extent_buffer(tmp, &check); if (ret) { free_extent_buffer(tmp); btrfs_release_path(p); @@ -1509,8 +1516,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); - tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, - gen, parent_level - 1, &first_key); + tmp = read_tree_block(fs_info, blocknr, &check); if (IS_ERR(tmp)) { btrfs_release_path(p); return PTR_ERR(tmp); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e602e0b4c25d..2f944a7c70d5 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -259,13 +259,11 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. * - * @parent_transid: expected transid, skip check if 0 - * @level: expected level, mandatory check - * @first_key: expected key of first slot, skip check if NULL + * @check: expected tree parentness check, see the comments of the + * structure for details. */ int btrfs_read_extent_buffer(struct extent_buffer *eb, - u64 parent_transid, int level, - struct btrfs_key *first_key) + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; @@ -275,16 +273,19 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, int mirror_num = 0; int failed_mirror = 0; + ASSERT(check); + io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num); if (!ret) { - if (verify_parent_transid(io_tree, eb, - parent_transid, 0)) + if (verify_parent_transid(io_tree, eb, check->transid, 0)) ret = -EIO; - else if (btrfs_verify_level_key(eb, level, - first_key, parent_transid)) + else if (btrfs_verify_level_key(eb, check->level, + check->has_first_key ? + &check->first_key : NULL, + check->transid)) ret = -EUCLEAN; else break; @@ -936,28 +937,28 @@ struct extent_buffer *btrfs_find_create_tree_block( * Read tree block at logical address @bytenr and do variant basic but critical * verification. * - * @owner_root: the objectid of the root owner for this block. - * @parent_transid: expected transid of this tree block, skip check if 0 - * @level: expected level, mandatory check - * @first_key: expected key in slot 0, skip check if NULL + * @check: expected tree parentness check, see comments of the + * structure for details. 
*/ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 owner_root, u64 parent_transid, - int level, struct btrfs_key *first_key) + struct btrfs_tree_parent_check *check) { struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level); + ASSERT(check); + + buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root, + check->level); if (IS_ERR(buf)) return buf; - ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key); + ret = btrfs_read_extent_buffer(buf, check); if (ret) { free_extent_buffer_stale(buf); return ERR_PTR(ret); } - if (btrfs_check_eb_owner(buf, owner_root)) { + if (btrfs_check_eb_owner(buf, check->owner_root)) { free_extent_buffer_stale(buf); return ERR_PTR(-EUCLEAN); } @@ -1373,6 +1374,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, struct btrfs_key *key) { struct btrfs_root *root; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_fs_info *fs_info = tree_root->fs_info; u64 generation; int ret; @@ -1392,9 +1394,11 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); level = btrfs_root_level(&root->root_item); - root->node = read_tree_block(fs_info, - btrfs_root_bytenr(&root->root_item), - key->objectid, generation, level, NULL); + check.level = level; + check.transid = generation; + check.owner_root = key->objectid; + root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item), + &check); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; @@ -2367,6 +2371,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices) { int ret; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_root *log_tree_root; struct btrfs_super_block *disk_super = fs_info->super_copy; u64 bytenr = btrfs_super_log_root(disk_super); @@ -2382,10 +2387,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, if (!log_tree_root) return -ENOMEM; - log_tree_root->node = read_tree_block(fs_info, bytenr, - BTRFS_TREE_LOG_OBJECTID, - fs_info->generation + 1, level, - NULL); + check.level = level; + check.transid = fs_info->generation + 1; + check.owner_root = BTRFS_TREE_LOG_OBJECTID; + log_tree_root->node = read_tree_block(fs_info, bytenr, &check); if (IS_ERR(log_tree_root->node)) { btrfs_warn(fs_info, "failed to read log tree"); ret = PTR_ERR(log_tree_root->node); @@ -2863,10 +2868,14 @@ out: static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level) { + struct btrfs_tree_parent_check check = { + .level = level, + .transid = gen, + .owner_root = root->root_key.objectid + }; int ret = 0; - root->node = read_tree_block(root->fs_info, bytenr, - root->root_key.objectid, gen, level, NULL); + root->node = read_tree_block(root->fs_info, bytenr, &check); if (IS_ERR(root->node)) { ret = PTR_ERR(root->node); root->node = NULL; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index f2c507fd0e04..03fe4154ffb8 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,6 +25,35 @@ static inline u64 btrfs_sb_offset(int mirror) return BTRFS_SUPER_INFO_OFFSET; } +/* All the extra info needed to verify the parentness of a tree block. */ +struct btrfs_tree_parent_check { + /* + * The owner check against the tree block. + * + * Can be 0 to skip the owner check. 
+ */ + u64 owner_root; + + /* + * Expected transid, can be 0 to skip the check, but such skip + * should only be utlized for backref walk related code. + */ + u64 transid; + + /* + * The expected first key. + * + * This check can be skipped if @has_first_key is false, such skip + * can happen for case where we don't have the parent node key, + * e.g. reading the tree root, doing backref walk. + */ + struct btrfs_key first_key; + bool has_first_key; + + /* The expected level. Should always be set. */ + u8 level; +}; + struct btrfs_device; struct btrfs_fs_devices; @@ -33,8 +62,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info); int btrfs_verify_level_key(struct extent_buffer *eb, int level, struct btrfs_key *first_key, u64 parent_transid); struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, - u64 owner_root, u64 parent_transid, - int level, struct btrfs_key *first_key); + struct btrfs_tree_parent_check *check); struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, @@ -111,8 +139,8 @@ void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); -int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, - int level, struct btrfs_key *first_key); +int btrfs_read_extent_buffer(struct extent_buffer *buf, + struct btrfs_tree_parent_check *check); enum btrfs_wq_submit_cmd { WQ_SUBMIT_METADATA, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b037107678c8..10cc757a602b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5261,8 +5261,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 parent; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_key key; - struct btrfs_key first_key; struct btrfs_ref ref = { 0 }; struct extent_buffer *next; int level = wc->level; @@ -5284,7 +5284,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - btrfs_node_key_to_cpu(path->nodes[level], &first_key, + + check.level = level - 1; + check.transid = generation; + check.owner_root = root->root_key.objectid; + check.has_first_key = true; + btrfs_node_key_to_cpu(path->nodes[level], &check.first_key, path->slots[level]); next = find_extent_buffer(fs_info, bytenr); @@ -5346,8 +5351,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(fs_info, bytenr, root->root_key.objectid, - generation, level - 1, &first_key); + next = read_tree_block(fs_info, bytenr, &check); if (IS_ERR(next)) { return PTR_ERR(next); } else if (!extent_buffer_uptodate(next)) { diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 1a2350fd68be..1469aa55ad48 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -386,14 +386,16 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow) if (!follow) return; for (i = 0; i < nr; i++) { - struct btrfs_key first_key; + struct btrfs_tree_parent_check check = { + .level = level - 1, + .transid = btrfs_node_ptr_generation(c, i), + .owner_root = btrfs_header_owner(c), + .has_first_key = true + }; struct extent_buffer *next; - btrfs_node_key_to_cpu(c, &first_key, i); - next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), - 
btrfs_header_owner(c), - btrfs_node_ptr_generation(c, i), - level - 1, &first_key); + btrfs_node_key_to_cpu(c, &check.first_key, i); + next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), &check); if (IS_ERR(next)) continue; if (!extent_buffer_uptodate(next)) { diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 24c013c61a94..e0522c6c0d67 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2339,7 +2339,13 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, } if (!extent_buffer_uptodate(root_eb)) { - ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL); + struct btrfs_tree_parent_check check = { + .has_first_key = false, + .transid = root_gen, + .level = root_level + }; + + ret = btrfs_read_extent_buffer(root_eb, &check); if (ret) goto out; } @@ -4303,6 +4309,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct extent_buffer *subvol_eb) { struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_tree_parent_check check = { 0 }; struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks; struct btrfs_qgroup_swapped_block *block; struct extent_buffer *reloc_eb = NULL; @@ -4351,10 +4358,13 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, blocks->swapped = swapped; spin_unlock(&blocks->lock); + check.level = block->level; + check.transid = block->reloc_generation; + check.has_first_key = true; + memcpy(&check.first_key, &block->first_key, sizeof(check.first_key)); + /* Read out reloc subtree root */ - reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0, - block->reloc_generation, block->level, - &block->first_key); + reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check); if (IS_ERR(reloc_eb)) { ret = PTR_ERR(reloc_eb); reloc_eb = NULL; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8914aa920bb7..56c8afa6f6d2 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2610,10 +2610,14 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc) static int get_tree_block_key(struct btrfs_fs_info *fs_info, struct tree_block *block) { + struct btrfs_tree_parent_check check = { + .level = block->level, + .owner_root = block->owner, + .transid = block->key.offset + }; struct extent_buffer *eb; - eb = read_tree_block(fs_info, block->bytenr, block->owner, - block->key.offset, block->level, NULL); + eb = read_tree_block(fs_info, block->bytenr, &check); if (IS_ERR(eb)) return PTR_ERR(eb); if (!extent_buffer_uptodate(eb)) { @@ -3426,9 +3430,10 @@ int add_data_references(struct reloc_control *rc, ULIST_ITER_INIT(&leaf_uiter); while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) { + struct btrfs_tree_parent_check check = { 0 }; struct extent_buffer *eb; - eb = read_tree_block(ctx.fs_info, ref_node->val, 0, 0, 0, NULL); + eb = read_tree_block(ctx.fs_info, ref_node->val, &check); if (IS_ERR(eb)) { ret = PTR_ERR(eb); break; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f7b1bb9c63e4..4d9f6803bfbe 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -341,7 +341,12 @@ static int process_one_buffer(struct btrfs_root *log, * pin down any logged extents, so we have to read the block. 
*/ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { - ret = btrfs_read_extent_buffer(eb, gen, level, NULL); + struct btrfs_tree_parent_check check = { + .level = level, + .transid = gen + }; + + ret = btrfs_read_extent_buffer(eb, &check); if (ret) return ret; } @@ -2411,13 +2416,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, struct walk_control *wc, u64 gen, int level) { int nritems; + struct btrfs_tree_parent_check check = { + .transid = gen, + .level = level + }; struct btrfs_path *path; struct btrfs_root *root = wc->replay_dest; struct btrfs_key key; int i; int ret; - ret = btrfs_read_extent_buffer(eb, gen, level, NULL); + ret = btrfs_read_extent_buffer(eb, &check); if (ret) return ret; @@ -2597,7 +2606,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, int ret = 0; while (*level > 0) { - struct btrfs_key first_key; + struct btrfs_tree_parent_check check = { 0 }; cur = path->nodes[*level]; @@ -2609,7 +2618,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); - btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]); + check.transid = ptr_gen; + check.level = *level - 1; + check.has_first_key = true; + btrfs_node_key_to_cpu(cur, &check.first_key, path->slots[*level]); blocksize = fs_info->nodesize; next = btrfs_find_create_tree_block(fs_info, bytenr, @@ -2628,8 +2640,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level]++; if (wc->free) { - ret = btrfs_read_extent_buffer(next, ptr_gen, - *level - 1, &first_key); + ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); return ret; @@ -2657,7 +2668,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); continue; } - ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); + ret = btrfs_read_extent_buffer(next, &check); if (ret) { free_extent_buffer(next); return ret; diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 3bea19698a10..779ad44d285f 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -821,10 +821,15 @@ struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq) tm = tree_mod_log_search(fs_info, logical, time_seq); if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + struct btrfs_tree_parent_check check = { 0 }; + btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - old = read_tree_block(fs_info, logical, root->root_key.objectid, - 0, level, NULL); + + check.level = level; + check.owner_root = root->root_key.objectid; + + old = read_tree_block(fs_info, logical, &check); if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { if (!IS_ERR(old)) free_extent_buffer(old); From 947a629988f191807d2d22ba63ae18259bb645c5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 14 Sep 2022 13:32:51 +0800 Subject: [PATCH 210/249] btrfs: move tree block parentness check into validate_extent_buffer() [BACKGROUND] Although both btrfs metadata and data has their read time verification done at endio time (btrfs_validate_metadata_buffer() and btrfs_verify_data_csum()), metadata has extra verification, mostly parentness check including first key/transid/owner_root/level, done at read_tree_block() and btrfs_read_extent_buffer(). On the other hand, all the data verification is done at endio context. 
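For orientation before the enhancement, the parentness checks in question
can be modeled in a few lines of standalone C. This is a toy: plain return
codes stand in for -EIO/-EUCLEAN, and a single objectid stands in for the
full first-key comparison:

#include <stdint.h>
#include <stdio.h>

struct tree_check {			/* models the check parameters */
	uint64_t owner_root;		/* 0 = skip the owner check */
	uint64_t transid;		/* 0 = skip the transid check */
	uint64_t first_objectid;	/* used only if has_first_key */
	int has_first_key;
	uint8_t level;			/* always checked */
};

struct block_hdr {			/* stand-in for the eb header */
	uint64_t generation;
	uint64_t owner;
	uint64_t first_objectid;
	uint8_t level;
};

static int validate(const struct block_hdr *hdr, const struct tree_check *c)
{
	if (hdr->level != c->level)
		return -1;				/* like -EIO */
	if (c->transid && hdr->generation != c->transid)
		return -1;				/* like -EIO */
	if (c->has_first_key && hdr->first_objectid != c->first_objectid)
		return -2;				/* like -EUCLEAN */
	if (c->owner_root && hdr->owner != c->owner_root)
		return -2;				/* like -EUCLEAN */
	return 0;
}

int main(void)
{
	const struct block_hdr hdr = {
		.generation = 100, .owner = 5, .first_objectid = 256, .level = 1
	};
	struct tree_check check = {
		.owner_root = 5, .transid = 100, .first_objectid = 256,
		.has_first_key = 1, .level = 1
	};

	printf("good block: %d\n", validate(&hdr, &check));
	check.transid = 99;	/* stale parent pointer */
	printf("stale transid: %d\n", validate(&hdr, &check));
	return 0;
}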
[ENHANCEMENT] This patch will make a new union in btrfs_bio, taking the space of the old data checksums, thus it will not increase the memory usage. With that extra btrfs_tree_parent_check inside btrfs_bio, we can just pass the check parameter into read_extent_buffer_pages(), and before submitting the bio, we can copy the check structure into btrfs_bio. And finally at endio time, we can grab btrfs_bio::parent_check and pass it to validate_extent_buffer(), to move the remaining checks into it. This brings the following benefits: - Much simpler btrfs_read_extent_buffer() Now it only needs to iterate through all mirrors. - Simpler read-time transid check Previously we go verify_parent_transid() after reading out the extent buffer. Now the transid check is done inside the endio function, no other code can modify the content. Thus no need to use the extent lock anymore. Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 73 ++++++++++++++++++++++++++++++++------------ fs/btrfs/extent_io.c | 18 ++++++++--- fs/btrfs/extent_io.h | 5 +-- fs/btrfs/volumes.h | 25 +++++++++++++-- 4 files changed, 93 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2f944a7c70d5..559e7f3d727e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -266,7 +266,6 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct extent_io_tree *io_tree; int failed = 0; int ret; int num_copies = 0; @@ -275,21 +274,11 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, ASSERT(check); - io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; while (1) { clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); - ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num); - if (!ret) { - if (verify_parent_transid(io_tree, eb, check->transid, 0)) - ret = -EIO; - else if (btrfs_verify_level_key(eb, check->level, - check->has_first_key ? 
- &check->first_key : NULL, - check->transid)) - ret = -EUCLEAN; - else - break; - } + ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check); + if (!ret) + break; num_copies = btrfs_num_copies(fs_info, eb->start, eb->len); @@ -465,7 +454,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb) } /* Do basic extent buffer checks at read time */ -static int validate_extent_buffer(struct extent_buffer *eb) +static int validate_extent_buffer(struct extent_buffer *eb, + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; u64 found_start; @@ -475,6 +465,8 @@ static int validate_extent_buffer(struct extent_buffer *eb) const u8 *header_csum; int ret = 0; + ASSERT(check); + found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { btrfs_err_rl(fs_info, @@ -513,6 +505,45 @@ static int validate_extent_buffer(struct extent_buffer *eb) goto out; } + if (found_level != check->level) { + ret = -EIO; + goto out; + } + if (unlikely(check->transid && + btrfs_header_generation(eb) != check->transid)) { + btrfs_err_rl(eb->fs_info, +"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", + eb->start, eb->read_mirror, check->transid, + btrfs_header_generation(eb)); + ret = -EIO; + goto out; + } + if (check->has_first_key) { + struct btrfs_key *expect_key = &check->first_key; + struct btrfs_key found_key; + + if (found_level) + btrfs_node_key_to_cpu(eb, &found_key, 0); + else + btrfs_item_key_to_cpu(eb, &found_key, 0); + if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) { + btrfs_err(fs_info, +"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", + eb->start, check->transid, + expect_key->objectid, + expect_key->type, expect_key->offset, + found_key.objectid, found_key.type, + found_key.offset); + ret = -EUCLEAN; + goto out; + } + } + if (check->owner_root) { + ret = btrfs_check_eb_owner(eb, check->owner_root); + if (ret < 0) + goto out; + } + /* * If this is a leaf block and it is corrupt, set the corrupt bit so * that we don't try and read the other copies of this block, just @@ -537,13 +568,15 @@ out: } static int validate_subpage_buffer(struct page *page, u64 start, u64 end, - int mirror) + int mirror, struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); struct extent_buffer *eb; bool reads_done; int ret = 0; + ASSERT(check); + /* * We don't allow bio merge for subpage metadata read, so we should * only get one eb for each endio hook. 
@@ -567,7 +600,7 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end, ret = -EIO; goto err; } - ret = validate_extent_buffer(eb); + ret = validate_extent_buffer(eb, check); if (ret < 0) goto err; @@ -597,7 +630,8 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, ASSERT(page->private); if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) - return validate_subpage_buffer(page, start, end, mirror); + return validate_subpage_buffer(page, start, end, mirror, + &bbio->parent_check); eb = (struct extent_buffer *)page->private; @@ -616,7 +650,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, ret = -EIO; goto err; } - ret = validate_extent_buffer(eb); + ret = validate_extent_buffer(eb, &bbio->parent_check); err: if (ret) { /* @@ -774,6 +808,7 @@ void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int m blk_status_t ret; bio->bi_opf |= REQ_META; + bbio->is_metadata = 1; if (btrfs_op(bio) != BTRFS_MAP_WRITE) { btrfs_submit_bio(fs_info, bio, mirror_num); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 859a41624c31..d257879edaba 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4934,7 +4934,8 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) } static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, - int mirror_num) + int mirror_num, + struct btrfs_tree_parent_check *check) { struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; @@ -4947,6 +4948,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); ASSERT(PagePrivate(page)); + ASSERT(check); io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree; if (wait == WAIT_NONE) { @@ -4990,6 +4992,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, */ atomic_dec(&eb->io_pages); } + memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check)); submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) { free_extent_state(cached_state); @@ -5003,7 +5006,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, return ret; } -int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) +int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, + struct btrfs_tree_parent_check *check) { int i; struct page *page; @@ -5029,7 +5033,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) return -EIO; if (eb->fs_info->nodesize < PAGE_SIZE) - return read_extent_buffer_subpage(eb, wait, mirror_num); + return read_extent_buffer_subpage(eb, wait, mirror_num, check); num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { @@ -5106,6 +5110,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } } + memcpy(&btrfs_bio(bio_ctrl.bio)->parent_check, check, sizeof(*check)); submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) @@ -5841,6 +5846,11 @@ int try_release_extent_buffer(struct page *page) void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level) { + struct btrfs_tree_parent_check check = { + .has_first_key = 0, + .level = level, + .transid = gen + }; struct extent_buffer *eb; int ret; @@ -5853,7 +5863,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, return; } - ret = read_extent_buffer_pages(eb, WAIT_NONE, 0); + ret = read_extent_buffer_pages(eb, WAIT_NONE, 0, &check); if (ret < 0) 
free_extent_buffer_stale(eb);
 else
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 805e262811b4..a0bafc7f6c07 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -66,6 +66,7 @@ struct btrfs_inode;
 struct btrfs_fs_info;
 struct io_failure_record;
 struct extent_io_tree;
+struct btrfs_tree_parent_check;
 int __init extent_buffer_init_cachep(void);
 void __cold extent_buffer_free_cachep(void);
@@ -170,8 +171,8 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
 #define WAIT_NONE 0
 #define WAIT_COMPLETE 1
 #define WAIT_PAGE_LOCK 2
-int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
- int mirror_num);
+int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
+ struct btrfs_tree_parent_check *parent_check);
 void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
 void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
 u64 bytenr, u64 owner_root,
 u64 gen, int level);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index f2a152937cd4..efa6a3d48cd8 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -11,6 +11,7 @@
 #include
 #include "async-thread.h"
 #include "messages.h"
+#include "disk-io.h"
 #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G)
@@ -397,7 +398,15 @@ typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
 * Mostly for btrfs specific features like csum and mirror_num.
 */
 struct btrfs_bio {
- unsigned int mirror_num;
+ unsigned int mirror_num:7;
+
+ /*
+ * Extra indicator for metadata bios.
+ * For some btrfs bios they use pages without a mapping, thus
+ * we can not rely on page->mapping->host to determine if
+ * it's a metadata bio.
+ */
+ unsigned int is_metadata:1;
 struct bvec_iter iter;
 /* for direct I/O */
@@ -405,8 +414,16 @@ struct btrfs_bio {
 /* @device is for stripe IO submission. */
 struct btrfs_device *device;
- u8 *csum;
- u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
+ union {
+ /* For data checksum verification. */
+ struct {
+ u8 *csum;
+ u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
+ };
+
+ /* For metadata parentness verification. */
+ struct btrfs_tree_parent_check parent_check;
+ };
 /* End I/O information supplied to btrfs_bio_alloc */
 btrfs_bio_end_io_t end_io;
@@ -443,6 +460,8 @@ static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
 static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
 {
+ if (bbio->is_metadata)
+ return;
 if (bbio->csum != bbio->csum_inline) {
 kfree(bbio->csum);
 bbio->csum = NULL;

From 2c8f5e8cdf0f77670b1a9f72156ad4e82ed323d1 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 11 Nov 2022 11:50:27 +0000
Subject: [PATCH 211/249] btrfs: remove leftover setting of EXTENT_UPTODATE
 state in an inode's io_tree

We don't need to set the EXTENT_UPTODATE bit in an inode's io_tree to mark
a range as uptodate, we rely on the pages themselves being uptodate - page
reading is not triggered for already uptodate pages. Recently we removed
most uses of EXTENT_UPTODATE for buffered IO with commit 52b029f42751
("btrfs: remove unnecessary EXTENT_UPTODATE state in buffered I/O path"),
but there were a few leftovers, namely when reading from holes and
successfully finishing read repair.

These leftovers are unnecessarily making an inode's tree larger and deeper,
slowing down searches on it. So remove all the leftovers.

This change is part of a patchset that has the goal to make performance
better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to
iterate over the extents of a file.
Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option). A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 7 ------- fs/btrfs/extent_io.c | 19 +++---------------- 2 files changed, 3 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index cdee8c0854c8..18ab82f62611 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -216,13 +216,6 @@ static inline int set_extent_new(struct extent_io_tree *tree, u64 start, return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS); } -static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, - cached_state, mask); -} - int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits, struct extent_state **cached_state); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d257879edaba..33cc3a9a2b68 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -901,14 +901,9 @@ static void end_sector_io(struct page *page, u64 offset, bool uptodate) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); const u32 sectorsize = inode->root->fs_info->sectorsize; - struct extent_state *cached = NULL; end_page_read(page, uptodate, offset, sectorsize); - if (uptodate) - set_extent_uptodate(&inode->io_tree, offset, - offset + sectorsize - 1, &cached, GFP_NOFS); - unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, - &cached); + unlock_extent(&inode->io_tree, offset, offset + sectorsize - 1, NULL); } static void submit_data_read_repair(struct inode *inode, @@ -1781,13 +1776,9 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - struct extent_state *cached = NULL; - iosize = PAGE_SIZE - pg_offset; memzero_page(page, pg_offset, iosize); - set_extent_uptodate(tree, cur, cur + iosize - 1, - &cached, GFP_NOFS); - unlock_extent(tree, cur, cur + iosize - 1, &cached); + unlock_extent(tree, cur, cur + iosize - 1, NULL); end_page_read(page, true, cur, iosize); break; } @@ -1863,13 +1854,9 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - struct extent_state *cached = NULL; - memzero_page(page, pg_offset, iosize); - set_extent_uptodate(tree, cur, cur + iosize - 1, - &cached, 
GFP_NOFS);
- unlock_extent(tree, cur, cur + iosize - 1, &cached);
+ unlock_extent(tree, cur, cur + iosize - 1, NULL);
 end_page_read(page, true, cur, iosize);
 cur = cur + iosize;
 pg_offset += iosize;

From 40daf3e095dbd1059d7e600d27e5bb987f563561 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 11 Nov 2022 11:50:28 +0000
Subject: [PATCH 212/249] btrfs: add an early exit when searching for delalloc
 range for lseek/fiemap

During fiemap and lseek (SEEK_HOLE/DATA), when looking for delalloc in a
range corresponding to a hole or a prealloc extent, if we find the whole
range marked as delalloc in the inode's io_tree, then we can terminate
immediately and avoid searching the extent map tree. If not, and if the
found delalloc starts at the same offset as our search start but ends
before our search range's end, then we can adjust the search range for
the search in the extent map tree. So implement those changes.

This change is part of a patchset that has the goal to make performance
better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to
iterate over the extents of a file. Two examples are the cp program from
coreutils 9.0+ and the tar program (when using its --sparse / -S option).
A sample test and results are listed in the changelog of the last patch
in the series:

 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree
 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap
 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap
 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap
 5/9 btrfs: remove no longer used btrfs_next_extent_map()
 6/9 btrfs: allow passing a cached state record to count_range_bits()
 7/9 btrfs: update stale comment for count_range_bits()
 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap
 9/9 btrfs: use cached state when looking for delalloc ranges with lseek

Reported-by: Wang Yugui
Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/
Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/
Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/file.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e6c93be91a06..9b1f76109682 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3216,7 +3216,7 @@ out:
 static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
 u64 *delalloc_start_ret, u64 *delalloc_end_ret)
 {
- const u64 len = end + 1 - start;
+ u64 len = end + 1 - start;
 struct extent_map_tree *em_tree = &inode->extent_tree;
 struct extent_map *em;
 u64 em_end;
@@ -3242,13 +3242,23 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
 delalloc_len = 0;
 }
- /*
- * If delalloc was found then *delalloc_start_ret has a sector size
- * aligned value (rounded down).
- */
- if (delalloc_len > 0)
+ if (delalloc_len > 0) {
+ /*
+ * If delalloc was found then *delalloc_start_ret has a sector size
+ * aligned value (rounded down).
+ */
 *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
+ if (*delalloc_start_ret == start) {
+ /* Delalloc for the whole range, nothing more to do. */
+ if (*delalloc_end_ret == end)
+ return true;
+ /* Else trim our search range for extent maps.
*/
+ start = *delalloc_end_ret + 1;
+ len = end + 1 - start;
+ }
+ }
+
 /*
 * No outstanding extents means we don't have any delalloc that is
 * flushing, so return the unflushed range found in the io tree (if any).

From af979fd618a40009c5cae03de21403848b13a931 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 11 Nov 2022 11:50:29 +0000
Subject: [PATCH 213/249] btrfs: skip unnecessary delalloc searches during
 lseek/fiemap

During lseek (SEEK_HOLE/DATA) and fiemap, when processing a file range
that corresponds to a hole or a prealloc extent, if we find that there is
no delalloc marked in the inode's io_tree but there is delalloc due to
an extent map in the extent map tree, then on the next iteration that
calls find_delalloc_subrange() we can skip searching the io tree again,
since on the first call we had no delalloc in the io tree for the whole
range.

This change is part of a patchset that has the goal to make performance
better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to
iterate over the extents of a file. Two examples are the cp program from
coreutils 9.0+ and the tar program (when using its --sparse / -S option).
A sample test and results are listed in the changelog of the last patch
in the series:

 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree
 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap
 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap
 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap
 5/9 btrfs: remove no longer used btrfs_next_extent_map()
 6/9 btrfs: allow passing a cached state record to count_range_bits()
 7/9 btrfs: update stale comment for count_range_bits()
 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap
 9/9 btrfs: use cached state when looking for delalloc ranges with lseek

Reported-by: Wang Yugui
Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/
Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/
Signed-off-by: Filipe Manana
Signed-off-by: David Sterba
---
 fs/btrfs/file.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9b1f76109682..99cc95487d42 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3214,6 +3214,7 @@ out:
 * looping while it gets adjacent subranges, and merging them together.
 */
 static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
+ bool *search_io_tree,
 u64 *delalloc_start_ret, u64 *delalloc_end_ret)
 {
 u64 len = end + 1 - start;
@@ -3231,7 +3232,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
 spin_lock(&inode->lock);
 outstanding_extents = inode->outstanding_extents;
- if (inode->delalloc_bytes > 0) {
+ if (*search_io_tree && inode->delalloc_bytes > 0) {
 spin_unlock(&inode->lock);
 *delalloc_start_ret = start;
 delalloc_len = count_range_bits(&inode->io_tree,
@@ -3257,6 +3258,9 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
 start = *delalloc_end_ret + 1;
 len = end + 1 - start;
 }
+ } else {
+ /* No delalloc, future calls don't need to search again.
*/ + *search_io_tree = false; } /* @@ -3390,6 +3394,7 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, { u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize); u64 prev_delalloc_end = 0; + bool search_io_tree = true; bool ret = false; while (cur_offset < end) { @@ -3398,6 +3403,7 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, bool delalloc; delalloc = find_delalloc_subrange(inode, cur_offset, end, + &search_io_tree, &delalloc_start, &delalloc_end); if (!delalloc) From 8ddc8274e4be9a35eafc5bef9ff64d94f452d193 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:30 +0000 Subject: [PATCH 214/249] btrfs: search for delalloc more efficiently during lseek/fiemap During lseek (SEEK_HOLE/DATA) and fiemap, when processing a file range that corresponds to a hole or a prealloc extent, we have to check if there's any delalloc in the range. We do it by searching for delalloc ranges in the inode's io_tree (for unflushed delalloc) and in the inode's extent map tree (for delalloc that is flushing). We avoid searching the extent map tree if the number of outstanding extents is 0, as in that case we can't have extent maps for our search range in the tree that correspond to delalloc that is flushing. However if we have any unflushed delalloc, due to buffered writes or mmap writes, then the outstanding extents counter is not 0 and we'll search the extent map tree. The tree may be large because it can have lots of extent maps that were loaded by reads or created by previous writes, therefore taking a significant time to search the tree, specially if have a file with a lot of holes and/or prealloc extents. We can improve on this by instead of searching the extent map tree, searching the ordered extents tree of the inode, since when delalloc is flushing we create an ordered extent along with the new extent map, while holding the respective file range locked in the inode's io_tree. The ordered extents tree is typically much smaller, since ordered extents have a short life and get removed from the tree once they are completed, while extent maps can stay for a very long time in the extent map tree, either created by previous writes or loaded by read operations. So use the ordered extents tree instead of the extent maps tree. This change is part of a patchset that has the goal to make performance better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to iterate over the extents of a file. Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option). 
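In kernel terms, the lookup this patch switches to has roughly the
following shape (a condensed, illustrative sketch of the change below;
locking and the merge with the io_tree result are omitted):

  struct btrfs_ordered_extent *oe;

  /* Find the first ordered extent overlapping [start, start + len). */
  oe = btrfs_lookup_first_ordered_range(inode, start, len);
  if (oe) {
          /* The ordered extent may span beyond the search range, clamp it. */
          u64 oe_start = max(oe->file_offset, start);
          u64 oe_end = min(oe->file_offset + oe->num_bytes - 1, end);

          /* ... report [oe_start, oe_end] as delalloc that is flushing ... */
          btrfs_put_ordered_extent(oe);   /* drop the lookup's reference */
  }

Ordered extents are reference counted, so the reference taken by the
lookup must be dropped as soon as the clamped range has been consumed.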
A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/file.c | 152 +++++++++++++++--------------------------------- 1 file changed, 48 insertions(+), 104 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 99cc95487d42..6bc2397e324c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3218,29 +3218,27 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end u64 *delalloc_start_ret, u64 *delalloc_end_ret) { u64 len = end + 1 - start; - struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_map *em; - u64 em_end; - u64 delalloc_len; - unsigned int outstanding_extents; + u64 delalloc_len = 0; + struct btrfs_ordered_extent *oe; + u64 oe_start; + u64 oe_end; /* * Search the io tree first for EXTENT_DELALLOC. If we find any, it * means we have delalloc (dirty pages) for which writeback has not * started yet. */ - spin_lock(&inode->lock); - outstanding_extents = inode->outstanding_extents; - - if (*search_io_tree && inode->delalloc_bytes > 0) { - spin_unlock(&inode->lock); - *delalloc_start_ret = start; - delalloc_len = count_range_bits(&inode->io_tree, - delalloc_start_ret, end, - len, EXTENT_DELALLOC, 1); - } else { - spin_unlock(&inode->lock); - delalloc_len = 0; + if (*search_io_tree) { + spin_lock(&inode->lock); + if (inode->delalloc_bytes > 0) { + spin_unlock(&inode->lock); + *delalloc_start_ret = start; + delalloc_len = count_range_bits(&inode->io_tree, + delalloc_start_ret, end, + len, EXTENT_DELALLOC, 1); + } else { + spin_unlock(&inode->lock); + } } if (delalloc_len > 0) { @@ -3254,7 +3252,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end /* Delalloc for the whole range, nothing more to do. */ if (*delalloc_end_ret == end) return true; - /* Else trim our search range for extent maps. */ + /* Else trim our search range for ordered extents. */ start = *delalloc_end_ret + 1; len = end + 1 - start; } @@ -3264,110 +3262,56 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end } /* - * No outstanding extents means we don't have any delalloc that is - * flushing, so return the unflushed range found in the io tree (if any). - */ - if (outstanding_extents == 0) - return (delalloc_len > 0); - - /* - * Now also check if there's any extent map in the range that does not - * map to a hole or prealloc extent. We do this because: + * Now also check if there's any ordered extent in the range. 
+ * We do this because:
 *
 * 1) When delalloc is flushed, the file range is locked, we clear the
- * EXTENT_DELALLOC bit from the io tree and create an extent map for
- * an allocated extent. So we might just have been called after
- * delalloc is flushed and before the ordered extent completes and
- * inserts the new file extent item in the subvolume's btree;
+ * EXTENT_DELALLOC bit from the io tree and create an extent map and
+ * an ordered extent for the write. So we might just have been called
+ * after delalloc is flushed and before the ordered extent completes
+ * and inserts the new file extent item in the subvolume's btree;
 *
- * 2) We may have an extent map created by flushing delalloc for a
+ * 2) We may have an ordered extent created by flushing delalloc for a
 * subrange that starts before the subrange we found marked with
 * EXTENT_DELALLOC in the io tree.
+ *
+ * We could also use the extent map tree to find such delalloc that is
+ * being flushed, but using the ordered extents tree is more efficient
+ * because it's usually much smaller as ordered extents are removed from
+ * the tree once they complete. With the extent maps, we may have them
+ * in the extent map tree for a very long time, and they were either
+ * created by previous writes or loaded by read operations.
 */
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, start, len);
- if (!em) {
- read_unlock(&em_tree->lock);
+ oe = btrfs_lookup_first_ordered_range(inode, start, len);
+ if (!oe)
 return (delalloc_len > 0);
- }
- /* extent_map_end() returns a non-inclusive end offset. */
- em_end = extent_map_end(em);
+ /* The ordered extent may span beyond our search range. */
+ oe_start = max(oe->file_offset, start);
+ oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
- /*
- * If we have a hole/prealloc extent map, check the next one if this one
- * ends before our range's end.
- */
- if ((em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) {
- struct extent_map *next_em;
+ btrfs_put_ordered_extent(oe);
- next_em = btrfs_next_extent_map(em_tree, em);
- free_extent_map(em);
-
- /*
- * There's no next extent map or the next one starts beyond our
- * range, return the range found in the io tree (if any).
- */
- if (!next_em || next_em->start > end) {
- read_unlock(&em_tree->lock);
- free_extent_map(next_em);
- return (delalloc_len > 0);
- }
-
- em_end = extent_map_end(next_em);
- em = next_em;
- }
-
- read_unlock(&em_tree->lock);
-
- /*
- * We have a hole or prealloc extent that ends at or beyond our range's
- * end, return the range found in the io tree (if any).
- */
- if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
- free_extent_map(em);
- return (delalloc_len > 0);
- }
-
- /*
- * We don't have any range as EXTENT_DELALLOC in the io tree, so the
- * extent map is the only subrange representing delalloc.
- */
+ /* Don't have unflushed delalloc, return the ordered extent range. */
 if (delalloc_len == 0) {
- *delalloc_start_ret = em->start;
- *delalloc_end_ret = min(end, em_end - 1);
- free_extent_map(em);
+ *delalloc_start_ret = oe_start;
+ *delalloc_end_ret = oe_end;
 return true;
 }
 /*
- * The extent map represents a delalloc range that starts before the
- * delalloc range we found in the io tree.
+ * We have both unflushed delalloc (io_tree) and an ordered extent.
+ * If the ranges are adjacent, return a combined range, otherwise
+ * return the leftmost range.
*/ - if (em->start < *delalloc_start_ret) { - *delalloc_start_ret = em->start; - /* - * If the ranges are adjacent, return a combined range. - * Otherwise return the extent map's range. - */ - if (em_end < *delalloc_start_ret) - *delalloc_end_ret = min(end, em_end - 1); - - free_extent_map(em); - return true; + if (oe_start < *delalloc_start_ret) { + if (oe_end < *delalloc_start_ret) + *delalloc_end_ret = oe_end; + *delalloc_start_ret = oe_start; + } else if (*delalloc_end_ret + 1 == oe_start) { + *delalloc_end_ret = oe_end; } - /* - * The extent map starts after the delalloc range we found in the io - * tree. If it's adjacent, return a combined range, otherwise return - * the range found in the io tree. - */ - if (*delalloc_end_ret + 1 == em->start) - *delalloc_end_ret = min(end, em_end - 1); - - free_extent_map(em); return true; } From cfd7a17d9b4588dd7a29e1a131257bee3e72b766 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:31 +0000 Subject: [PATCH 215/249] btrfs: remove no longer used btrfs_next_extent_map() There are no more users of btrfs_next_extent_map(), the previous patch in the series ("btrfs: search for delalloc more efficiently during lseek/fiemap") removed the last usage of the function, so delete it. This change is part of a patchset that has the goal to make performance better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to iterate over the extents of a file. Two examples are the cp program from coreutils 9.0+ and the tar program (when using its --sparse / -S option). A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 29 ----------------------------- fs/btrfs/extent_map.h | 2 -- 2 files changed, 31 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 4a4362f5cc52..be94030e1dfb 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -529,35 +529,6 @@ static struct extent_map *next_extent_map(const struct extent_map *em) return container_of(next, struct extent_map, rb_node); } -/* - * Get the extent map that immediately follows another one. - * - * @tree: The extent map tree that the extent map belong to. - * Holding read or write access on the tree's lock is required. - * @em: An extent map from the given tree. The caller must ensure that - * between getting @em and between calling this function, the - * extent map @em is not removed from the tree - for example, by - * holding the tree's lock for the duration of those 2 operations. - * - * Returns the extent map that immediately follows @em, or NULL if @em is the - * last extent map in the tree. 
- */
-struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree,
- const struct extent_map *em)
-{
- struct extent_map *next;
-
- /* The lock must be acquired either in read mode or write mode. */
- lockdep_assert_held(&tree->lock);
- ASSERT(extent_map_in_tree(em));
-
- next = next_extent_map(em);
- if (next)
- refcount_inc(&next->refs);
-
- return next;
-}
-
 static struct extent_map *prev_extent_map(struct extent_map *em)
 {
 struct rb_node *prev;
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 68d3f2c9ea1d..ad311864272a 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -87,8 +87,6 @@ static inline u64 extent_map_block_end(struct extent_map *em)
 void extent_map_tree_init(struct extent_map_tree *tree);
 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 u64 start, u64 len);
-struct extent_map *btrfs_next_extent_map(const struct extent_map_tree *tree,
- const struct extent_map *em);
 int add_extent_mapping(struct extent_map_tree *tree,
 struct extent_map *em, int modified);
 void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);

From 8c6e53a79d16b3651ad3abeb415e1c637da75082 Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 11 Nov 2022 11:50:32 +0000
Subject: [PATCH 216/249] btrfs: allow passing a cached state record to
 count_range_bits()

An inode's io_tree can be quite large and there are cases where due to
delalloc it can have thousands of extent state records, which makes the
red black tree have a depth of 10 or more, making the operation of
count_range_bits() slow if we repeatedly call it for a range that starts
where, or after, the previous one we called it for ended. Such use cases
are when searching for delalloc in a file range that corresponds to a
hole or a prealloc extent, which is done during lseek SEEK_HOLE/DATA and
fiemap.

So introduce a cached state parameter to count_range_bits() which we use
to store the last extent state record we visited, and then allow the
caller to pass it again on its next call to count_range_bits(). The next
patches in the series will make fiemap and lseek use the new parameter.

This change is part of a patchset that has the goal to make performance
better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to
iterate over the extents of a file. Two examples are the cp program from
coreutils 9.0+ and the tar program (when using its --sparse / -S option).
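The intended calling convention for the new parameter is sketched below
(illustrative only; the next_start variable stands for wherever the
caller's next search begins):

  struct extent_state *cached_state = NULL;
  u64 found_start = start;
  u64 bytes;

  /* The first call does a regular top-down search and caches the last
   * extent state record it visited. */
  bytes = count_range_bits(&inode->io_tree, &found_start, end,
                           end + 1 - start, EXTENT_DELALLOC, 1,
                           &cached_state);

  /* A later call starting at or after the cached record resumes from it
   * instead of descending the rbtree from the root again. */
  found_start = next_start;
  bytes = count_range_bits(&inode->io_tree, &found_start, end,
                           end + 1 - next_start, EXTENT_DELALLOC, 1,
                           &cached_state);

  /* The cache holds a reference on the last visited record. */
  free_extent_state(cached_state);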
A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 47 ++++++++++++++++++++++++++++++++++++--- fs/btrfs/extent-io-tree.h | 3 ++- fs/btrfs/file.c | 3 ++- fs/btrfs/inode.c | 2 +- 4 files changed, 49 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 285b0ff6e953..6b0d78df7eee 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1521,9 +1521,11 @@ out: */ u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig) + u32 bits, int contig, + struct extent_state **cached_state) { - struct extent_state *state; + struct extent_state *state = NULL; + struct extent_state *cached; u64 cur_start = *start; u64 total_bytes = 0; u64 last = 0; @@ -1534,11 +1536,41 @@ u64 count_range_bits(struct extent_io_tree *tree, spin_lock(&tree->lock); + if (!cached_state || !*cached_state) + goto search; + + cached = *cached_state; + + if (!extent_state_in_tree(cached)) + goto search; + + if (cached->start <= cur_start && cur_start <= cached->end) { + state = cached; + } else if (cached->start > cur_start) { + struct extent_state *prev; + + /* + * The cached state starts after our search range's start. Check + * if the previous state record starts at or before the range we + * are looking for, and if so, use it - this is a common case + * when there are holes between records in the tree. If there is + * no previous state record, we can start from our cached state. + */ + prev = prev_state(cached); + if (!prev) + state = cached; + else if (prev->start <= cur_start && cur_start <= prev->end) + state = prev; + } + /* * This search will find all the extents that end after our range * starts. 
*/ - state = tree_search(tree, cur_start); +search: + if (!state) + state = tree_search(tree, cur_start); + while (state) { if (state->start > search_end) break; @@ -1559,7 +1591,16 @@ u64 count_range_bits(struct extent_io_tree *tree, } state = next_state(state); } + + if (cached_state) { + free_extent_state(*cached_state); + *cached_state = state; + if (state) + refcount_inc(&state->refs); + } + spin_unlock(&tree->lock); + return total_bytes; } diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 18ab82f62611..e3eeec380844 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -119,7 +119,8 @@ void __cold extent_state_free_cachep(void); u64 count_range_bits(struct extent_io_tree *tree, u64 *start, u64 search_end, - u64 max_bytes, u32 bits, int contig); + u64 max_bytes, u32 bits, int contig, + struct extent_state **cached_state); void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 6bc2397e324c..dc8399610ca3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3235,7 +3235,8 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end *delalloc_start_ret = start; delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end, - len, EXTENT_DELALLOC, 1); + len, EXTENT_DELALLOC, 1, + NULL); } else { spin_unlock(&inode->lock); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aec1b232a71c..83898bca39d5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1769,7 +1769,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, * when starting writeback. */ count = count_range_bits(io_tree, &range_start, end, range_bytes, - EXTENT_NORESERVE, 0); + EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; From 1ee51a06255d425c1b7effff095f0bf9c7240078 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:33 +0000 Subject: [PATCH 217/249] btrfs: update stale comment for count_range_bits() The comment for count_range_bits() mentions that the search is fast if we are asking for a range with the EXTENT_DIRTY bit set. However that is no longer true since we don't use that bit and the optimization for that was removed in: commit 71528e9e16c7 ("btrfs: get rid of extent_io_tree::dirty_bytes") So remove that part of the comment mentioning the no longer existing optimized case, and, while at it, add proper documentation describing the purpose, arguments and return value of the function. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 6b0d78df7eee..21fa15123af8 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1515,9 +1515,29 @@ out: } /* - * Count the number of bytes in the tree that have a given bit(s) set. This - * can be fairly slow, except for EXTENT_DIRTY which is cached. The total - * number found is returned. + * Count the number of bytes in the tree that have a given bit(s) set for a + * given range. + * + * @tree: The io tree to search. + * @start: The start offset of the range. This value is updated to the + * offset of the first byte found with the given bit(s), so it + * can end up being bigger than the initial value. 
+ * @search_end: The end offset (inclusive value) of the search range.
+ * @max_bytes: The maximum byte count we are interested in. The search stops
+ * once it reaches this count.
+ * @bits: The bits the range must have in order to be accounted for.
+ * If multiple bits are set, then only subranges that have all
+ * the bits set are accounted for.
+ * @contig: Indicate if we should ignore holes in the range or not. If
+ * this is true, then stop once we find a hole.
+ * @cached_state: A cached state to be used across multiple calls to this
+ * function in order to speed up searches. Use NULL if this is
+ * called only once or if each call does not start where the
+ * previous one ended.
+ *
+ * Returns the total number of bytes found within the given range that have
+ * all given bits set. If the returned number of bytes is greater than zero
+ * then @start is updated with the offset of the first byte with the bits set.
 */
 u64 count_range_bits(struct extent_io_tree *tree,
 u64 *start, u64 search_end, u64 max_bytes,

From b3e744fe6d289bec77b138b6201201e4148dcb0e Mon Sep 17 00:00:00 2001
From: Filipe Manana
Date: Fri, 11 Nov 2022 11:50:34 +0000
Subject: [PATCH 218/249] btrfs: use cached state when looking for delalloc
 ranges with fiemap

During fiemap, whenever we find a hole or prealloc extent, we will look
for delalloc in that range, and one of the things we do for that is to
find out ranges in the inode's io_tree marked with EXTENT_DELALLOC, using
calls to count_range_bits().

Since we process file extents from left to right, if we have a file with
several holes or prealloc extents, we benefit from keeping a cached extent
state record for calls to count_range_bits(). Most of the time the last
extent state record we visited in one call to count_range_bits() matches
the first extent state record we will use in the next call to
count_range_bits(), so there's a benefit here. So use an extent state
record to cache results from count_range_bits() calls during fiemap.

This change is part of a patchset that has the goal to make performance
better for applications that use lseek's SEEK_HOLE and SEEK_DATA modes to
iterate over the extents of a file. Two examples are the cp program from
coreutils 9.0+ and the tar program (when using its --sparse / -S option).
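For reference, the fiemap path this optimizes is driven from userspace
through the FS_IOC_FIEMAP ioctl. A minimal, illustrative example that
lists a file's extents (error handling trimmed):

  #include <stdio.h>
  #include <stdlib.h>
  #include <fcntl.h>
  #include <unistd.h>
  #include <sys/ioctl.h>
  #include <linux/fs.h>
  #include <linux/fiemap.h>

  int main(int argc, char **argv)
  {
          unsigned int count = 32;
          int fd = open(argv[1], O_RDONLY);
          struct fiemap *fm = calloc(1, sizeof(*fm) +
                                     count * sizeof(struct fiemap_extent));

          fm->fm_start = 0;
          fm->fm_length = FIEMAP_MAX_OFFSET;      /* whole file */
          fm->fm_extent_count = count;
          if (ioctl(fd, FS_IOC_FIEMAP, fm) == 0) {
                  for (unsigned int i = 0; i < fm->fm_mapped_extents; i++)
                          printf("logical %llu len %llu flags 0x%x\n",
                                 (unsigned long long)fm->fm_extents[i].fe_logical,
                                 (unsigned long long)fm->fm_extents[i].fe_length,
                                 fm->fm_extents[i].fe_flags);
          }
          free(fm);
          close(fd);
          return 0;
  }

Each hole or prealloc extent reported by such a walk triggers the delalloc
lookup that now reuses the cached extent state record.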
A sample test and results are listed in the changelog of the last patch in the series: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 11 ++++++++++- fs/btrfs/file.c | 10 +++++++--- fs/btrfs/file.h | 1 + 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 33cc3a9a2b68..704ae7c08867 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3698,6 +3698,7 @@ static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path static int fiemap_process_hole(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache, + struct extent_state **delalloc_cached_state, struct btrfs_backref_share_check_ctx *backref_ctx, u64 disk_bytenr, u64 extent_offset, u64 extent_gen, @@ -3722,6 +3723,7 @@ static int fiemap_process_hole(struct btrfs_inode *inode, bool delalloc; delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, + delalloc_cached_state, &delalloc_start, &delalloc_end); if (!delalloc) @@ -3892,6 +3894,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, { const u64 ino = btrfs_ino(inode); struct extent_state *cached_state = NULL; + struct extent_state *delalloc_cached_state = NULL; struct btrfs_path *path; struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; @@ -3966,6 +3969,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, const u64 range_end = min(key.offset, lockend) - 1; ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, 0, 0, 0, prev_extent_end, range_end); if (ret < 0) { @@ -4006,6 +4010,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, extent_len, flags); } else if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, disk_bytenr, extent_offset, extent_gen, key.offset, @@ -4013,6 +4018,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, } else if (disk_bytenr == 0) { /* We have an explicit hole. 
*/ ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, 0, 0, 0, key.offset, extent_end - 1); } else { @@ -4070,7 +4076,8 @@ check_eof_delalloc: path = NULL; if (!stopped && prev_extent_end < lockend) { - ret = fiemap_process_hole(inode, fieinfo, &cache, backref_ctx, + ret = fiemap_process_hole(inode, fieinfo, &cache, + &delalloc_cached_state, backref_ctx, 0, 0, 0, prev_extent_end, lockend - 1); if (ret < 0) goto out_unlock; @@ -4088,6 +4095,7 @@ check_eof_delalloc: delalloc = btrfs_find_delalloc_in_range(inode, prev_extent_end, i_size - 1, + &delalloc_cached_state, &delalloc_start, &delalloc_end); if (!delalloc) @@ -4102,6 +4110,7 @@ check_eof_delalloc: out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); out: + free_extent_state(delalloc_cached_state); btrfs_free_backref_share_ctx(backref_ctx); btrfs_free_path(path); return ret; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index dc8399610ca3..da390f891220 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3214,6 +3214,7 @@ out: * looping while it gets adjacent subranges, and merging them together. */ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, bool *search_io_tree, u64 *delalloc_start_ret, u64 *delalloc_end_ret) { @@ -3236,7 +3237,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end, len, EXTENT_DELALLOC, 1, - NULL); + cached_state); } else { spin_unlock(&inode->lock); } @@ -3324,6 +3325,8 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end * sector size aligned. * @end: The end offset (inclusive value) of the search range. * It does not need to be sector size aligned. + * @cached_state: Extent state record used for speeding up delalloc + * searches in the inode's io_tree. Can be NULL. * @delalloc_start_ret: Output argument, set to the start offset of the * subrange found with delalloc (may not be sector size * aligned). @@ -3335,6 +3338,7 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end * end offsets of the subrange. 
*/ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, u64 *delalloc_start_ret, u64 *delalloc_end_ret) { u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize); @@ -3348,7 +3352,7 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, bool delalloc; delalloc = find_delalloc_subrange(inode, cur_offset, end, - &search_io_tree, + cached_state, &search_io_tree, &delalloc_start, &delalloc_end); if (!delalloc) @@ -3400,7 +3404,7 @@ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, u64 delalloc_end; bool delalloc; - delalloc = btrfs_find_delalloc_in_range(inode, start, end, + delalloc = btrfs_find_delalloc_in_range(inode, start, end, NULL, &delalloc_start, &delalloc_end); if (delalloc && whence == SEEK_DATA) { *start_ret = delalloc_start; diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index f3d794e33d46..82b34fbb295f 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -27,6 +27,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, size_t *write_bytes, bool nowait); void btrfs_check_nocow_unlock(struct btrfs_inode *inode); bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state, u64 *delalloc_start_ret, u64 *delalloc_end_ret); #endif From 3c32c7212f1639471ec0197ff1179b8ef2e0f3d3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 11 Nov 2022 11:50:35 +0000 Subject: [PATCH 219/249] btrfs: use cached state when looking for delalloc ranges with lseek During lseek (SEEK_HOLE/DATA), whenever we find a hole or prealloc extent, we will look for delalloc in that range, and one of the things we do for that is to find out ranges in the inode's io_tree marked with EXTENT_DELALLOC, using calls to count_range_bits(). Typically there's a single, or few, searches in the io_tree for delalloc per lseek call. However it's common for applications to keep calling lseek with SEEK_HOLE and SEEK_DATA to find where extents and holes are in a file, read the extents and skip holes in order to avoid unnecessary IO and save disk space by preserving holes. One popular user is the cp utility from coreutils. Starting with coreutils 9.0, cp uses SEEK_HOLE and SEEK_DATA to iterate over the extents of a file. Before 9.0, it used fiemap to figure out where holes and extents are in the source file. Another popular user is the tar utility when used with the --sparse / -S option to detect and preserve holes. Given that the pattern is to keep calling lseek with a start offset that matches the returned offset from the previous lseek call, we can benefit from caching the last extent state visited in count_range_bits() and use it for the next count_range_bits() from the next lseek call. Example, the following strace excerpt from running tar: $ strace tar cJSvf foo.tar.xz qemu_disk_file.raw (...) lseek(5, 125019574272, SEEK_HOLE) = 125024989184 lseek(5, 125024989184, SEEK_DATA) = 125024993280 lseek(5, 125024993280, SEEK_HOLE) = 125025239040 lseek(5, 125025239040, SEEK_DATA) = 125025255424 lseek(5, 125025255424, SEEK_HOLE) = 125025353728 lseek(5, 125025353728, SEEK_DATA) = 125025357824 lseek(5, 125025357824, SEEK_HOLE) = 125026766848 lseek(5, 125026766848, SEEK_DATA) = 125026770944 lseek(5, 125026770944, SEEK_HOLE) = 125027053568 (...) Shows that pattern, which is the same as with cp from coreutils 9.0+. 
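The loop behind that trace is easy to reproduce. A minimal, illustrative
walker (error handling trimmed; SEEK_DATA and SEEK_HOLE require
_GNU_SOURCE):

  #define _GNU_SOURCE
  #include <stdio.h>
  #include <fcntl.h>
  #include <unistd.h>

  int main(int argc, char **argv)
  {
          int fd = open(argv[1], O_RDONLY);
          off_t data = 0, hole;

          /* Alternate SEEK_DATA/SEEK_HOLE until lseek fails with ENXIO,
           * which means there is no more data past the current offset. */
          while ((data = lseek(fd, data, SEEK_DATA)) >= 0 &&
                 (hole = lseek(fd, data, SEEK_HOLE)) >= 0) {
                  printf("data: [%lld, %lld)\n",
                         (long long)data, (long long)hole);
                  data = hole;    /* resume the scan at the hole's start */
          }
          close(fd);
          return 0;
  }

Every iteration starts where the previous one ended, which is exactly the
access pattern the cached state is meant to exploit.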
So start using a cached state for the delalloc searches in lseek, and store it in struct file's private data so that it can be reused across lseek calls. This change is part of a patchset that is comprised of the following patches: 1/9 btrfs: remove leftover setting of EXTENT_UPTODATE state in an inode's io_tree 2/9 btrfs: add an early exit when searching for delalloc range for lseek/fiemap 3/9 btrfs: skip unnecessary delalloc searches during lseek/fiemap 4/9 btrfs: search for delalloc more efficiently during lseek/fiemap 5/9 btrfs: remove no longer used btrfs_next_extent_map() 6/9 btrfs: allow passing a cached state record to count_range_bits() 7/9 btrfs: update stale comment for count_range_bits() 8/9 btrfs: use cached state when looking for delalloc ranges with fiemap 9/9 btrfs: use cached state when looking for delalloc ranges with lseek The following test was run before and after applying the whole patchset: $ cat test-cp.sh #!/bin/bash DEV=/dev/sdh MNT=/mnt/sdh # coreutils 8.32, cp uses fiemap to detect holes and extents #CP_PROG=/usr/bin/cp # coreutils 9.1, cp uses SEEK_HOLE/DATA to detect holes and extents CP_PROG=/home/fdmanana/git/hub/coreutils/src/cp umount $DEV &> /dev/null mkfs.btrfs -f $DEV mount $DEV $MNT FILE_SIZE=$((1024 * 1024 * 1024)) echo "Creating file with a size of $((FILE_SIZE / 1024 / 1024))M" # Create a very sparse file, where each extent has a length of 4K and # is preceded by a 4K hole and followed by another 4K hole. start=$(date +%s%N) echo -n > $MNT/foobar for ((off = 0; off < $FILE_SIZE; off += 8192)); do xfs_io -c "pwrite -S 0xab $off 4K" $MNT/foobar > /dev/null echo -ne "\r$off / $FILE_SIZE ..." done end=$(date +%s%N) echo -e "\nFile created ($(( (end - start) / 1000000 )) milliseconds)" start=$(date +%s%N) $CP_PROG $MNT/foobar /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "cp took $dur milliseconds with data/metadata cached and delalloc" # Flush all delalloc. sync start=$(date +%s%N) $CP_PROG $MNT/foobar /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "cp took $dur milliseconds with data/metadata cached and no delalloc" # Unmount and mount again to test the case without any metadata # loaded in memory. 
umount $MNT mount $DEV $MNT start=$(date +%s%N) $CP_PROG $MNT/foobar /dev/null end=$(date +%s%N) dur=$(( (end - start) / 1000000 )) echo "cp took $dur milliseconds without data/metadata cached and no delalloc" umount $MNT The results, running on a box with a non-debug kernel (Debian's default kernel config), were the following: 128M file, before patchset: cp took 16574 milliseconds with data/metadata cached and delalloc cp took 122 milliseconds with data/metadata cached and no delalloc cp took 20144 milliseconds without data/metadata cached and no delalloc 128M file, after patchset: cp took 6277 milliseconds with data/metadata cached and delalloc cp took 109 milliseconds with data/metadata cached and no delalloc cp took 210 milliseconds without data/metadata cached and no delalloc 512M file, before patchset: cp took 14369 milliseconds with data/metadata cached and delalloc cp took 429 milliseconds with data/metadata cached and no delalloc cp took 88034 milliseconds without data/metadata cached and no delalloc 512M file, after patchset: cp took 12106 milliseconds with data/metadata cached and delalloc cp took 427 milliseconds with data/metadata cached and no delalloc cp took 824 milliseconds without data/metadata cached and no delalloc 1G file, before patchset: cp took 10074 milliseconds with data/metadata cached and delalloc cp took 886 milliseconds with data/metadata cached and no delalloc cp took 181261 milliseconds without data/metadata cached and no delalloc 1G file, after patchset: cp took 3320 milliseconds with data/metadata cached and delalloc cp took 880 milliseconds with data/metadata cached and no delalloc cp took 1801 milliseconds without data/metadata cached and no delalloc Reported-by: Wang Yugui Link: https://lore.kernel.org/linux-btrfs/20221106073028.71F9.409509F4@e16-tech.com/ Link: https://lore.kernel.org/linux-btrfs/CAL3q7H5NSVicm7nYBJ7x8fFkDpno8z3PYt5aPU43Bajc1H0h1Q@mail.gmail.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 1 + fs/btrfs/file.c | 40 ++++++++++++++++++++++++++++++++-------- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5649f8907984..99defab7e1f6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -426,6 +426,7 @@ struct btrfs_drop_extents_args { struct btrfs_file_private { void *filldir_buf; + struct extent_state *llseek_cached_state; }; static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index da390f891220..448b143a5cb2 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1696,10 +1696,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp) { struct btrfs_file_private *private = filp->private_data; - if (private && private->filldir_buf) + if (private) { kfree(private->filldir_buf); - kfree(private); - filp->private_data = NULL; + free_extent_state(private->llseek_cached_state); + kfree(private); + filp->private_data = NULL; + } /* * Set by setattr when we are about to truncate a file from a non-zero @@ -3398,13 +3400,14 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end, * is found, it updates @start_ret with the start of the subrange. 
*/ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, + struct extent_state **cached_state, u64 start, u64 end, u64 *start_ret) { u64 delalloc_start; u64 delalloc_end; bool delalloc; - delalloc = btrfs_find_delalloc_in_range(inode, start, end, NULL, + delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state, &delalloc_start, &delalloc_end); if (delalloc && whence == SEEK_DATA) { *start_ret = delalloc_start; @@ -3447,11 +3450,13 @@ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence, return false; } -static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, - int whence) +static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) { + struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host); + struct btrfs_file_private *private = file->private_data; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_state *cached_state = NULL; + struct extent_state **delalloc_cached_state; const loff_t i_size = i_size_read(&inode->vfs_inode); const u64 ino = btrfs_ino(inode); struct btrfs_root *root = inode->root; @@ -3476,6 +3481,22 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, inode_get_bytes(&inode->vfs_inode) == i_size) return i_size; + if (!private) { + private = kzalloc(sizeof(*private), GFP_KERNEL); + /* + * No worries if memory allocation failed. + * The private structure is used only for speeding up multiple + * lseek SEEK_HOLE/DATA calls to a file when there's delalloc, + * so everything will still be correct. + */ + file->private_data = private; + } + + if (private) + delalloc_cached_state = &private->llseek_cached_state; + else + delalloc_cached_state = NULL; + /* * offset can be negative, in this case we start finding DATA/HOLE from * the very start of the file. @@ -3553,6 +3574,7 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, search_start = offset; found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, search_start, key.offset - 1, &found_start); @@ -3587,6 +3609,7 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, search_start = offset; found = find_desired_extent_in_hole(inode, whence, + delalloc_cached_state, search_start, extent_end - 1, &found_start); @@ -3628,7 +3651,8 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, /* We have an implicit hole from the last extent found up to i_size. 
*/
 if (!found && start < i_size) {
- found = find_desired_extent_in_hole(inode, whence, start,
+ found = find_desired_extent_in_hole(inode, whence,
+ delalloc_cached_state, start,
 i_size - 1, &start);
 if (!found)
 start = i_size;
@@ -3657,7 +3681,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
 case SEEK_DATA:
 case SEEK_HOLE:
 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
- offset = find_desired_extent(BTRFS_I(inode), offset, whence);
+ offset = find_desired_extent(file, offset, whence);
 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
 break;
 }

From cb3e217bdb39e390f8e64af519acb02af336b53d Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Sun, 13 Nov 2022 09:32:07 +0800
Subject: [PATCH 220/249] btrfs: use btrfs_dev_name() helper to handle missing
 devices better

[BUG]
If dev-replace failed to re-construct its data/metadata, the kernel
message would be incorrect for the missing device:

 BTRFS info (device dm-1): dev_replace from <missing disk> (devid 2) to /dev/mapper/test-scratch2 started
 BTRFS error (device dm-1): failed to rebuild valid logical 38862848 for dev (efault)

Note the above "dev (efault)" of the second line.
While the first line is properly reporting "<missing disk>".

[CAUSE]
Although dev-replace is using btrfs_dev_name(), the heavy lifting work
is still done by scrub (scrub is reused by both dev-replace and regular
scrub).

Unfortunately scrub code never uses btrfs_dev_name() helper, as it's
only declared locally inside dev-replace.c.

[FIX]
Fix the output by:

- Move the btrfs_dev_name() helper to volumes.h

- Use btrfs_dev_name() to replace open-coded rcu_str_deref() calls
  Only zoned code is not touched, as I'm not familiar with degraded
  zoned code.

- Constify return value and parameter

Now the output looks pretty sane:

 BTRFS info (device dm-1): dev_replace from <missing disk> (devid 2) to /dev/mapper/test-scratch2 started
 BTRFS error (device dm-1): failed to rebuild valid logical 38862848 for dev <missing disk>

Reviewed-by: Anand Jain
Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/check-integrity.c | 2 +-
 fs/btrfs/dev-replace.c | 15 +++------------
 fs/btrfs/disk-io.c | 2 +-
 fs/btrfs/extent-tree.c | 2 +-
 fs/btrfs/extent_io.c | 3 +--
 fs/btrfs/ioctl.c | 4 ++--
 fs/btrfs/scrub.c | 20 +++++++++-----------
 fs/btrfs/super.c | 2 +-
 fs/btrfs/volumes.c | 16 ++++++++--------
 fs/btrfs/volumes.h | 9 +++++++++
 10 files changed, 36 insertions(+), 39 deletions(-)

diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 7ff0703ef3e4..82e49d985019 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -757,7 +757,7 @@ static int btrfsic_process_superblock_dev_mirror(
 btrfs_info_in_rcu(fs_info,
 "new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
 superblock_bdev,
- rcu_str_deref(device->name), dev_bytenr,
+ btrfs_dev_name(device), dev_bytenr,
 dev_state->bdev, dev_bytenr,
 superblock_mirror_num);
 list_add(&superblock_tmp->all_blocks_node,
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 9c4a8649a0f4..78696d331639 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -18,7 +18,6 @@
 #include "volumes.h"
 #include "async-thread.h"
 #include "check-integrity.h"
-#include "rcu-string.h"
 #include "dev-replace.h"
 #include "sysfs.h"
 #include "zoned.h"
@@ -451,14 +450,6 @@ out:
 return ret;
 }
-static char* btrfs_dev_name(struct btrfs_device *device)
-{
- if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
- return "<missing disk>";
- else
- return rcu_str_deref(device->name);
-}
-
 static int mark_block_group_to_copy(struct
btrfs_fs_info *fs_info, struct btrfs_device *src_dev) { @@ -674,7 +665,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, "dev_replace from %s (devid %llu) to %s started", btrfs_dev_name(src_device), src_device->devid, - rcu_str_deref(tgt_device->name)); + btrfs_dev_name(tgt_device)); /* * from now on, the writes to the srcdev are all duplicated to @@ -933,7 +924,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, "btrfs_scrub_dev(%s, %llu, %s) failed %d", btrfs_dev_name(src_device), src_device->devid, - rcu_str_deref(tgt_device->name), scrub_ret); + btrfs_dev_name(tgt_device), scrub_ret); error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); @@ -951,7 +942,7 @@ error: "dev_replace from %s (devid %llu) to %s finished", btrfs_dev_name(src_device), src_device->devid, - rcu_str_deref(tgt_device->name)); + btrfs_dev_name(tgt_device)); clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state); tgt_device->devid = src_device->devid; src_device->devid = BTRFS_DEV_REPLACE_DEVID; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 559e7f3d727e..91a088210e5a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3944,7 +3944,7 @@ static void btrfs_end_super_write(struct bio *bio) if (bio->bi_status) { btrfs_warn_rl_in_rcu(device->fs_info, "lost page write due to IO error on %s (%d)", - rcu_str_deref(device->name), + btrfs_dev_name(device), blk_status_to_errno(bio->bi_status)); ClearPageUptodate(page); SetPageError(page); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 10cc757a602b..17f599027c3d 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6048,7 +6048,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) btrfs_warn_in_rcu(fs_info, "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", start, end - start + 1, - rcu_str_deref(device->name), + btrfs_dev_name(device), device->total_bytes); mutex_unlock(&fs_info->chunk_mutex); ret = 0; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 704ae7c08867..65ba5c3658cf 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -611,8 +611,7 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, btrfs_info_rl_in_rcu(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", - ino, start, - rcu_str_deref(dev->name), sector); + ino, start, btrfs_dev_name(dev), sector); ret = 0; out_bio_uninit: diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7a9e697d9931..bed74a3ff574 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1228,7 +1228,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret == 0 && new_size != old_size) btrfs_info_in_rcu(fs_info, "resize device %s (devid %llu) from %llu to %llu", - rcu_str_deref(device->name), device->devid, + btrfs_dev_name(device), device->devid, old_size, new_size); out_finish: btrfs_exclop_finish(fs_info); @@ -2860,7 +2860,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); if (dev->name) { - strncpy(di_args->path, rcu_str_deref(dev->name), + strncpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path) - 1); di_args->path[sizeof(di_args->path) - 1] = 0; } else { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 288a97992aa4..0ce5b773867f 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -17,7 +17,6 @@ 
#include "extent_io.h" #include "dev-replace.h" #include "check-integrity.h" -#include "rcu-string.h" #include "raid56.h" #include "block-group.h" #include "zoned.h" @@ -877,7 +876,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes, btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)", swarn->errstr, swarn->logical, - rcu_str_deref(swarn->dev->name), + btrfs_dev_name(swarn->dev), swarn->physical, root, inum, offset, fs_info->sectorsize, nlink, @@ -891,7 +890,7 @@ err: btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d", swarn->errstr, swarn->logical, - rcu_str_deref(swarn->dev->name), + btrfs_dev_name(swarn->dev), swarn->physical, root, inum, offset, ret); @@ -922,8 +921,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) /* Super block error, no need to search extent tree. */ if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu", - errstr, rcu_str_deref(dev->name), - sblock->physical); + errstr, btrfs_dev_name(dev), sblock->physical); return; } path = btrfs_alloc_path(); @@ -954,7 +952,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) btrfs_warn_in_rcu(fs_info, "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu", errstr, swarn.logical, - rcu_str_deref(dev->name), + btrfs_dev_name(dev), swarn.physical, ref_level ? "node" : "leaf", ret < 0 ? -1 : ref_level, @@ -1377,7 +1375,7 @@ corrected_error: spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "fixed up error at logical %llu on dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } } else { did_not_correct_error: @@ -1386,7 +1384,7 @@ did_not_correct_error: spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "unable to fixup (regular) error at logical %llu on dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } out: @@ -2332,14 +2330,14 @@ static void scrub_missing_raid56_worker(struct work_struct *work) spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "IO error rebuilding logical %llu for dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } else if (sblock->header_error || sblock->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.uncorrectable_errors++; spin_unlock(&sctx->stat_lock); btrfs_err_rl_in_rcu(fs_info, "failed to rebuild valid logical %llu for dev %s", - logical, rcu_str_deref(dev->name)); + logical, btrfs_dev_name(dev)); } else { scrub_write_block_to_dev_replace(sblock); } @@ -4303,7 +4301,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, mutex_unlock(&fs_info->fs_devices->device_list_mutex); btrfs_err_in_rcu(fs_info, "scrub on devid %llu: filesystem on %s is not writable", - devid, rcu_str_deref(dev->name)); + devid, btrfs_dev_name(dev)); ret = -EROFS; goto out; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d54bfec8e506..ea83dd9f735a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2336,7 +2336,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root) * the end of RCU grace period. 
*/ rcu_read_lock(); - seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\"); + seq_escape(m, btrfs_dev_name(fs_info->fs_devices->latest_dev), " \t\n\\"); rcu_read_unlock(); return 0; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 49d6e79d4343..2d0950a17f48 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -941,7 +941,7 @@ static noinline struct btrfs_device *device_list_add(const char *path, } btrfs_info_in_rcu(NULL, "devid %llu device path %s changed to %s scanned by %s (%d)", - devid, rcu_str_deref(device->name), + devid, btrfs_dev_name(device), path, current->comm, task_pid_nr(current)); } @@ -2101,7 +2101,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, if (btrfs_pinned_by_swapfile(fs_info, device)) { btrfs_warn_in_rcu(fs_info, "cannot remove device %s (devid %llu) due to active swapfile", - rcu_str_deref(device->name), device->devid); + btrfs_dev_name(device), device->devid); return -ETXTBSY; } @@ -6827,7 +6827,7 @@ static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) btrfs_debug_in_rcu(dev->fs_info, "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, - (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name), + (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), dev->devid, bio->bi_iter.bi_size); btrfsic_check_bio(bio); @@ -7908,7 +7908,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, if (ret < 0) { btrfs_warn_in_rcu(fs_info, "error %d while searching for dev_stats item for device %s", - ret, rcu_str_deref(device->name)); + ret, btrfs_dev_name(device)); goto out; } @@ -7919,7 +7919,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, if (ret != 0) { btrfs_warn_in_rcu(fs_info, "delete too small dev_stats item for device %s failed %d", - rcu_str_deref(device->name), ret); + btrfs_dev_name(device), ret); goto out; } ret = 1; @@ -7933,7 +7933,7 @@ static int update_dev_stat_item(struct btrfs_trans_handle *trans, if (ret < 0) { btrfs_warn_in_rcu(fs_info, "insert dev_stats item for device %s failed %d", - rcu_str_deref(device->name), ret); + btrfs_dev_name(device), ret); goto out; } } @@ -7998,7 +7998,7 @@ void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) return; btrfs_err_rl_in_rcu(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", - rcu_str_deref(dev->name), + btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), @@ -8018,7 +8018,7 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev) btrfs_info_in_rcu(dev->fs_info, "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u", - rcu_str_deref(dev->name), + btrfs_dev_name(dev), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS), btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS), diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index efa6a3d48cd8..2c90e50c460a 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -12,6 +12,7 @@ #include "async-thread.h" #include "messages.h" #include "disk-io.h" +#include "rcu-string.h" #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) @@ -770,6 +771,14 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev, atomic_inc(&dev->dev_stats_ccnt); } +static inline const char *btrfs_dev_name(const struct btrfs_device *device) +{ + if (!device || 
test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+		return "<missing disk>";
+	else
+		return rcu_str_deref(device->name);
+}
+
 void btrfs_commit_device_sizes(struct btrfs_transaction *trans);
 
 struct list_head * __attribute_const__ btrfs_get_fs_uuids(void);

From 9f0eac070d23405f18e7a84820bc3d59b1415bec Mon Sep 17 00:00:00 2001
From: Li zeming
Date: Wed, 26 Oct 2022 09:36:11 +0800
Subject: [PATCH 221/249] btrfs: allocate btrfs_io_context without GFP_NOFAIL

The __GFP_NOFAIL flag could loop indefinitely when allocating memory in
alloc_btrfs_io_context(). The callers starting from __btrfs_map_block
already handle errors, so it's safe to drop the flag.

Signed-off-by: Li zeming
Reviewed-by: David Sterba
Signed-off-by: David Sterba
---
 fs/btrfs/volumes.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 2d0950a17f48..e51fd5f1042a 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -5899,7 +5899,10 @@ static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_
 		 * and the stripes.
 		 */
 		sizeof(u64) * (total_stripes),
-		GFP_NOFS|__GFP_NOFAIL);
+		GFP_NOFS);
+
+	if (!bioc)
+		return NULL;
 
 	refcount_set(&bioc->refs, 1);

From cb649e81dad429ffbd97a689ac0011599952a668 Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 14 Nov 2022 08:26:31 +0800
Subject: [PATCH 222/249] btrfs: refactor checksum calculations in
 btrfs_lookup_csums_range()

The refactoring involves the following parts:

- Introduce bytes_to_csum_size() and csum_size_to_bytes() helpers
  As we have quite some open-coded calculations, some of them are even
  split into two assignments just to fit the 80 char limit.

- Remove the @csum_size parameter from max_ordered_sum_bytes()
  Csum size can be fetched from @fs_info. And we will use the
  csum_size_to_bytes() helper anyway.

- Add a comment explaining how we handle the first search result

- Use the newly introduced helpers to clean up btrfs_lookup_csums_range()

- Move variable declarations to the minimal scope

- Never mix number of sectors with bytes
  There are several locations doing things like:

	size = min_t(size_t, csum_end - start,
		     max_ordered_sum_bytes(fs_info));
	...
	size >>= fs_info->sectorsize_bits

  Or

	offset = (start - key.offset) >> fs_info->sectorsize_bits;
	offset *= csum_size;

  Make sure these variables can only represent BYTES inside the
  function, by using the above bytes_to_csum_size() helpers.
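As a sanity check of the conversion, a worked example (assuming the
common 4KiB sectorsize and 4 byte CRC32C checksums; nothing in the
patch depends on these exact values):

	bytes_to_csum_size(fs_info, SZ_64K)
		= (65536 >> 12) * 4  = 64 bytes of csum data
	csum_size_to_bytes(fs_info, 64)
		= (64 / 4) << 12     = 65536 bytes covered

The two helpers are exact inverses of each other whenever their input
is properly aligned, which the ASSERT()s in them enforce.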
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 68 ++++++++++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 456f71b42a9c..654ad2494bce 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -126,12 +126,26 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, start + len - 1, EXTENT_DIRTY, NULL); } -static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, - u16 csum_size) +static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes) { - u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size; + ASSERT(IS_ALIGNED(bytes, fs_info->sectorsize)); - return ncsums * fs_info->sectorsize; + return (bytes >> fs_info->sectorsize_bits) * fs_info->csum_size; +} + +static size_t csum_size_to_bytes(const struct btrfs_fs_info *fs_info, u32 csum_size) +{ + ASSERT(IS_ALIGNED(csum_size, fs_info->csum_size)); + + return (csum_size / fs_info->csum_size) << fs_info->sectorsize_bits; +} + +static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info) +{ + u32 max_csum_size = round_down(PAGE_SIZE - sizeof(struct btrfs_ordered_sum), + fs_info->csum_size); + + return csum_size_to_bytes(fs_info, max_csum_size); } /* @@ -140,9 +154,7 @@ static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info, */ static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes) { - int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize); - - return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size; + return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes); } int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, @@ -526,11 +538,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct btrfs_ordered_sum *sums; struct btrfs_csum_item *item; LIST_HEAD(tmplist); - unsigned long offset; int ret; - size_t size; - u64 csum_end; - const u32 csum_size = fs_info->csum_size; ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(end + 1, fs_info->sectorsize)); @@ -556,16 +564,33 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (ret > 0 && path->slots[0] > 0) { leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + + /* + * There are two cases we can hit here for the previous csum + * item: + * + * |<- search range ->| + * |<- csum item ->| + * + * Or + * |<- search range ->| + * |<- csum item ->| + * + * Check if the previous csum item covers the leading part of + * the search range. If so we have to start from previous csum + * item. 
+ */ if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && key.type == BTRFS_EXTENT_CSUM_KEY) { - offset = (start - key.offset) >> fs_info->sectorsize_bits; - if (offset * csum_size < + if (bytes_to_csum_size(fs_info, start - key.offset) < btrfs_item_size(leaf, path->slots[0] - 1)) path->slots[0]--; } } while (start <= end) { + u64 csum_end; + leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); @@ -585,8 +610,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (key.offset > start) start = key.offset; - size = btrfs_item_size(leaf, path->slots[0]); - csum_end = key.offset + (size / csum_size) * fs_info->sectorsize; + csum_end = key.offset + csum_size_to_bytes(fs_info, + btrfs_item_size(leaf, path->slots[0])); if (csum_end <= start) { path->slots[0]++; continue; @@ -596,8 +621,11 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_csum_item); while (start < csum_end) { + unsigned long offset; + size_t size; + size = min_t(size_t, csum_end - start, - max_ordered_sum_bytes(fs_info, csum_size)); + max_ordered_sum_bytes(fs_info)); sums = kzalloc(btrfs_ordered_sum_size(fs_info, size), GFP_NOFS); if (!sums) { @@ -608,16 +636,14 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, sums->bytenr = start; sums->len = (int)size; - offset = (start - key.offset) >> fs_info->sectorsize_bits; - offset *= csum_size; - size >>= fs_info->sectorsize_bits; + offset = bytes_to_csum_size(fs_info, start - key.offset); read_extent_buffer(path->nodes[0], sums->sums, ((unsigned long)item) + offset, - csum_size * size); + bytes_to_csum_size(fs_info, size)); - start += fs_info->sectorsize * size; + start += size; list_add_tail(&sums->list, &tmplist); } path->slots[0]++; From 97e3823933108cfc648bb08d5ad36251b6588164 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 14 Nov 2022 08:26:32 +0800 Subject: [PATCH 223/249] btrfs: introduce a bitmap based csum range search function Although we have an existing function, btrfs_lookup_csums_range(), to find all data checksums for a range, it's based on a btrfs_ordered_sum list. For the incoming RAID56 data checksum verification at RMW time, we don't want to waste time by allocating temporary memory. So this patch will introduce a new helper, btrfs_lookup_csums_bitmap(). It will use bitmap based result, which will be a perfect fit for later RAID56 usage. 
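For reference, a minimal sketch of the intended calling convention
(mirroring what the later RAID56 user does; the variable names are
illustrative and sizing the buffers correctly is the caller's job):

	const u32 nsectors = len >> fs_info->sectorsize_bits;
	/* One csum slot per sector, one bit per sector. */
	u8 *csum_buf = kzalloc(nsectors * fs_info->csum_size, GFP_NOFS);
	unsigned long *csum_bitmap = bitmap_zalloc(nsectors, GFP_NOFS);
	unsigned int i;
	int ret;

	if (!csum_buf || !csum_bitmap)
		goto out;
	ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1,
					csum_buf, csum_bitmap);
	if (ret < 0)
		goto out;
	/* Sector i has a csum iff bit i is set, at a fixed offset. */
	for_each_set_bit(i, csum_bitmap, nsectors) {
		const u8 *csum = csum_buf + i * fs_info->csum_size;
		/* ... verify sector i against @csum ... */
	}
out:
	kfree(csum_buf);
	bitmap_free(csum_bitmap);

Ranges without a csum (e.g. NODATASUM extents) simply leave their bits
clear instead of forcing the caller to walk a list.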
Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 127 +++++++++++++++++++++++++++++++++++++++++- fs/btrfs/file-item.h | 8 ++- fs/btrfs/inode.c | 5 +- fs/btrfs/relocation.c | 4 +- fs/btrfs/scrub.c | 8 +-- fs/btrfs/tree-log.c | 15 +++-- 6 files changed, 144 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 654ad2494bce..352bbb33b76f 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -527,9 +527,9 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst return ret; } -int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait) +int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; @@ -661,6 +661,127 @@ fail: return ret; } +/* + * Do the same work as btrfs_lookup_csums_list(), the difference is in how + * we return the result. + * + * This version will set the corresponding bits in @csum_bitmap to represent + * that there is a csum found. + * Each bit represents a sector. Thus caller should ensure @csum_buf passed + * in is large enough to contain all csums. + */ +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, + u8 *csum_buf, unsigned long *csum_bitmap) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_csum_item *item; + const u64 orig_start = start; + int ret; + + ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && + IS_ALIGNED(end + 1, fs_info->sectorsize)); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = start; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + if (ret > 0 && path->slots[0] > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + + /* + * There are two cases we can hit here for the previous csum + * item: + * + * |<- search range ->| + * |<- csum item ->| + * + * Or + * |<- search range ->| + * |<- csum item ->| + * + * Check if the previous csum item covers the leading part of + * the search range. If so we have to start from previous csum + * item. 
+ */ + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY) { + if (bytes_to_csum_size(fs_info, start - key.offset) < + btrfs_item_size(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } + + while (start <= end) { + u64 csum_end; + + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto fail; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY || + key.offset > end) + break; + + if (key.offset > start) + start = key.offset; + + csum_end = key.offset + csum_size_to_bytes(fs_info, + btrfs_item_size(leaf, path->slots[0])); + if (csum_end <= start) { + path->slots[0]++; + continue; + } + + csum_end = min(csum_end, end + 1); + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + while (start < csum_end) { + unsigned long offset; + size_t size; + u8 *csum_dest = csum_buf + bytes_to_csum_size(fs_info, + start - orig_start); + + size = min_t(size_t, csum_end - start, end + 1 - start); + + offset = bytes_to_csum_size(fs_info, start - key.offset); + + read_extent_buffer(path->nodes[0], csum_dest, + ((unsigned long)item) + offset, + bytes_to_csum_size(fs_info, size)); + + bitmap_set(csum_bitmap, + (start - orig_start) >> fs_info->sectorsize_bits, + size >> fs_info->sectorsize_bits); + + start += size; + } + path->slots[0]++; + } + ret = 0; +fail: + btrfs_free_path(path); + return ret; +} + /* * Calculate checksums of the data contained inside a bio. * diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index ba12711cf933..95b371208b30 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -18,9 +18,11 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sums); blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, u64 offset, bool one_ordered); -int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, - struct list_head *list, int search_commit, - bool nowait); +int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list, int search_commit, + bool nowait); +int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end, + u8 *csum_buf, unsigned long *csum_bitmap); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, const struct btrfs_path *path, struct btrfs_file_extent_item *fi, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 83898bca39d5..4248e6cabbdc 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1709,9 +1709,8 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info, int ret; LIST_HEAD(list); - ret = btrfs_lookup_csums_range(csum_root, bytenr, - bytenr + num_bytes - 1, &list, 0, - nowait); + ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1, + &list, 0, nowait); if (ret == 0 && list_empty(&list)) return 0; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 56c8afa6f6d2..aa80e51bc8ca 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4357,8 +4357,8 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len) disk_bytenr = file_pos + inode->index_cnt; csum_root = btrfs_csum_root(fs_info, disk_bytenr); - ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, - disk_bytenr + len - 1, &list, 0, false); + ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, + 
disk_bytenr + len - 1, &list, 0, false); if (ret) goto out; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0ce5b773867f..52b346795f66 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -3238,9 +3238,9 @@ static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx, extent_dev = bioc->stripes[0].dev; btrfs_put_bioc(bioc); - ret = btrfs_lookup_csums_range(csum_root, extent_start, - extent_start + extent_size - 1, - &sctx->csum_list, 1, false); + ret = btrfs_lookup_csums_list(csum_root, extent_start, + extent_start + extent_size - 1, + &sctx->csum_list, 1, false); if (ret) { scrub_parity_mark_sectors_error(sparity, extent_start, extent_size); @@ -3464,7 +3464,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, cur_logical; if (extent_flags & BTRFS_EXTENT_FLAG_DATA) { - ret = btrfs_lookup_csums_range(csum_root, cur_logical, + ret = btrfs_lookup_csums_list(csum_root, cur_logical, cur_logical + scrub_len - 1, &sctx->csum_list, 1, false); if (ret) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4d9f6803bfbe..8f8d7e7dc3a8 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -826,7 +826,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, btrfs_file_extent_num_bytes(eb, item); } - ret = btrfs_lookup_csums_range(root->log_root, + ret = btrfs_lookup_csums_list(root->log_root, csum_start, csum_end - 1, &ordered_sums, 0, false); if (ret) @@ -4443,9 +4443,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, csum_root = btrfs_csum_root(trans->fs_info, disk_bytenr); disk_bytenr += extent_offset; - ret = btrfs_lookup_csums_range(csum_root, disk_bytenr, - disk_bytenr + extent_num_bytes - 1, - &ordered_sums, 0, false); + ret = btrfs_lookup_csums_list(csum_root, disk_bytenr, + disk_bytenr + extent_num_bytes - 1, + &ordered_sums, 0, false); if (ret) goto out; @@ -4638,10 +4638,9 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, /* block start is already adjusted for the file extent offset. */ csum_root = btrfs_csum_root(trans->fs_info, em->block_start); - ret = btrfs_lookup_csums_range(csum_root, - em->block_start + csum_offset, - em->block_start + csum_offset + - csum_len - 1, &ordered_sums, 0, false); + ret = btrfs_lookup_csums_list(csum_root, em->block_start + csum_offset, + em->block_start + csum_offset + + csum_len - 1, &ordered_sums, 0, false); if (ret) return ret; From c5a415627be4758ebdd274de7148706d6713c7ec Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Mon, 14 Nov 2022 08:26:33 +0800 Subject: [PATCH 224/249] btrfs: raid56: prepare data checksums for later RMW verification This is for later data checksum verification at RMW time. This patch will try to allocate the needed memory for a locked rbio if the rbio is for data exclusively (we don't want to handle mixed bg yet). The memory will be released when the rbio is finished. 
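To put the memory overhead into perspective, a back-of-the-envelope
example (assuming 4KiB sectorsize, 4 byte CRC32C checksums and a 3-disk
RAID5 array with a 64KiB stripe length; none of these values are
mandated by the patch):

	nr_data = 2, stripe_nsectors = 64KiB / 4KiB = 16
	csum_buf    = 2 * 16 * 4 bytes = 128 bytes
	csum_bitmap = 2 * 16 bits      = 32 bits

So the allocation is tiny compared to the 128KiB of data stripe pages
the rbio already holds.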
Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/raid56.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/raid56.h | 12 ++++++++
 2 files changed, 86 insertions(+)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 11be5d0a7eab..5ef4fbb49df2 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -20,6 +20,7 @@
 #include "volumes.h"
 #include "raid56.h"
 #include "async-thread.h"
+#include "file-item.h"
 
 /* set when additional merges to this rbio are not allowed */
 #define RBIO_RMW_LOCKED_BIT	1
@@ -835,6 +836,11 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
 	struct bio *cur = bio_list_get(&rbio->bio_list);
 	struct bio *extra;
 
+	kfree(rbio->csum_buf);
+	bitmap_free(rbio->csum_bitmap);
+	rbio->csum_buf = NULL;
+	rbio->csum_bitmap = NULL;
+
 	/*
 	 * Clear the data bitmap, as the rbio may be cached for later usage.
 	 * do this before unlock_stripe() so there will be no new bio
@@ -2048,6 +2054,67 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
 	start_async_work(rbio, recover_rbio_work);
 }
 
+static void fill_data_csums(struct btrfs_raid_bio *rbio)
+{
+	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+	struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
+						       rbio->bioc->raid_map[0]);
+	const u64 start = rbio->bioc->raid_map[0];
+	const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
+			fs_info->sectorsize_bits;
+	int ret;
+
+	/* The rbio should not have its csum buffer initialized. */
+	ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
+
+	/*
+	 * Skip the csum search if:
+	 *
+	 * - The rbio doesn't belong to data block groups
+	 *   Then we are doing IO for tree blocks, no need to search csums.
+	 *
+	 * - The rbio belongs to mixed block groups
+	 *   This is to avoid deadlock, as we're already holding the full
+	 *   stripe lock, if we trigger a metadata read, and it needs to do
+	 *   raid56 recovery, we will deadlock.
+	 */
+	if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
+	    rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
+		return;
+
+	rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
+				 fs_info->csum_size, GFP_NOFS);
+	rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
+					  GFP_NOFS);
+	if (!rbio->csum_buf || !rbio->csum_bitmap) {
+		ret = -ENOMEM;
+		goto error;
+	}
+
+	ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1,
+					rbio->csum_buf, rbio->csum_bitmap);
+	if (ret < 0)
+		goto error;
+	if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
+		goto no_csum;
+	return;
+
+error:
+	/*
+	 * We failed to allocate memory or grab the csum, but it's not fatal,
+	 * we can still continue. But better to warn users that RMW is no
+	 * longer safe for this particular sub-stripe write.
+	 */
+	btrfs_warn_rl(fs_info,
+"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
+		      rbio->bioc->raid_map[0], ret);
+no_csum:
+	kfree(rbio->csum_buf);
+	bitmap_free(rbio->csum_bitmap);
+	rbio->csum_buf = NULL;
+	rbio->csum_bitmap = NULL;
+}
+
 static int rmw_read_and_wait(struct btrfs_raid_bio *rbio)
 {
 	struct bio_list bio_list;
@@ -2056,6 +2123,13 @@ static int rmw_read_and_wait(struct btrfs_raid_bio *rbio)
 
 	bio_list_init(&bio_list);
 
+	/*
+	 * Fill the data csums we need for data verification. We need to fill
+	 * the csum_bitmap/csum_buf first, as our endio function will try to
+	 * verify the data sectors.
+	 */
+	fill_data_csums(rbio);
+
 	ret = rmw_assemble_read_bios(rbio, &bio_list);
 	if (ret < 0)
 		goto out;
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index a2e653e93fd8..7c73a443939e 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -129,6 +129,18 @@ struct btrfs_raid_bio {
 	 * Thus making it much harder to iterate.
 	 */
 	unsigned long *error_bitmap;
+
+	/*
+	 * Checksum buffer if the rbio is for data. The buffer should cover
+	 * all data sectors (excluding P/Q sectors).
+	 */
+	u8 *csum_buf;
+
+	/*
+	 * Each bit represents if the corresponding sector has data csum found.
+	 * Should only cover data sectors (excluding P/Q sectors).
+	 */
+	unsigned long *csum_bitmap;
 };
 
 /*

From 7a3150723061ba1ac2ba10b1392b1cd75234172a Mon Sep 17 00:00:00 2001
From: Qu Wenruo
Date: Mon, 14 Nov 2022 08:26:34 +0800
Subject: [PATCH 225/249] btrfs: raid56: do data csum verification during RMW
 cycle

[BUG]
For the following small script, btrfs will be unable to recover the
content of file1:

  mkfs.btrfs -f -m raid1 -d raid5 -b 1G $dev1 $dev2 $dev3

  mount $dev1 $mnt
  xfs_io -f -c "pwrite -S 0xff 0 64k" -c sync $mnt/file1
  md5sum $mnt/file1
  umount $mnt

  # Corrupt the above 64K data stripe.
  xfs_io -f -c "pwrite -S 0x00 323026944 64K" -c sync $dev3
  mount $dev1 $mnt

  # Write a new 64K, which should be in the other data stripe
  # And this is a sub-stripe write, which will cause RMW
  xfs_io -f -c "pwrite 0 64k" -c sync $mnt/file2
  md5sum $mnt/file1
  umount $mnt

The above md5sum would fail.

[CAUSE]
There is a long existing problem for raid56 (not limited to btrfs
raid56) that, if we already have some corrupted on-disk data, and then
trigger a sub-stripe write (which needs an RMW cycle), it can cause
further damage to the P/Q stripe.

  Disk 1: data 1 |0x000000000000| <- Corrupted
  Disk 2: data 2 |0x000000000000|
  Disk 3: parity |0xffffffffffff|

In the above case, data 1 is already corrupted; the original data
should be 64KiB of 0xff.

At this stage, if we read data 1, and it has a data checksum, we can
still recover it via the regular RAID56 recovery path.

But if now we decide to write some data into data 2, then we need to go
RMW.
Let's say we want to write 64KiB of '0x00' into data 2, then we read
the on-disk data of data 1, calculate the new parity, resulting in the
following layout:

  Disk 1: data 1 |0x000000000000| <- Corrupted
  Disk 2: data 2 |0x000000000000| <- New '0x00' writes
  Disk 3: parity |0x000000000000| <- New Parity.

But the new parity is calculated using the *corrupted* data 1, so we
can no longer recover the correct data of data 1. Thus the corruption
is there forever.

[FIX]
To solve the above problem, this patch will do a full stripe data
checksum verification at RMW time.

This involves the following changes:

- Always read the full stripe (including data/P/Q) when doing RMW
  Before, we only read the missing data sectors, but since we may do a
  data csum verification and recovery, we need to read everything out.

  Please note that, if we have a cached rbio, we don't need to read
  anything, and can treat it the same as a full stripe write.
  As only a stripe with all its csums matching can be cached.

- Verify the data csum during read
  The goal is only the rbio stripe sectors, and only if the rbio
  already has csum_buf/csum_bitmap filled.

  Sectors which cannot pass csum verification will have their bit set
  in error_bitmap.

- Always call recover_sectors() after we read out all the sectors
  Since error_bitmap will be updated during read, recover_sectors()
  can easily find out all the bad sectors and try to recover (if still
  under tolerance).
  And since recover_sectors() is already migrated to use error_bitmap,
  it can skip vertical stripes which don't have any error.

- Verify the repaired sectors against their csums in recover_vertical()

- Rename rmw_read_and_wait() to rmw_read_wait_recover()
  Since we will always recover the sectors, the old name is no longer
  accurate.

  Furthermore, since recovery is already done in rmw_read_wait_recover(),
  we no longer need to call recover_sectors() inside rmw_rbio().

Obviously this will have a performance impact, as we are doing more
work during the RMW cycle:

- Fetch the data checksums
- Do checksum verification for all data stripes
- Do checksum verification again after repair

But for a full stripe write or a cached rbio we won't have the overhead
at all, thus for a fully optimized RAID56 workload (always full stripe
write) there should be no extra overhead.

To me, the extra overhead looks reasonable, as data consistency is way
more important than performance.

Signed-off-by: Qu Wenruo
Signed-off-by: David Sterba
---
 fs/btrfs/raid56.c | 169 +++++++++++++++++++++++++++++++++++++---------
 1 file changed, 137 insertions(+), 32 deletions(-)

diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 5ef4fbb49df2..2d90a6b5eb00 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -21,6 +21,7 @@
 #include "raid56.h"
 #include "async-thread.h"
 #include "file-item.h"
+#include "btrfs_inode.h"
 
 /* set when additional merges to this rbio are not allowed */
 #define RBIO_RMW_LOCKED_BIT	1
@@ -1433,14 +1434,56 @@ static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bi
 			  bio_size >> rbio->bioc->fs_info->sectorsize_bits);
 }
 
+/* Verify the data sectors at read time. */
+static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
+				    struct bio *bio)
+{
+	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+	int total_sector_nr = get_bio_sector_nr(rbio, bio);
+	struct bio_vec *bvec;
+	struct bvec_iter_all iter_all;
+
+	/* No data csum for the whole stripe, no need to verify. */
+	if (!rbio->csum_bitmap || !rbio->csum_buf)
+		return;
+
+	/* P/Q stripes, they have no data csum to verify against. */
+	if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
+		return;
+
+	bio_for_each_segment_all(bvec, bio, iter_all) {
+		int bv_offset;
+
+		for (bv_offset = bvec->bv_offset;
+		     bv_offset < bvec->bv_offset + bvec->bv_len;
+		     bv_offset += fs_info->sectorsize, total_sector_nr++) {
+			u8 csum_buf[BTRFS_CSUM_SIZE];
+			u8 *expected_csum = rbio->csum_buf +
+					    total_sector_nr * fs_info->csum_size;
+			int ret;
+
+			/* No csum for this sector, skip to the next sector. */
+			if (!test_bit(total_sector_nr, rbio->csum_bitmap))
+				continue;
+
+			ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
+				bv_offset, csum_buf, expected_csum);
+			if (ret < 0)
+				set_bit(total_sector_nr, rbio->error_bitmap);
+		}
+	}
+}
+
 static void raid_wait_read_end_io(struct bio *bio)
 {
 	struct btrfs_raid_bio *rbio = bio->bi_private;
 
-	if (bio->bi_status)
+	if (bio->bi_status) {
 		rbio_update_error_bitmap(rbio, bio);
-	else
+	} else {
 		set_bio_pages_uptodate(rbio, bio);
+		verify_bio_data_sectors(rbio, bio);
+	}
 
 	bio_put(bio);
 	if (atomic_dec_and_test(&rbio->stripes_pending))
@@ -1469,37 +1512,25 @@ static void submit_read_bios(struct btrfs_raid_bio *rbio,
 static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio,
 				  struct bio_list *bio_list)
 {
-	const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data;
 	struct bio *bio;
 	int total_sector_nr;
 	int ret = 0;
 
 	ASSERT(bio_list_size(bio_list) == 0);
 
-	/* Build a list of bios to read all the missing data sectors. */
-	for (total_sector_nr = 0; total_sector_nr < nr_data_sectors;
+	/*
+	 * Build a list of bios to read all sectors (including data and P/Q).
+	 *
+	 * This behavior is to compensate for the later csum verification and
+	 * recovery.
+	 */
+	for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
 	     total_sector_nr++) {
 		struct sector_ptr *sector;
 		int stripe = total_sector_nr / rbio->stripe_nsectors;
 		int sectornr = total_sector_nr % rbio->stripe_nsectors;
 
-		/*
-		 * We want to find all the sectors missing from the rbio and
-		 * read them from the disk. If sector_in_rbio() finds a page
-		 * in the bio list we don't need to read it off the stripe.
-		 */
-		sector = sector_in_rbio(rbio, stripe, sectornr, 1);
-		if (sector)
-			continue;
-
 		sector = rbio_stripe_sector(rbio, stripe, sectornr);
-		/*
-		 * The bio cache may have handed us an uptodate page. If so,
-		 * use it.
-		 */
-		if (sector->uptodate)
-			continue;
-
 		ret = rbio_add_io_sector(rbio, bio_list, sector,
 			       stripe, sectornr, REQ_OP_READ);
 		if (ret)
@@ -1670,6 +1701,42 @@ fail:
 	bio_endio(bio);
 }
 
+static int verify_one_sector(struct btrfs_raid_bio *rbio,
+			     int stripe_nr, int sector_nr)
+{
+	struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
+	struct sector_ptr *sector;
+	u8 csum_buf[BTRFS_CSUM_SIZE];
+	u8 *csum_expected;
+	int ret;
+
+	if (!rbio->csum_bitmap || !rbio->csum_buf)
+		return 0;
+
+	/* No way to verify P/Q as they are not covered by data csum. */
+	if (stripe_nr >= rbio->nr_data)
+		return 0;
+	/*
+	 * If we're rebuilding a read, we have to use pages from the
+	 * bio list if possible.
+	 */
+	if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
+	     rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
+		sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
+	} else {
+		sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
+	}
+
+	ASSERT(sector->page);
+
+	csum_expected = rbio->csum_buf +
+			(stripe_nr * rbio->stripe_nsectors + sector_nr) *
+			fs_info->csum_size;
+	ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
+				      csum_buf, csum_expected);
+	return ret;
+}
+
 /*
  * Recover a vertical stripe specified by @sector_nr.
  * @*pointers are the pre-allocated pointers by the caller, so we don't
@@ -1685,6 +1752,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
 	int faila;
 	int failb;
 	int stripe_nr;
+	int ret = 0;
 
 	/*
 	 * Now we just use bitmap to mark the horizontal stripes in
@@ -1805,12 +1873,23 @@ pstripe:
 	 * uptodate.
 	 * Especially if we determine to cache the rbio, we need to
 	 * have at least all data sectors uptodate.
+	 *
+	 * If possible, also check if the repaired sector matches its data
+	 * checksum.
 	 */
 	if (faila >= 0) {
+		ret = verify_one_sector(rbio, faila, sector_nr);
+		if (ret < 0)
+			goto cleanup;
+
 		sector = rbio_stripe_sector(rbio, faila, sector_nr);
 		sector->uptodate = 1;
 	}
 	if (failb >= 0) {
+		ret = verify_one_sector(rbio, failb, sector_nr);
+		if (ret < 0)
+			goto cleanup;
+
 		sector = rbio_stripe_sector(rbio, failb, sector_nr);
 		sector->uptodate = 1;
 	}
 
@@ -1818,7 +1897,7 @@ pstripe:
 cleanup:
 	for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
 		kunmap_local(unmap_array[stripe_nr]);
-	return 0;
+	return ret;
 }
 
 static int recover_sectors(struct btrfs_raid_bio *rbio)
@@ -2115,7 +2194,7 @@ no_csum:
 	rbio->csum_bitmap = NULL;
 }
 
-static int rmw_read_and_wait(struct btrfs_raid_bio *rbio)
+static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
 {
 	struct bio_list bio_list;
 	struct bio *bio;
@@ -2136,6 +2215,12 @@ static int rmw_read_and_wait(struct btrfs_raid_bio *rbio)
 
 	submit_read_bios(rbio, &bio_list);
 	wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
+
+	/*
+	 * We may or may not have any corrupted sectors (including missing dev
+	 * and csum mismatch), just let recover_sectors() handle them all.
+	 */
+	ret = recover_sectors(rbio);
 	return ret;
 out:
 	while ((bio = bio_list_pop(&bio_list)))
@@ -2175,6 +2260,28 @@ static void submit_write_bios(struct btrfs_raid_bio *rbio,
 	}
 }
 
+/*
+ * To determine if we need to read any sector from the disk.
+ * Should only be utilized in RMW path, to skip cached rbio.
+ */
+static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
+{
+	int i;
+
+	for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
+		struct sector_ptr *sector = &rbio->stripe_sectors[i];
+
+		/*
+		 * We have a sector which doesn't have a page nor is uptodate,
+		 * thus this rbio cannot be a cached one, as a cached one must
+		 * have all its data sectors present and uptodate.
+		 */
+		if (!sector->page || !sector->uptodate)
+			return true;
+	}
+	return false;
+}
+
 static int rmw_rbio(struct btrfs_raid_bio *rbio)
 {
 	struct bio_list bio_list;
@@ -2189,9 +2296,13 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio)
 	if (ret < 0)
 		return ret;
 
-	/* Full stripe write, can write the full stripe right now. */
-	if (rbio_is_full(rbio))
+	/*
+	 * Either full stripe write, or we have every data sector already
+	 * cached, can go to write path immediately.
+	 */
+	if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio))
 		goto write;
+
 	/*
 	 * Now we're doing sub-stripe write, also need all data stripes to do
 	 * the full RMW.
@@ -2202,16 +2313,10 @@ static int rmw_rbio(struct btrfs_raid_bio *rbio)
 
 	index_rbio_pages(rbio);
 
-	ret = rmw_read_and_wait(rbio);
+	ret = rmw_read_wait_recover(rbio);
 	if (ret < 0)
 		return ret;
 
-	/* We have read errors, try recovery path. */
-	if (!bitmap_empty(rbio->error_bitmap, rbio->nr_sectors)) {
-		ret = recover_rbio(rbio);
-		if (ret < 0)
-			return ret;
-	}
 write:
 	/*
 	 * At this stage we're not allowed to add any new bios to the

From 27137fac4c0628fc8320bb7f1ce3bb9f84b76a9b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Tue, 15 Nov 2022 10:44:04 +0100
Subject: [PATCH 226/249] btrfs: move struct btrfs_tree_parent_check out of
 disk-io.h

Move struct btrfs_tree_parent_check out of disk-io.h so that volumes.h
and various .c files don't have to include disk-io.h just for it.
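The mechanics, as a minimal sketch (illustrative only; the function
name below is made up and not part of this patch): a header that only
passes the structure around by pointer can get away with a forward
declaration, while only code that actually touches the members needs
the definition from tree-checker.h:

	/* some header: pointer use only, a forward declaration suffices */
	struct btrfs_tree_parent_check;
	int do_parent_check(struct btrfs_tree_parent_check *check);

	/* some .c file: dereferences the members, needs the definition */
	#include "tree-checker.h"

	int do_parent_check(struct btrfs_tree_parent_check *check)
	{
		return check->has_first_key ? 0 : -EINVAL;
	}

This is exactly the pattern the disk-io.h hunk below applies, shrinking
the header to a bare forward declaration.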
Reviewed-by: Josef Bacik
Reviewed-by: Johannes Thumshirn
Reviewed-by: Qu Wenruo
Signed-off-by: Christoph Hellwig
Reviewed-by: David Sterba
[ use tree-checker.h for the structure ]
Signed-off-by: David Sterba
---
 fs/btrfs/backref.c      |  1 +
 fs/btrfs/disk-io.h      | 30 +-----------------------------
 fs/btrfs/print-tree.c   |  1 +
 fs/btrfs/qgroup.c       |  1 +
 fs/btrfs/tree-checker.h | 35 +++++++++++++++++++++++++++++++++--
 fs/btrfs/tree-mod-log.c |  1 +
 fs/btrfs/volumes.h      |  2 +-
 7 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 55c072ba6747..21c92c74bf71 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -19,6 +19,7 @@
 #include "accessors.h"
 #include "extent-tree.h"
 #include "relocation.h"
+#include "tree-checker.h"
 
 /* Just arbitrary numbers so we can be sure one of these happened. */
 #define BACKREF_FOUND_SHARED     6
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 03fe4154ffb8..363935cfc084 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -25,37 +25,9 @@ static inline u64 btrfs_sb_offset(int mirror)
 	return BTRFS_SUPER_INFO_OFFSET;
 }
 
-/* All the extra info needed to verify the parentness of a tree block. */
-struct btrfs_tree_parent_check {
-	/*
-	 * The owner check against the tree block.
-	 *
-	 * Can be 0 to skip the owner check.
-	 */
-	u64 owner_root;
-
-	/*
-	 * Expected transid, can be 0 to skip the check, but such skip
-	 * should only be utlized for backref walk related code.
-	 */
-	u64 transid;
-
-	/*
-	 * The expected first key.
-	 *
-	 * This check can be skipped if @has_first_key is false, such skip
-	 * can happen for case where we don't have the parent node key,
-	 * e.g. reading the tree root, doing backref walk.
-	 */
-	struct btrfs_key first_key;
-	bool has_first_key;
-
-	/* The expected level. Should always be set. */
-	u8 level;
-};
-
 struct btrfs_device;
 struct btrfs_fs_devices;
+struct btrfs_tree_parent_check;
 
 void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info);
 void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 1469aa55ad48..b93c96213304 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -8,6 +8,7 @@
 #include "disk-io.h"
 #include "print-tree.h"
 #include "accessors.h"
+#include "tree-checker.h"
 
 struct root_name_map {
 	u64 id;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index e0522c6c0d67..5c636e00d77d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -28,6 +28,7 @@
 #include "accessors.h"
 #include "extent-tree.h"
 #include "root-tree.h"
+#include "tree-checker.h"
 
 /*
  * Helpers to access qgroup reservation
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index ece497e26558..bfb5efa4e01f 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -6,8 +6,39 @@
 #ifndef BTRFS_TREE_CHECKER_H
 #define BTRFS_TREE_CHECKER_H
 
-#include "ctree.h"
-#include "extent_io.h"
+#include <uapi/linux/btrfs_tree.h>
+
+struct extent_buffer;
+struct btrfs_chunk;
+
+/* All the extra info needed to verify the parentness of a tree block. */
+struct btrfs_tree_parent_check {
+	/*
+	 * The owner check against the tree block.
+	 *
+	 * Can be 0 to skip the owner check.
+	 */
+	u64 owner_root;
+
+	/*
+	 * Expected transid, can be 0 to skip the check, but such skip
+	 * should only be utilized for backref walk related code.
+	 */
+	u64 transid;
+
+	/*
+	 * The expected first key.
+	 *
+	 * This check can be skipped if @has_first_key is false, such skip
+	 * can happen for cases where we don't have the parent node key,
+	 * e.g.
reading the tree root, doing backref walk. + */ + struct btrfs_key first_key; + bool has_first_key; + + /* The expected level. Should always be set. */ + u8 level; +}; /* * Comprehensive leaf checker. diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 779ad44d285f..146a6b198933 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -5,6 +5,7 @@ #include "disk-io.h" #include "fs.h" #include "accessors.h" +#include "tree-checker.h" struct tree_mod_root { u64 logical; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2c90e50c460a..ab551471c1f8 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -11,7 +11,7 @@ #include #include "async-thread.h" #include "messages.h" -#include "disk-io.h" +#include "tree-checker.h" #include "rcu-string.h" #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) From 103c19723c80bf74e4e70cd6cde3b8783a27aceb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Nov 2022 10:44:05 +0100 Subject: [PATCH 227/249] btrfs: split the bio submission path into a separate file The code used by btrfs_submit_bio only interacts with the rest of volumes.c through __btrfs_map_block (which itself is a more generic version of two exported helpers) and does not really have anything to do with volumes.c. Create a new bio.c file and a bio.h header going along with it for the btrfs_bio-based storage layer, which will grow even more going forward. Also update the file with my copyright notice given that a large part of the moved code was written or rewritten by me. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Makefile | 2 +- fs/btrfs/bio.c | 291 ++++++++++++++++++++++++++++++++++++++++ fs/btrfs/bio.h | 127 ++++++++++++++++++ fs/btrfs/compression.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-tree.c | 1 + fs/btrfs/extent_io.c | 2 +- fs/btrfs/file-item.c | 2 +- fs/btrfs/inode.c | 2 +- fs/btrfs/relocation.c | 1 + fs/btrfs/super.c | 2 +- fs/btrfs/tree-log.c | 1 + fs/btrfs/volumes.c | 296 +---------------------------------------- fs/btrfs/volumes.h | 110 +-------------- 14 files changed, 438 insertions(+), 403 deletions(-) create mode 100644 fs/btrfs/bio.c create mode 100644 fs/btrfs/bio.h diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 84fb3b4c35b0..555c962fdad6 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o + subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c new file mode 100644 index 000000000000..9e881dc91bae --- /dev/null +++ b/fs/btrfs/bio.c @@ -0,0 +1,291 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2022 Christoph Hellwig. + */ + +#include +#include "bio.h" +#include "ctree.h" +#include "volumes.h" +#include "raid56.h" +#include "async-thread.h" +#include "check-integrity.h" +#include "dev-replace.h" +#include "rcu-string.h" +#include "zoned.h" + +static struct bio_set btrfs_bioset; + +/* + * Initialize a btrfs_bio structure. 
This skips the embedded bio itself as it + * is already initialized by the block layer. + */ +static inline void btrfs_bio_init(struct btrfs_bio *bbio, + btrfs_bio_end_io_t end_io, void *private) +{ + memset(bbio, 0, offsetof(struct btrfs_bio, bio)); + bbio->end_io = end_io; + bbio->private = private; +} + +/* + * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for + * btrfs, and is used for all I/O submitted through btrfs_submit_bio. + * + * Just like the underlying bio_alloc_bioset it will not fail as it is backed by + * a mempool. + */ +struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + btrfs_bio_end_io_t end_io, void *private) +{ + struct bio *bio; + + bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); + btrfs_bio_init(btrfs_bio(bio), end_io, private); + return bio; +} + +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, + btrfs_bio_end_io_t end_io, void *private) +{ + struct bio *bio; + struct btrfs_bio *bbio; + + ASSERT(offset <= UINT_MAX && size <= UINT_MAX); + + bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); + bbio = btrfs_bio(bio); + btrfs_bio_init(bbio, end_io, private); + + bio_trim(bio, offset >> 9, size >> 9); + bbio->iter = bio->bi_iter; + return bio; +} + +static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) +{ + if (!dev || !dev->bdev) + return; + if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) + return; + + if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + if (!(bio->bi_opf & REQ_RAHEAD)) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); + if (bio->bi_opf & REQ_PREFLUSH) + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); +} + +static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, + struct bio *bio) +{ + if (bio->bi_opf & REQ_META) + return fs_info->endio_meta_workers; + return fs_info->endio_workers; +} + +static void btrfs_end_bio_work(struct work_struct *work) +{ + struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work); + + bbio->end_io(bbio); +} + +static void btrfs_simple_end_io(struct bio *bio) +{ + struct btrfs_fs_info *fs_info = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(fs_info); + + if (bio->bi_status) + btrfs_log_dev_io_error(bio, bbio->device); + + if (bio_op(bio) == REQ_OP_READ) { + INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); + queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); + } else { + bbio->end_io(bbio); + } +} + +static void btrfs_raid56_end_io(struct bio *bio) +{ + struct btrfs_io_context *bioc = bio->bi_private; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(bioc->fs_info); + bbio->mirror_num = bioc->mirror_num; + bbio->end_io(bbio); + + btrfs_put_bioc(bioc); +} + +static void btrfs_orig_write_end_io(struct bio *bio) +{ + struct btrfs_io_stripe *stripe = bio->bi_private; + struct btrfs_io_context *bioc = stripe->bioc; + struct btrfs_bio *bbio = btrfs_bio(bio); + + btrfs_bio_counter_dec(bioc->fs_info); + + if (bio->bi_status) { + atomic_inc(&bioc->error); + btrfs_log_dev_io_error(bio, stripe->dev); + } + + /* + * Only send an error to the higher layers if it is beyond the tolerance + * threshold. 
+ */ + if (atomic_read(&bioc->error) > bioc->max_errors) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_status = BLK_STS_OK; + + bbio->end_io(bbio); + btrfs_put_bioc(bioc); +} + +static void btrfs_clone_write_end_io(struct bio *bio) +{ + struct btrfs_io_stripe *stripe = bio->bi_private; + + if (bio->bi_status) { + atomic_inc(&stripe->bioc->error); + btrfs_log_dev_io_error(bio, stripe->dev); + } + + /* Pass on control to the original bio this one was cloned from */ + bio_endio(stripe->bioc->orig_bio); + bio_put(bio); +} + +static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) +{ + if (!dev || !dev->bdev || + test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || + (btrfs_op(bio) == BTRFS_MAP_WRITE && + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { + bio_io_error(bio); + return; + } + + bio_set_dev(bio, dev->bdev); + + /* + * For zone append writing, bi_sector must point the beginning of the + * zone + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; + + if (btrfs_dev_is_sequential(dev, physical)) { + u64 zone_start = round_down(physical, + dev->fs_info->zone_size); + + bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; + } else { + bio->bi_opf &= ~REQ_OP_ZONE_APPEND; + bio->bi_opf |= REQ_OP_WRITE; + } + } + btrfs_debug_in_rcu(dev->fs_info, + "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", + __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, + (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), + dev->devid, bio->bi_iter.bi_size); + + btrfsic_check_bio(bio); + submit_bio(bio); +} + +static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) +{ + struct bio *orig_bio = bioc->orig_bio, *bio; + + ASSERT(bio_op(orig_bio) != REQ_OP_READ); + + /* Reuse the bio embedded into the btrfs_bio for the last mirror */ + if (dev_nr == bioc->num_stripes - 1) { + bio = orig_bio; + bio->bi_end_io = btrfs_orig_write_end_io; + } else { + bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); + bio_inc_remaining(orig_bio); + bio->bi_end_io = btrfs_clone_write_end_io; + } + + bio->bi_private = &bioc->stripes[dev_nr]; + bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; + bioc->stripes[dev_nr].bioc = bioc; + btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); +} + +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) +{ + u64 logical = bio->bi_iter.bi_sector << 9; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; + struct btrfs_io_context *bioc = NULL; + struct btrfs_io_stripe smap; + int ret; + + btrfs_bio_counter_inc_blocked(fs_info); + ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num, 1); + if (ret) { + btrfs_bio_counter_dec(fs_info); + btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); + return; + } + + if (map_length < length) { + btrfs_crit(fs_info, + "mapping failed logical %llu bio len %llu len %llu", + logical, length, map_length); + BUG(); + } + + if (!bioc) { + /* Single mirror read/write fast path */ + btrfs_bio(bio)->mirror_num = mirror_num; + btrfs_bio(bio)->device = smap.dev; + bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; + bio->bi_private = fs_info; + bio->bi_end_io = btrfs_simple_end_io; + btrfs_submit_dev_bio(smap.dev, bio); + } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + /* Parity RAID write or read recovery */ + bio->bi_private = bioc; + bio->bi_end_io = btrfs_raid56_end_io; + if (bio_op(bio) == 
REQ_OP_READ) + raid56_parity_recover(bio, bioc, mirror_num); + else + raid56_parity_write(bio, bioc); + } else { + /* Write to multiple mirrors */ + int total_devs = bioc->num_stripes; + int dev_nr; + + bioc->orig_bio = bio; + for (dev_nr = 0; dev_nr < total_devs; dev_nr++) + btrfs_submit_mirrored_bio(bioc, dev_nr); + } +} + +int __init btrfs_bioset_init(void) +{ + if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_bio, bio), + BIOSET_NEED_BVECS)) + return -ENOMEM; + return 0; +} + +void __cold btrfs_bioset_exit(void) +{ + bioset_exit(&btrfs_bioset); +} diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h new file mode 100644 index 000000000000..b12f84b3b341 --- /dev/null +++ b/fs/btrfs/bio.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * Copyright (C) 2022 Christoph Hellwig. + */ + +#ifndef BTRFS_BIO_H +#define BTRFS_BIO_H + +#include +#include +#include "tree-checker.h" + +struct btrfs_bio; +struct btrfs_fs_info; + +#define BTRFS_BIO_INLINE_CSUM_SIZE 64 + +/* + * Maximum number of sectors for a single bio to limit the size of the + * checksum array. This matches the number of bio_vecs per bio and thus the + * I/O size for buffered I/O. + */ +#define BTRFS_MAX_BIO_SECTORS (256) + +typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); + +/* + * Additional info to pass along bio. + * + * Mostly for btrfs specific features like csum and mirror_num. + */ +struct btrfs_bio { + unsigned int mirror_num:7; + + /* + * Extra indicator for metadata bios. + * For some btrfs bios they use pages without a mapping, thus + * we can not rely on page->mapping->host to determine if + * it's a metadata bio. + */ + unsigned int is_metadata:1; + struct bvec_iter iter; + + /* for direct I/O */ + u64 file_offset; + + /* @device is for stripe IO submission. */ + struct btrfs_device *device; + union { + /* For data checksum verification. */ + struct { + u8 *csum; + u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; + }; + + /* For metadata parentness verification. */ + struct btrfs_tree_parent_check parent_check; + }; + + /* End I/O information supplied to btrfs_bio_alloc */ + btrfs_bio_end_io_t end_io; + void *private; + + /* For read end I/O handling */ + struct work_struct end_io_work; + + /* + * This member must come last, bio_alloc_bioset will allocate enough + * bytes for entire btrfs_bio but relies on bio being last. + */ + struct bio bio; +}; + +static inline struct btrfs_bio *btrfs_bio(struct bio *bio) +{ + return container_of(bio, struct btrfs_bio, bio); +} + +int __init btrfs_bioset_init(void); +void __cold btrfs_bioset_exit(void); + +struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, + btrfs_bio_end_io_t end_io, void *private); +struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, + btrfs_bio_end_io_t end_io, void *private); + + +static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) +{ + bbio->bio.bi_status = status; + bbio->end_io(bbio); +} + +static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) +{ + if (bbio->is_metadata) + return; + if (bbio->csum != bbio->csum_inline) { + kfree(bbio->csum); + bbio->csum = NULL; + } +} + +/* + * Iterate through a btrfs_bio (@bbio) on a per-sector basis. 
+ * + * bvl - struct bio_vec + * bbio - struct btrfs_bio + * iter - struct bvec_iter + * bio_offset - unsigned int + */ +#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ + for ((iter) = (bbio)->iter, (bio_offset) = 0; \ + (iter).bi_size && \ + (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ + (bio_offset) += fs_info->sectorsize, \ + bio_advance_iter_single(&(bbio)->bio, &(iter), \ + (fs_info)->sectorsize)) + +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, + int mirror_num); +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num); + +#endif diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 30adf430241e..5122ca79f7ea 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -27,7 +27,7 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "volumes.h" +#include "bio.h" #include "ordered-data.h" #include "compression.h" #include "extent_io.h" diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 91a088210e5a..d5be259845d1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -23,7 +23,7 @@ #include "disk-io.h" #include "transaction.h" #include "btrfs_inode.h" -#include "volumes.h" +#include "bio.h" #include "print-tree.h" #include "locking.h" #include "tree-log.h" diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 17f599027c3d..892d78c1853c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -42,6 +42,7 @@ #include "root-tree.h" #include "file-item.h" #include "orphan.h" +#include "tree-checker.h" #undef SCRAMBLE_DELAYED_REFS diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 65ba5c3658cf..95d54b5834c0 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -20,7 +20,7 @@ #include "extent_map.h" #include "ctree.h" #include "btrfs_inode.h" -#include "volumes.h" +#include "bio.h" #include "check-integrity.h" #include "locking.h" #include "rcu-string.h" diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 352bbb33b76f..5de73466b2ca 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -14,7 +14,7 @@ #include "ctree.h" #include "disk-io.h" #include "transaction.h" -#include "volumes.h" +#include "bio.h" #include "print-tree.h" #include "compression.h" #include "fs.h" diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4248e6cabbdc..905ea19df125 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -43,7 +43,7 @@ #include "ordered-data.h" #include "xattr.h" #include "tree-log.h" -#include "volumes.h" +#include "bio.h" #include "compression.h" #include "locking.h" #include "free-space-cache.h" diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index aa80e51bc8ca..31ec4a7658ce 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -35,6 +35,7 @@ #include "file-item.h" #include "relocation.h" #include "super.h" +#include "tree-checker.h" /* * Relocation overview diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index ea83dd9f735a..93f52ee85f6f 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -35,7 +35,7 @@ #include "print-tree.h" #include "props.h" #include "xattr.h" -#include "volumes.h" +#include "bio.h" #include "export.h" #include "compression.h" #include "rcu-string.h" diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 8f8d7e7dc3a8..4387cd2ade97 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -29,6 +29,7 @@ #include "file-item.h" #include
"file.h" #include "orphan.h" +#include "tree-checker.h" #define MAX_CONFLICT_INODES 10 diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e51fd5f1042a..acab20f2863d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -5,12 +5,9 @@ #include #include -#include #include -#include #include #include -#include #include #include #include @@ -23,8 +20,6 @@ #include "print-tree.h" #include "volumes.h" #include "raid56.h" -#include "async-thread.h" -#include "check-integrity.h" #include "rcu-string.h" #include "dev-replace.h" #include "sysfs.h" @@ -41,8 +36,6 @@ #include "scrub.h" #include "super.h" -static struct bio_set btrfs_bioset; - #define BTRFS_BLOCK_GROUP_STRIPE_MASK (BTRFS_BLOCK_GROUP_RAID0 | \ BTRFS_BLOCK_GROUP_RAID10 | \ BTRFS_BLOCK_GROUP_RAID56_MASK) @@ -255,11 +248,6 @@ out_overflow:; static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); -static int __btrfs_map_block(struct btrfs_fs_info *fs_info, - enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, - int *mirror_num_ret, int need_raid_map); /* * Device locking @@ -6364,11 +6352,11 @@ static void set_io_stripe(struct btrfs_io_stripe *dst, const struct map_lookup * stripe_offset + stripe_nr * map->stripe_len; } -static int __btrfs_map_block(struct btrfs_fs_info *fs_info, - enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_io_context **bioc_ret, - struct btrfs_io_stripe *smap, - int *mirror_num_ret, int need_raid_map) +int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map) { struct extent_map *em; struct map_lookup *map; @@ -6651,266 +6639,6 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, NULL, NULL, 1); } -/* - * Initialize a btrfs_bio structure. This skips the embedded bio itself as it - * is already initialized by the block layer. - */ -static inline void btrfs_bio_init(struct btrfs_bio *bbio, - btrfs_bio_end_io_t end_io, void *private) -{ - memset(bbio, 0, offsetof(struct btrfs_bio, bio)); - bbio->end_io = end_io; - bbio->private = private; -} - -/* - * Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for - * btrfs, and is used for all I/O submitted through btrfs_submit_bio. - * - * Just like the underlying bio_alloc_bioset it will not fail as it is backed by - * a mempool. 
- */ -struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - btrfs_bio_end_io_t end_io, void *private) -{ - struct bio *bio; - - bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset); - btrfs_bio_init(btrfs_bio(bio), end_io, private); - return bio; -} - -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private) -{ - struct bio *bio; - struct btrfs_bio *bbio; - - ASSERT(offset <= UINT_MAX && size <= UINT_MAX); - - bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); - bbio = btrfs_bio(bio); - btrfs_bio_init(bbio, end_io, private); - - bio_trim(bio, offset >> 9, size >> 9); - bbio->iter = bio->bi_iter; - return bio; -} - -static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) -{ - if (!dev || !dev->bdev) - return; - if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) - return; - - if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - if (!(bio->bi_opf & REQ_RAHEAD)) - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); - if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); -} - -static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info, - struct bio *bio) -{ - if (bio->bi_opf & REQ_META) - return fs_info->endio_meta_workers; - return fs_info->endio_workers; -} - -static void btrfs_end_bio_work(struct work_struct *work) -{ - struct btrfs_bio *bbio = - container_of(work, struct btrfs_bio, end_io_work); - - bbio->end_io(bbio); -} - -static void btrfs_simple_end_io(struct bio *bio) -{ - struct btrfs_fs_info *fs_info = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); - - btrfs_bio_counter_dec(fs_info); - - if (bio->bi_status) - btrfs_log_dev_io_error(bio, bbio->device); - - if (bio_op(bio) == REQ_OP_READ) { - INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); - queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work); - } else { - bbio->end_io(bbio); - } -} - -static void btrfs_raid56_end_io(struct bio *bio) -{ - struct btrfs_io_context *bioc = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); - - btrfs_bio_counter_dec(bioc->fs_info); - bbio->mirror_num = bioc->mirror_num; - bbio->end_io(bbio); - - btrfs_put_bioc(bioc); -} - -static void btrfs_orig_write_end_io(struct bio *bio) -{ - struct btrfs_io_stripe *stripe = bio->bi_private; - struct btrfs_io_context *bioc = stripe->bioc; - struct btrfs_bio *bbio = btrfs_bio(bio); - - btrfs_bio_counter_dec(bioc->fs_info); - - if (bio->bi_status) { - atomic_inc(&bioc->error); - btrfs_log_dev_io_error(bio, stripe->dev); - } - - /* - * Only send an error to the higher layers if it is beyond the tolerance - * threshold. 
- */ - if (atomic_read(&bioc->error) > bioc->max_errors) - bio->bi_status = BLK_STS_IOERR; - else - bio->bi_status = BLK_STS_OK; - - bbio->end_io(bbio); - btrfs_put_bioc(bioc); -} - -static void btrfs_clone_write_end_io(struct bio *bio) -{ - struct btrfs_io_stripe *stripe = bio->bi_private; - - if (bio->bi_status) { - atomic_inc(&stripe->bioc->error); - btrfs_log_dev_io_error(bio, stripe->dev); - } - - /* Pass on control to the original bio this one was cloned from */ - bio_endio(stripe->bioc->orig_bio); - bio_put(bio); -} - -static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio) -{ - if (!dev || !dev->bdev || - test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || - (btrfs_op(bio) == BTRFS_MAP_WRITE && - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { - bio_io_error(bio); - return; - } - - bio_set_dev(bio, dev->bdev); - - /* - * For zone append writing, bi_sector must point the beginning of the - * zone - */ - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT; - - if (btrfs_dev_is_sequential(dev, physical)) { - u64 zone_start = round_down(physical, - dev->fs_info->zone_size); - - bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT; - } else { - bio->bi_opf &= ~REQ_OP_ZONE_APPEND; - bio->bi_opf |= REQ_OP_WRITE; - } - } - btrfs_debug_in_rcu(dev->fs_info, - "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", - __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, - (unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev), - dev->devid, bio->bi_iter.bi_size); - - btrfsic_check_bio(bio); - submit_bio(bio); -} - -static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr) -{ - struct bio *orig_bio = bioc->orig_bio, *bio; - - ASSERT(bio_op(orig_bio) != REQ_OP_READ); - - /* Reuse the bio embedded into the btrfs_bio for the last mirror */ - if (dev_nr == bioc->num_stripes - 1) { - bio = orig_bio; - bio->bi_end_io = btrfs_orig_write_end_io; - } else { - bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set); - bio_inc_remaining(orig_bio); - bio->bi_end_io = btrfs_clone_write_end_io; - } - - bio->bi_private = &bioc->stripes[dev_nr]; - bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT; - bioc->stripes[dev_nr].bioc = bioc; - btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio); -} - -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) -{ - u64 logical = bio->bi_iter.bi_sector << 9; - u64 length = bio->bi_iter.bi_size; - u64 map_length = length; - struct btrfs_io_context *bioc = NULL; - struct btrfs_io_stripe smap; - int ret; - - btrfs_bio_counter_inc_blocked(fs_info); - ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num, 1); - if (ret) { - btrfs_bio_counter_dec(fs_info); - btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret)); - return; - } - - if (map_length < length) { - btrfs_crit(fs_info, - "mapping failed logical %llu bio len %llu len %llu", - logical, length, map_length); - BUG(); - } - - if (!bioc) { - /* Single mirror read/write fast path */ - btrfs_bio(bio)->mirror_num = mirror_num; - btrfs_bio(bio)->device = smap.dev; - bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - bio->bi_private = fs_info; - bio->bi_end_io = btrfs_simple_end_io; - btrfs_submit_dev_bio(smap.dev, bio); - } else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - /* Parity RAID write or read recovery */ - bio->bi_private = bioc; - bio->bi_end_io = btrfs_raid56_end_io; - if (bio_op(bio) == 
REQ_OP_READ) - raid56_parity_recover(bio, bioc, mirror_num); - else - raid56_parity_write(bio, bioc); - } else { - /* Write to multiple mirrors */ - int total_devs = bioc->num_stripes; - int dev_nr; - - bioc->orig_bio = bio; - for (dev_nr = 0; dev_nr < total_devs; dev_nr++) - btrfs_submit_mirrored_bio(bioc, dev_nr); - } -} - static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, const struct btrfs_fs_devices *fs_devices) { @@ -8440,17 +8168,3 @@ bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical) return true; } - -int __init btrfs_bioset_init(void) -{ - if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, - offsetof(struct btrfs_bio, bio), - BIOSET_NEED_BVECS)) - return -ENOMEM; - return 0; -} - -void __cold btrfs_bioset_exit(void) -{ - bioset_exit(&btrfs_bioset); -} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ab551471c1f8..6b7a05f6cf82 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -6,7 +6,6 @@ #ifndef BTRFS_VOLUMES_H #define BTRFS_VOLUMES_H -#include #include #include #include "async-thread.h" @@ -373,8 +372,6 @@ struct btrfs_fs_devices { enum btrfs_read_policy read_policy; }; -#define BTRFS_BIO_INLINE_CSUM_SIZE 64 - #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ - sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) @@ -384,107 +381,6 @@ struct btrfs_fs_devices { - 2 * sizeof(struct btrfs_chunk)) \ / sizeof(struct btrfs_stripe) + 1) -/* - * Maximum number of sectors for a single bio to limit the size of the - * checksum array. This matches the number of bio_vecs per bio and thus the - * I/O size for buffered I/O. - */ -#define BTRFS_MAX_BIO_SECTORS (256) - -typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio); - -/* - * Additional info to pass along bio. - * - * Mostly for btrfs specific features like csum and mirror_num. - */ -struct btrfs_bio { - unsigned int mirror_num:7; - - /* - * Extra indicator for metadata bios. - * For some btrfs bios they use pages without a mapping, thus - * we can not rely on page->mapping->host to determine if - * it's a metadata bio. - */ - unsigned int is_metadata:1; - struct bvec_iter iter; - - /* for direct I/O */ - u64 file_offset; - - /* @device is for stripe IO submission. */ - struct btrfs_device *device; - union { - /* For data checksum verification. */ - struct { - u8 *csum; - u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; - }; - - /* For metadata parentness verification. */ - struct btrfs_tree_parent_check parent_check; - }; - - /* End I/O information supplied to btrfs_bio_alloc */ - btrfs_bio_end_io_t end_io; - void *private; - - /* For read end I/O handling */ - struct work_struct end_io_work; - - /* - * This member must come last, bio_alloc_bioset will allocate enough - * bytes for entire btrfs_bio but relies on bio being last. 
- */ - struct bio bio; -}; - -static inline struct btrfs_bio *btrfs_bio(struct bio *bio) -{ - return container_of(bio, struct btrfs_bio, bio); -} - -int __init btrfs_bioset_init(void); -void __cold btrfs_bioset_exit(void); - -struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf, - btrfs_bio_end_io_t end_io, void *private); -struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size, - btrfs_bio_end_io_t end_io, void *private); - -static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) -{ - bbio->bio.bi_status = status; - bbio->end_io(bbio); -} - -static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) -{ - if (bbio->is_metadata) - return; - if (bbio->csum != bbio->csum_inline) { - kfree(bbio->csum); - bbio->csum = NULL; - } -} - -/* - * Iterate through a btrfs_bio (@bbio) on a per-sector basis. - * - * bvl - struct bio_vec - * bbio - struct btrfs_bio - * iters - struct bvec_iter - * bio_offset - unsigned int - */ -#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ - for ((iter) = (bbio)->iter, (bio_offset) = 0; \ - (iter).bi_size && \ - (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ - (bio_offset) += fs_info->sectorsize, \ - bio_advance_iter_single(&(bbio)->bio, &(iter), \ - (fs_info)->sectorsize)) - struct btrfs_io_stripe { struct btrfs_device *dev; union { @@ -641,6 +537,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret); +int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_io_context **bioc_ret, + struct btrfs_io_stripe *smap, int *mirror_num_ret, + int need_raid_map); struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes); @@ -652,7 +553,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_tree_free(struct extent_map_tree *tree); -void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, From bacf60e515862904dd91a332f3a54f092416eaa6 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Nov 2022 10:44:06 +0100 Subject: [PATCH 228/249] btrfs: move repair_io_failure to bio.c repair_io_failure ties directly into all the gory low-level details of mapping a bio with a logical address to the actual physical location. Move it right below btrfs_submit_bio to keep all the related logic together. Also move btrfs_repair_eb_io_failure to its caller in disk-io.c now that repair_io_failure is available in a header.
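As a quick illustration of the now-exported interface (this sketch is not part of the series; rewrite_bad_mirror() and its arguments are hypothetical), a caller that has read a good copy of a corrupted sector from another mirror can rewrite the bad mirror in place:

	/*
	 * Illustrative only: rewrite one bad sector using the good data in
	 * @good_page. Only btrfs_repair_io_failure() itself is real.
	 */
	static void rewrite_bad_mirror(struct btrfs_fs_info *fs_info, u64 ino,
				       u64 file_off, u64 logical,
				       struct page *good_page, int bad_mirror)
	{
		/* Synchronous write that touches only the one bad copy. */
		int ret = btrfs_repair_io_failure(fs_info, ino, file_off,
						  fs_info->sectorsize, logical,
						  good_page, 0, bad_mirror);

		if (ret)
			btrfs_warn(fs_info, "repair of mirror %d failed: %d",
				   bad_mirror, ret);
	}

Because the bio is submitted with submit_bio_wait(), the page is no longer referenced once the call returns.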
Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 90 +++++++++++++++++++++++++++++++++ fs/btrfs/disk-io.c | 24 +++++++++ fs/btrfs/extent_io.c | 117 +------------------------------------------ fs/btrfs/extent_io.h | 1 - 4 files changed, 116 insertions(+), 116 deletions(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 9e881dc91bae..b8fb7ef6b520 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -276,6 +276,96 @@ void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror } } +/* + * Submit a repair write. + * + * This bypasses btrfs_submit_bio deliberately, as that writes all copies in a + * RAID setup. Here we only want to write the one bad copy, so we do the + * mapping ourselves and submit the bio directly. + * + * The I/O is issued synchronously to block the repair read completion from + * freeing the bio. + */ +int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, + u64 length, u64 logical, struct page *page, + unsigned int pg_offset, int mirror_num) +{ + struct btrfs_device *dev; + struct bio_vec bvec; + struct bio bio; + u64 map_length = 0; + u64 sector; + struct btrfs_io_context *bioc = NULL; + int ret = 0; + + ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); + BUG_ON(!mirror_num); + + if (btrfs_repair_one_zone(fs_info, logical)) + return 0; + + map_length = length; + + /* + * Avoid races with device replace and make sure our bioc has devices + * associated to its stripes that don't go away while we are doing the + * read repair operation. + */ + btrfs_bio_counter_inc_blocked(fs_info); + if (btrfs_is_parity_mirror(fs_info, logical, length)) { + /* + * Note that we don't use BTRFS_MAP_WRITE because it's supposed + * to update all raid stripes, but here we just want to correct + * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad + * stripe's dev and sector. + */ + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, + &map_length, &bioc, 0); + if (ret) + goto out_counter_dec; + ASSERT(bioc->mirror_num == 1); + } else { + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, + &map_length, &bioc, mirror_num); + if (ret) + goto out_counter_dec; + BUG_ON(mirror_num != bioc->mirror_num); + } + + sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; + dev = bioc->stripes[bioc->mirror_num - 1].dev; + btrfs_put_bioc(bioc); + + if (!dev || !dev->bdev || + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { + ret = -EIO; + goto out_counter_dec; + } + + bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); + bio.bi_iter.bi_sector = sector; + __bio_add_page(&bio, page, length, pg_offset); + + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + if (ret) { + /* try to remap that extent elsewhere?
*/ + btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); + goto out_bio_uninit; + } + + btrfs_info_rl_in_rcu(fs_info, + "read error corrected: ino %llu off %llu (dev %s sector %llu)", + ino, start, btrfs_dev_name(dev), sector); + ret = 0; + +out_bio_uninit: + bio_uninit(&bio); +out_counter_dec: + btrfs_bio_counter_dec(fs_info); + return ret; +} + int __init btrfs_bioset_init(void) { if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d5be259845d1..0888d484df80 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -255,6 +255,30 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, return ret; } +static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, + int mirror_num) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + u64 start = eb->start; + int i, num_pages = num_extent_pages(eb); + int ret = 0; + + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + for (i = 0; i < num_pages; i++) { + struct page *p = eb->pages[i]; + + ret = btrfs_repair_io_failure(fs_info, 0, start, PAGE_SIZE, + start, p, start - page_offset(p), mirror_num); + if (ret) + break; + start += PAGE_SIZE; + } + + return ret; +} + /* * helper to read a given tree block, doing retries as required when * the checksums don't match and we have alternate mirrors to try. diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 95d54b5834c0..8528e7d3f38f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -531,119 +531,6 @@ static void free_io_failure(struct btrfs_inode *inode, kfree(rec); } -/* - * this bypasses the standard btrfs submit functions deliberately, as - * the standard behavior is to write all copies in a raid setup. here we only - * want to write the one bad copy. so we do the mapping for ourselves and issue - * submit_bio directly. - * to avoid any synchronization issues, wait for the data after writing, which - * actually prevents the read that triggered the error from finishing. - * currently, there can be no more than two copies of every data bit. thus, - * exactly one rewrite is required. - */ -static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct page *page, - unsigned int pg_offset, int mirror_num) -{ - struct btrfs_device *dev; - struct bio_vec bvec; - struct bio bio; - u64 map_length = 0; - u64 sector; - struct btrfs_io_context *bioc = NULL; - int ret = 0; - - ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); - BUG_ON(!mirror_num); - - if (btrfs_repair_one_zone(fs_info, logical)) - return 0; - - map_length = length; - - /* - * Avoid races with device replace and make sure our bioc has devices - * associated to its stripes that don't go away while we are doing the - * read repair operation. - */ - btrfs_bio_counter_inc_blocked(fs_info); - if (btrfs_is_parity_mirror(fs_info, logical, length)) { - /* - * Note that we don't use BTRFS_MAP_WRITE because it's supposed - * to update all raid stripes, but here we just want to correct - * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad - * stripe's dev and sector. 
- */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, - &map_length, &bioc, 0); - if (ret) - goto out_counter_dec; - ASSERT(bioc->mirror_num == 1); - } else { - ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, - &map_length, &bioc, mirror_num); - if (ret) - goto out_counter_dec; - BUG_ON(mirror_num != bioc->mirror_num); - } - - sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; - dev = bioc->stripes[bioc->mirror_num - 1].dev; - btrfs_put_bioc(bioc); - - if (!dev || !dev->bdev || - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { - ret = -EIO; - goto out_counter_dec; - } - - bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); - bio.bi_iter.bi_sector = sector; - __bio_add_page(&bio, page, length, pg_offset); - - btrfsic_check_bio(&bio); - ret = submit_bio_wait(&bio); - if (ret) { - /* try to remap that extent elsewhere? */ - btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - goto out_bio_uninit; - } - - btrfs_info_rl_in_rcu(fs_info, - "read error corrected: ino %llu off %llu (dev %s sector %llu)", - ino, start, btrfs_dev_name(dev), sector); - ret = 0; - -out_bio_uninit: - bio_uninit(&bio); -out_counter_dec: - btrfs_bio_counter_dec(fs_info); - return ret; -} - -int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - u64 start = eb->start; - int i, num_pages = num_extent_pages(eb); - int ret = 0; - - if (sb_rdonly(fs_info->sb)) - return -EROFS; - - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - - ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p, - start - page_offset(p), mirror_num); - if (ret) - break; - start += PAGE_SIZE; - } - - return ret; -} - static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) { if (cur_mirror == failrec->num_copies) @@ -691,7 +578,7 @@ int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start, mirror = failrec->this_mirror; do { mirror = prev_mirror(failrec, mirror); - repair_io_failure(fs_info, ino, start, failrec->len, + btrfs_repair_io_failure(fs_info, ino, start, failrec->len, failrec->logical, page, pg_offset, mirror); } while (mirror != failrec->failed_mirror); @@ -822,7 +709,7 @@ int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_ * * Since we're only doing repair for one sector, we only need to get * a good copy of the failed sector and if we succeed, we have setup - * everything for repair_io_failure to do the rest for us. + * everything for btrfs_repair_io_failure to do the rest for us. 
*/ failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); if (failrec->this_mirror == failrec->failed_mirror) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a0bafc7f6c07..9b486419854e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -245,7 +245,6 @@ int extent_invalidate_folio(struct extent_io_tree *tree, int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); -int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num); /* * When IO fails, either with EIO or csum verification fails, we From 1fe5ebc4e17dfbffeefe52abab75b02ad0f3a97e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:10 -0500 Subject: [PATCH 229/249] btrfs: move root helpers back into ctree.h These accidentally got brought into accessors.h, but belong with the btrfs_root definitions which are currently in ctree.h. Move these to make it easier to sync accessors.[ch] into btrfs-progs. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 17 ----------------- fs/btrfs/ctree.h | 17 +++++++++++++++++ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index cb59b69d2af1..57ba6894a5f4 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -712,23 +712,6 @@ BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item, otransid, 64); BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item, stransid, 64); BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item, rtransid, 64); -static inline bool btrfs_root_readonly(const struct btrfs_root *root) -{ - /* Byte-swap the constant at compile time, root_item::flags is LE */ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; -} - -static inline bool btrfs_root_dead(const struct btrfs_root *root) -{ - /* Byte-swap the constant at compile time, root_item::flags is LE */ - return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; -} - -static inline u64 btrfs_root_id(const struct btrfs_root *root) -{ - return root->root_key.objectid; -} - /* struct btrfs_root_backup */ BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup, tree_root, 64); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 99defab7e1f6..4e58b0532b36 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -335,6 +335,23 @@ struct btrfs_root { #endif }; +static inline bool btrfs_root_readonly(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_RDONLY)) != 0; +} + +static inline bool btrfs_root_dead(const struct btrfs_root *root) +{ + /* Byte-swap the constant at compile time, root_item::flags is LE */ + return (root->root_item.flags & cpu_to_le64(BTRFS_ROOT_SUBVOL_DEAD)) != 0; +} + +static inline u64 btrfs_root_id(const struct btrfs_root *root) +{ + return root->root_key.objectid; +} + /* * Structure that conveys information about an extent that is going to replace * all the extents in a file range. From 3a3178c7f7675661d7bd0c67f57420e20982bd34 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:11 -0500 Subject: [PATCH 230/249] btrfs: move leaf_data_end into ctree.c This is only used in ctree.c, with the exception of zero'ing out extent buffers we're getting ready to write out. 
In theory we shouldn't have an extent buffer with 0 items that we're writing out; however, I'd rather be safe than sorry, so open code it in extent_io.c and then copy the helper into ctree.c. This will make it easier to sync accessors.[ch] into btrfs-progs, as this requires a helper that isn't defined in accessors.h. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 13 ------------- fs/btrfs/ctree.c | 13 +++++++++++++ fs/btrfs/extent_io.c | 6 +++++- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 57ba6894a5f4..b9d9a69685df 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -897,19 +897,6 @@ const char *btrfs_super_csum_name(u16 csum_type); const char *btrfs_super_csum_driver(u16 csum_type); size_t __attribute_const__ btrfs_get_num_csums(void); -/* - * The leaf data grows from end-to-front in the node. this returns the address - * of the start of the last item, which is the stop of the leaf data stack. - */ -static inline unsigned int leaf_data_end(const struct extent_buffer *leaf) -{ - u32 nr = btrfs_header_nritems(leaf); - - if (nr == 0) - return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); - return btrfs_item_offset(leaf, nr - 1); -} - /* struct btrfs_file_extent_item */ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, type, 8); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index f75e398d7b71..dc38c24a0ffa 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -51,6 +51,19 @@ static const struct btrfs_csums { .driver = "blake2b-256" }, }; +/* + * The leaf data grows from end-to-front in the node. this returns the address + * of the start of the last item, which is the stop of the leaf data stack. + */ +static unsigned int leaf_data_end(const struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(leaf->fs_info); + return btrfs_item_offset(leaf, nr - 1); +} + int btrfs_super_csum_size(const struct btrfs_super_block *s) { u16 t = btrfs_super_csum_type(s); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 8528e7d3f38f..9fc9f8068069 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2537,7 +2537,11 @@ static void prepare_eb_write(struct extent_buffer *eb) * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 */ start = btrfs_item_nr_offset(nritems); - end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb); + end = BTRFS_LEAF_DATA_OFFSET; + if (nritems == 0) + end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); + else + end += btrfs_item_offset(eb, nritems - 1); memzero_extent_buffer(eb, start, end - start); } } From 6bfd0ffa6f2ae0ead92af7c4521626cd456115c5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:12 -0500 Subject: [PATCH 231/249] btrfs: move file_extent_item helpers into file-item.h These helpers use functions that are in multiple places, which makes it tricky to sync them into btrfs-progs. Move them to file-item.h and then include file-item.h in places that use these helpers.
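For context, a hedged sketch of how the moved helpers combine at a call site (read_inline_data() is a made-up name; only the btrfs_* helpers are real):

	/*
	 * Illustrative only: copy the on-disk payload of an inline file
	 * extent out of a leaf. @dst must be large enough for the item.
	 */
	static void read_inline_data(struct extent_buffer *leaf, int slot,
				     char *dst)
	{
		struct btrfs_file_extent_item *fi;
		u32 len;

		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
		/* On-disk size of the payload, compressed size if compressed. */
		len = btrfs_file_extent_inline_item_len(leaf, slot);
		read_extent_buffer(leaf, dst,
				   btrfs_file_extent_inline_start(fi), len);
	}

Note that for a compressed inline extent this yields the compressed bytes; decompression is a separate step.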
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 22 ---------------------- fs/btrfs/ctree.c | 1 + fs/btrfs/ctree.h | 8 -------- fs/btrfs/file-item.h | 33 +++++++++++++++++++++++++++++++++ fs/btrfs/tree-checker.c | 1 + 5 files changed, 35 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index b9d9a69685df..f0d017f9407f 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -915,16 +915,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_num_bytes, BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, struct btrfs_file_extent_item, compression, 8); -static inline unsigned long btrfs_file_extent_inline_start( - const struct btrfs_file_extent_item *e) -{ - return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - -static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) -{ - return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; -} BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, @@ -946,18 +936,6 @@ BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, other_encoding, 16); -/* - * Returns the number of bytes used by the item on disk, minus the size of any - * extent headers. If a file is compressed on disk, this is the compressed - * size. - */ -static inline u32 btrfs_file_extent_inline_item_len( - const struct extent_buffer *eb, - int nr) -{ - return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - /* btrfs_qgroup_status_item */ BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item, generation, 64); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index dc38c24a0ffa..e85b243be5a2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -22,6 +22,7 @@ #include "accessors.h" #include "extent-tree.h" #include "relocation.h" +#include "file-item.h" static struct kmem_cache *btrfs_path_cachep; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 4e58b0532b36..428f51efd455 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -461,14 +461,6 @@ static inline u32 BTRFS_NODEPTRS_PER_BLOCK(const struct btrfs_fs_info *info) return BTRFS_LEAF_DATA_SIZE(info) / sizeof(struct btrfs_key_ptr); } -#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ - (offsetof(struct btrfs_file_extent_item, disk_bytenr)) -static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info) -{ - return BTRFS_MAX_ITEM_SIZE(info) - - BTRFS_FILE_EXTENT_INLINE_DATA_START; -} - static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) { return BTRFS_MAX_ITEM_SIZE(info) - sizeof(struct btrfs_dir_item); diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 95b371208b30..031225668434 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -3,6 +3,39 @@ #ifndef BTRFS_FILE_ITEM_H #define BTRFS_FILE_ITEM_H +#include "accessors.h" + +#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ + (offsetof(struct btrfs_file_extent_item, disk_bytenr)) + +static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info) +{ + return BTRFS_MAX_ITEM_SIZE(info) - BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +/* + * Return the number of bytes used by the item on disk, minus the size of any + * extent headers. If a file is compressed on disk, this is the compressed + * size. 
+ */ +static inline u32 btrfs_file_extent_inline_item_len( + const struct extent_buffer *eb, + int nr) +{ + return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline unsigned long btrfs_file_extent_inline_start( + const struct btrfs_file_extent_item *e) +{ + return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; +} + int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 1c2d418dda6a..32e051101a27 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -28,6 +28,7 @@ #include "btrfs_inode.h" #include "fs.h" #include "accessors.h" +#include "file-item.h" /* * Error message should follow the following format: From 9b48addac406bd10afe447c2182b241dbde1d1a6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:13 -0500 Subject: [PATCH 232/249] btrfs: move eb offset helpers into extent_io.h These are very specific to how the extent buffer is defined, so this differs between btrfs-progs and the kernel. Make things easier by moving these helpers into extent_io.h so we don't have to worry about this when syncing ctree.h. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 33 --------------------------------- fs/btrfs/extent_io.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 428f51efd455..6169d3b28475 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -686,39 +686,6 @@ static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p) } int btrfs_leaf_free_space(struct extent_buffer *leaf); -/* - * Get the correct offset inside the page of extent buffer. - * - * @eb: target extent buffer - * @start: offset inside the extent buffer - * - * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. - */ -static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, - unsigned long offset) -{ - /* - * For sectorsize == PAGE_SIZE case, eb->start will always be aligned - * to PAGE_SIZE, thus adding it won't cause any difference. - * - * For sectorsize < PAGE_SIZE, we must only read the data that belongs - * to the eb, thus we have to take the eb->start into consideration. - */ - return offset_in_page(offset + eb->start); -} - -static inline unsigned long get_eb_page_index(unsigned long offset) -{ - /* - * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. - * - * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, - * and have ensured that all tree blocks are contained in one page, - * thus we always get index == 0. - */ - return offset >> PAGE_SHIFT; -} - static inline int is_fstree(u64 rootid) { if (rootid == BTRFS_FS_TREE_OBJECTID || diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 9b486419854e..a2c82448b2e0 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -95,6 +95,39 @@ struct extent_buffer { #endif }; +/* + * Get the correct offset inside the page of extent buffer. + * + * @eb: target extent buffer + * @start: offset inside the extent buffer + * + * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. 
+ */ +static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, + unsigned long offset) +{ + /* + * For sectorsize == PAGE_SIZE case, eb->start will always be aligned + * to PAGE_SIZE, thus adding it won't cause any difference. + * + * For sectorsize < PAGE_SIZE, we must only read the data that belongs + * to the eb, thus we have to take the eb->start into consideration. + */ + return offset_in_page(offset + eb->start); +} + +static inline unsigned long get_eb_page_index(unsigned long offset) +{ + /* + * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. + * + * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, + * and have ensured that all tree blocks are contained in one page, + * thus we always get index == 0. + */ + return offset >> PAGE_SHIFT; +} + /* * Structure to record how many bytes and which ranges are set/cleared */ From 0e6c40ebbb18b32cf4b5fcb3173eaede89b9f440 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:14 -0500 Subject: [PATCH 233/249] btrfs: move the csum helpers into ctree.h These got moved because of copy+paste, but this code exists in ctree.c, so move the declarations back into ctree.h. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 5 ----- fs/btrfs/ctree.h | 5 +++++ 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index f0d017f9407f..066a662f38c3 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -892,11 +892,6 @@ BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); -int btrfs_super_csum_size(const struct btrfs_super_block *s); -const char *btrfs_super_csum_name(u16 csum_type); -const char *btrfs_super_csum_driver(u16 csum_type); -size_t __attribute_const__ btrfs_get_num_csums(void); - /* struct btrfs_file_extent_item */ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, type, 8); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6169d3b28475..6965703a81b6 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -700,6 +700,11 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) return root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID; } +int btrfs_super_csum_size(const struct btrfs_super_block *s); +const char *btrfs_super_csum_name(u16 csum_type); +const char *btrfs_super_csum_driver(u16 csum_type); +size_t __attribute_const__ btrfs_get_num_csums(void); + /* * We use page status Private2 to indicate there is an ordered extent with * unfinished IO. From 42c9419a4c01910e9c46b0c2bb9090f76295bf01 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:15 -0500 Subject: [PATCH 234/249] btrfs: pass the extent buffer for the btrfs_item_nr helpers This is a change needed for extent tree v2: it already exists in btrfs-progs but not in the kernel, which makes it annoying to sync accessors.h with btrfs-progs. Since this is the form extent tree v2 needs anyway, update these helpers to take the extent buffer, both to make syncing possible now and to make the extent tree v2 work easier moving forward.
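The call-site conversion is mechanical. A hedged before/after sketch (the wrapper function is illustrative, not from the patch):

	/* Return the offset of slot @nr's struct btrfs_item inside @eb. */
	static unsigned long item_array_offset(const struct extent_buffer *eb,
					       int nr)
	{
		/* Before this patch: return btrfs_item_nr_offset(nr); */
		return btrfs_item_nr_offset(eb, nr);
	}

With the buffer passed in, a format whose header size differs (extent tree v2) only has to change the helper, not every call site.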
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 18 +++++++++--------- fs/btrfs/ctree.c | 35 ++++++++++++++++++----------------- fs/btrfs/extent_io.c | 2 +- fs/btrfs/tree-checker.c | 4 ++-- 4 files changed, 30 insertions(+), 29 deletions(-) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 066a662f38c3..2b4fb696142b 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -417,37 +417,37 @@ BTRFS_SETGET_FUNCS(raw_item_size, struct btrfs_item, size, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32); BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32); -static inline unsigned long btrfs_item_nr_offset(int nr) +static inline unsigned long btrfs_item_nr_offset(const struct extent_buffer *eb, int nr) { return offsetof(struct btrfs_leaf, items) + sizeof(struct btrfs_item) * nr; } -static inline struct btrfs_item *btrfs_item_nr(int nr) +static inline struct btrfs_item *btrfs_item_nr(const struct extent_buffer *eb, int nr) { - return (struct btrfs_item *)btrfs_item_nr_offset(nr); + return (struct btrfs_item *)btrfs_item_nr_offset(eb, nr); } #define BTRFS_ITEM_SETGET_FUNCS(member) \ static inline u32 btrfs_item_##member(const struct extent_buffer *eb, int slot) \ { \ - return btrfs_raw_item_##member(eb, btrfs_item_nr(slot)); \ + return btrfs_raw_item_##member(eb, btrfs_item_nr(eb, slot)); \ } \ static inline void btrfs_set_item_##member(const struct extent_buffer *eb, \ int slot, u32 val) \ { \ - btrfs_set_raw_item_##member(eb, btrfs_item_nr(slot), val); \ + btrfs_set_raw_item_##member(eb, btrfs_item_nr(eb, slot), val); \ } \ static inline u32 btrfs_token_item_##member(struct btrfs_map_token *token, \ int slot) \ { \ - struct btrfs_item *item = btrfs_item_nr(slot); \ + struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ return btrfs_token_raw_item_##member(token, item); \ } \ static inline void btrfs_set_token_item_##member(struct btrfs_map_token *token, \ int slot, u32 val) \ { \ - struct btrfs_item *item = btrfs_item_nr(slot); \ + struct btrfs_item *item = btrfs_item_nr(token->eb, slot); \ btrfs_set_token_raw_item_##member(token, item, val); \ } @@ -462,7 +462,7 @@ static inline u32 btrfs_item_data_end(const struct extent_buffer *eb, int nr) static inline void btrfs_item_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - struct btrfs_item *item = btrfs_item_nr(nr); + struct btrfs_item *item = btrfs_item_nr(eb, nr); read_eb_member(eb, item, struct btrfs_item, key, disk_key); } @@ -470,7 +470,7 @@ static inline void btrfs_item_key(const struct extent_buffer *eb, static inline void btrfs_set_item_key(struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - struct btrfs_item *item = btrfs_item_nr(nr); + struct btrfs_item *item = btrfs_item_nr(eb, nr); write_eb_member(eb, item, struct btrfs_item, key, disk_key); } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e85b243be5a2..cdc112d3bab2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3033,13 +3033,13 @@ static noinline int __push_leaf_right(struct btrfs_path *path, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), - btrfs_item_nr_offset(0), + memmove_extent_buffer(right, btrfs_item_nr_offset(right, push_items), + btrfs_item_nr_offset(right, 0), right_nritems * sizeof(struct btrfs_item)); /* copy the items from left to right */ - copy_extent_buffer(right, left, btrfs_item_nr_offset(0), - 
btrfs_item_nr_offset(left_nritems - push_items), + copy_extent_buffer(right, left, btrfs_item_nr_offset(right, 0), + btrfs_item_nr_offset(left, left_nritems - push_items), push_items * sizeof(struct btrfs_item)); /* update the item pointers */ @@ -3233,8 +3233,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, /* push data from right to left */ copy_extent_buffer(left, right, - btrfs_item_nr_offset(btrfs_header_nritems(left)), - btrfs_item_nr_offset(0), + btrfs_item_nr_offset(left, btrfs_header_nritems(left)), + btrfs_item_nr_offset(right, 0), push_items * sizeof(struct btrfs_item)); push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - @@ -3272,8 +3272,8 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, BTRFS_LEAF_DATA_OFFSET + leaf_data_end(right), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(push_items), + memmove_extent_buffer(right, btrfs_item_nr_offset(right, 0), + btrfs_item_nr_offset(left, push_items), (btrfs_header_nritems(right) - push_items) * sizeof(struct btrfs_item)); } @@ -3407,8 +3407,8 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(right, nritems); data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); - copy_extent_buffer(right, l, btrfs_item_nr_offset(0), - btrfs_item_nr_offset(mid), + copy_extent_buffer(right, l, btrfs_item_nr_offset(right, 0), + btrfs_item_nr_offset(l, mid), nritems * sizeof(struct btrfs_item)); copy_extent_buffer(right, l, @@ -3784,8 +3784,8 @@ static noinline int split_item(struct btrfs_path *path, nritems = btrfs_header_nritems(leaf); if (slot != nritems) { /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), - btrfs_item_nr_offset(slot), + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, slot + 1), + btrfs_item_nr_offset(leaf, slot), (nritems - slot) * sizeof(struct btrfs_item)); } @@ -4077,9 +4077,10 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p ioff - batch->total_data_size); } /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); + memmove_extent_buffer(leaf, + btrfs_item_nr_offset(leaf, slot + batch->nr), + btrfs_item_nr_offset(leaf, slot), + (nritems - slot) * sizeof(struct btrfs_item)); /* shift the data */ memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + @@ -4333,8 +4334,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_token_item_offset(&token, i, ioff + dsize); } - memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), - btrfs_item_nr_offset(slot + nr), + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, slot), + btrfs_item_nr_offset(leaf, slot + nr), sizeof(struct btrfs_item) * (nritems - slot - nr)); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9fc9f8068069..f1df13c5fc6f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2536,7 +2536,7 @@ static void prepare_eb_write(struct extent_buffer *eb) * Leaf: * header 0 1 2 .. N ... data_N .. 
data_2 data_1 data_0 */ - start = btrfs_item_nr_offset(nritems); + start = btrfs_item_nr_offset(eb, nritems); end = BTRFS_LEAF_DATA_OFFSET; if (nritems == 0) end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 32e051101a27..baad1ed7e111 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1784,10 +1784,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data) /* Also check if the item pointer overlaps with btrfs item. */ if (unlikely(btrfs_item_ptr_offset(leaf, slot) < - btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item))) { + btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item))) { generic_err(leaf, slot, "slot overlaps with its data, item end %lu data start %lu", - btrfs_item_nr_offset(slot) + + btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item), btrfs_item_ptr_offset(leaf, slot)); return -EUCLEAN; From e23efd8e8767165a6103cf0a4fe273f6b9f182f2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:16 -0500 Subject: [PATCH 235/249] btrfs: add eb to btrfs_node_key_ptr_offset This is a change needed for extent tree v2, as we will be growing the header size. This exists in btrfs-progs currently, and not having it makes syncing accessors.[ch] more problematic. So make this change to set us up for extent tree v2 and match what btrfs-progs does to make syncing easier. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.c | 2 +- fs/btrfs/accessors.h | 4 ++-- fs/btrfs/ctree.c | 28 ++++++++++++++-------------- fs/btrfs/extent_io.c | 2 +- fs/btrfs/tree-mod-log.c | 4 ++-- 5 files changed, 20 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c index 7a7b7d263102..206cf1612c1d 100644 --- a/fs/btrfs/accessors.c +++ b/fs/btrfs/accessors.c @@ -168,7 +168,7 @@ DEFINE_BTRFS_SETGET_BITS(64) void btrfs_node_key(const struct extent_buffer *eb, struct btrfs_disk_key *disk_key, int nr) { - unsigned long ptr = btrfs_node_key_ptr_offset(nr); + unsigned long ptr = btrfs_node_key_ptr_offset(eb, nr); read_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 2b4fb696142b..88eea44fdd7f 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -392,7 +392,7 @@ static inline void btrfs_set_node_ptr_generation(const struct extent_buffer *eb, btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); } -static inline unsigned long btrfs_node_key_ptr_offset(int nr) +static inline unsigned long btrfs_node_key_ptr_offset(const struct extent_buffer *eb, int nr) { return offsetof(struct btrfs_node, ptrs) + sizeof(struct btrfs_key_ptr) * nr; @@ -406,7 +406,7 @@ static inline void btrfs_set_node_key(const struct extent_buffer *eb, { unsigned long ptr; - ptr = btrfs_node_key_ptr_offset(nr); + ptr = btrfs_node_key_ptr_offset(eb, nr); write_eb_member(eb, (struct btrfs_key_ptr *)ptr, struct btrfs_key_ptr, key, disk_key); } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cdc112d3bab2..32facf8d7319 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2612,8 +2612,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, return ret; } copy_extent_buffer(dst, src, - btrfs_node_key_ptr_offset(dst_nritems), - btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(dst, dst_nritems), + btrfs_node_key_ptr_offset(src, 0), push_items * sizeof(struct btrfs_key_ptr)); if (push_items < src_nritems) { @@ -2621,8 +2621,8 @@ static 
int push_node_left(struct btrfs_trans_handle *trans, * Don't call btrfs_tree_mod_log_insert_move() here, key removal * was already fully logged by btrfs_tree_mod_log_eb_copy() above. */ - memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(push_items), + memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0), + btrfs_node_key_ptr_offset(src, push_items), (src_nritems - push_items) * sizeof(struct btrfs_key_ptr)); } @@ -2682,8 +2682,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, } ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); BUG_ON(ret < 0); - memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), - btrfs_node_key_ptr_offset(0), + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items), + btrfs_node_key_ptr_offset(dst, 0), (dst_nritems) * sizeof(struct btrfs_key_ptr)); @@ -2694,8 +2694,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans, return ret; } copy_extent_buffer(dst, src, - btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(src_nritems - push_items), + btrfs_node_key_ptr_offset(dst, 0), + btrfs_node_key_ptr_offset(src, src_nritems - push_items), push_items * sizeof(struct btrfs_key_ptr)); btrfs_set_header_nritems(src, src_nritems - push_items); @@ -2798,8 +2798,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans, BUG_ON(ret < 0); } memmove_extent_buffer(lower, - btrfs_node_key_ptr_offset(slot + 1), - btrfs_node_key_ptr_offset(slot), + btrfs_node_key_ptr_offset(lower, slot + 1), + btrfs_node_key_ptr_offset(lower, slot), (nritems - slot) * sizeof(struct btrfs_key_ptr)); } if (level) { @@ -2881,8 +2881,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, return ret; } copy_extent_buffer(split, c, - btrfs_node_key_ptr_offset(0), - btrfs_node_key_ptr_offset(mid), + btrfs_node_key_ptr_offset(split, 0), + btrfs_node_key_ptr_offset(c, mid), (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); btrfs_set_header_nritems(split, c_nritems - mid); btrfs_set_header_nritems(c, mid); @@ -4240,8 +4240,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, BUG_ON(ret < 0); } memmove_extent_buffer(parent, - btrfs_node_key_ptr_offset(slot), - btrfs_node_key_ptr_offset(slot + 1), + btrfs_node_key_ptr_offset(parent, slot), + btrfs_node_key_ptr_offset(parent, slot + 1), sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); } else if (level) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f1df13c5fc6f..f1d5a851968f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2529,7 +2529,7 @@ static void prepare_eb_write(struct extent_buffer *eb) /* Set btree blocks beyond nritems with 0 to avoid stale content */ nritems = btrfs_header_nritems(eb); if (btrfs_header_level(eb) > 0) { - end = btrfs_node_key_ptr_offset(nritems); + end = btrfs_node_key_ptr_offset(eb, nritems); memzero_extent_buffer(eb, end, eb->len - end); } else { /* diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 146a6b198933..a555baa0143a 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -697,8 +697,8 @@ static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, n--; break; case BTRFS_MOD_LOG_MOVE_KEYS: - o_dst = btrfs_node_key_ptr_offset(tm->slot); - o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); + o_dst = btrfs_node_key_ptr_offset(eb, tm->slot); + o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot); memmove_extent_buffer(eb, o_dst, o_src, tm->move.nr_items * p_size); break; From 
637e3b48c22e52daa645b49b73630af3f09328ae Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:17 -0500 Subject: [PATCH 236/249] btrfs: add helpers for manipulating leaf items and data We have some gnarly memmove and copy_extent_buffer calls for leaf manipulation. This is because our item offsets aren't absolute, they're based on 0 being where the items start in the leaf, which is after the btrfs_header. This means any manipulation of the data requires adding sizeof(struct btrfs_header) to the offsets we pull from the items. Moving the items themselves is easier as the helpers are absolute offsets, however we of course have to call the helpers to get the offsets for the item numbers. This makes for copy_extent_buffer/memmove_extent_buffer calls where it's kind of hard to reason about what's happening. Fix this by pushing this logic into helpers. For data we'll only use the item provided offsets, and the helpers will use the BTRFS_LEAF_DATA_OFFSET addition for the offsets. Additionally for the item manipulation simply pass in the item numbers, and then the helpers will call the offset helper to get the actual offset into the leaf. The diffstat makes this look like more code, but that's simply because I added comments for the helpers; it's a net negative for the amount of code, and is easier to reason about. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 181 ++++++++++++++++++++++++++++++----------------- 1 file changed, 115 insertions(+), 66 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 32facf8d7319..b5d203045dc7 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -65,6 +65,91 @@ static unsigned int leaf_data_end(const struct extent_buffer *leaf) return btrfs_item_offset(leaf, nr - 1); } +/* + * Move data in a @leaf (using memmove, safe for overlapping ranges). + * + * @leaf: leaf that we're doing a memmove on + * @dst_offset: item data offset we're moving to + * @src_offset: item data offset we're moving from + * @len: length of the data we're moving + * + * Wrapper around memmove_extent_buffer() that takes into account the header on + * the leaf. The btrfs_item offsets start directly after the header, so we + * have to adjust any offsets to account for the header in the leaf. This + * handles that math to simplify the callers. + */ +static inline void memmove_leaf_data(const struct extent_buffer *leaf, + unsigned long dst_offset, + unsigned long src_offset, + unsigned long len) +{ + memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + dst_offset, + BTRFS_LEAF_DATA_OFFSET + src_offset, len); +} + +/* + * Copy item data from @src into @dst at the given offsets. + * + * @dst: destination leaf that we're copying into + * @src: source leaf that we're copying from + * @dst_offset: item data offset we're copying to + * @src_offset: item data offset we're copying from + * @len: length of the data we're copying + * + * Wrapper around copy_extent_buffer() that takes into account the header on + * the leaf. The btrfs_item offsets start directly after the header, so we + * have to adjust any offsets to account for the header in the leaf. This + * handles that math to simplify the callers. + */ +static inline void copy_leaf_data(const struct extent_buffer *dst, + const struct extent_buffer *src, + unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + copy_extent_buffer(dst, src, BTRFS_LEAF_DATA_OFFSET + dst_offset, + BTRFS_LEAF_DATA_OFFSET + src_offset, len); +} + +/* + * Move items in a @leaf (using memmove).
+ * + * @leaf: leaf that we're moving the items in + * @dst_item: the item nr we're copying into + * @src_item: the item nr we're copying from + * @nr_items: the number of items to copy + * + * Wrapper around memmove_extent_buffer() that does the math to get the + * appropriate offsets into the leaf from the item numbers. + */ +static inline void memmove_leaf_items(const struct extent_buffer *leaf, + int dst_item, int src_item, int nr_items) +{ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item), + btrfs_item_nr_offset(leaf, src_item), + nr_items * sizeof(struct btrfs_item)); +} + +/* + * Copy items from @src into @dst at the given item numbers. + * + * @dst: destination leaf for the items + * @src: source leaf for the items + * @dst_item: the item nr we're copying into + * @src_item: the item nr we're copying from + * @nr_items: the number of items to copy + * + * Wrapper around copy_extent_buffer() that does the math to get the + * appropriate offsets into the leaf from the item numbers. + */ +static inline void copy_leaf_items(const struct extent_buffer *dst, + const struct extent_buffer *src, + int dst_item, int src_item, int nr_items) +{ + copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item), + btrfs_item_nr_offset(src, src_item), + nr_items * sizeof(struct btrfs_item)); +} + int btrfs_super_csum_size(const struct btrfs_super_block *s) { u16 t = btrfs_super_csum_type(s); @@ -3022,25 +3107,17 @@ static noinline int __push_leaf_right(struct btrfs_path *path, /* make room in the right data area */ data_end = leaf_data_end(right); - memmove_extent_buffer(right, - BTRFS_LEAF_DATA_OFFSET + data_end - push_space, - BTRFS_LEAF_DATA_OFFSET + data_end, - BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); + memmove_leaf_data(right, data_end - push_space, data_end, + BTRFS_LEAF_DATA_SIZE(fs_info) - data_end); /* copy from the left data area */ - copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET + - BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left), - push_space); + copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, + leaf_data_end(left), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(right, push_items), - btrfs_item_nr_offset(right, 0), - right_nritems * sizeof(struct btrfs_item)); + memmove_leaf_items(right, push_items, 0, right_nritems); /* copy the items from left to right */ - copy_extent_buffer(right, left, btrfs_item_nr_offset(right, 0), - btrfs_item_nr_offset(left, left_nritems - push_items), - push_items * sizeof(struct btrfs_item)); + copy_leaf_items(right, left, 0, left_nritems - push_items, push_items); /* update the item pointers */ btrfs_init_map_token(&token, right); @@ -3232,19 +3309,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, WARN_ON(!empty && push_items == btrfs_header_nritems(right)); /* push data from right to left */ - copy_extent_buffer(left, right, - btrfs_item_nr_offset(left, btrfs_header_nritems(left)), - btrfs_item_nr_offset(right, 0), - push_items * sizeof(struct btrfs_item)); + copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items); push_space = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_offset(right, push_items - 1); - copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(left) - push_space, - BTRFS_LEAF_DATA_OFFSET + - btrfs_item_offset(right, push_items - 1), - push_space); + copy_leaf_data(left, right, leaf_data_end(left) - push_space, + btrfs_item_offset(right, push_items - 1), push_space);
old_left_nritems = btrfs_header_nritems(left); BUG_ON(old_left_nritems <= 0); @@ -3267,15 +3338,12 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size, if (push_items < right_nritems) { push_space = btrfs_item_offset(right, push_items - 1) - leaf_data_end(right); - memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET + - BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, - BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(right), push_space); + memmove_leaf_data(right, + BTRFS_LEAF_DATA_SIZE(fs_info) - push_space, + leaf_data_end(right), push_space); - memmove_extent_buffer(right, btrfs_item_nr_offset(right, 0), - btrfs_item_nr_offset(left, push_items), - (btrfs_header_nritems(right) - push_items) * - sizeof(struct btrfs_item)); + memmove_leaf_items(right, 0, push_items, + btrfs_header_nritems(right) - push_items); } btrfs_init_map_token(&token, right); @@ -3407,14 +3475,10 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans, btrfs_set_header_nritems(right, nritems); data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l); - copy_extent_buffer(right, l, btrfs_item_nr_offset(right, 0), - btrfs_item_nr_offset(l, mid), - nritems * sizeof(struct btrfs_item)); + copy_leaf_items(right, l, 0, mid, nritems); - copy_extent_buffer(right, l, - BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) - - data_copy_size, BTRFS_LEAF_DATA_OFFSET + - leaf_data_end(l), data_copy_size); + copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size, + leaf_data_end(l), data_copy_size); rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid); @@ -3784,9 +3848,7 @@ static noinline int split_item(struct btrfs_path *path, nritems = btrfs_header_nritems(leaf); if (slot != nritems) { /* shift the items */ - memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, slot + 1), - btrfs_item_nr_offset(leaf, slot), - (nritems - slot) * sizeof(struct btrfs_item)); + memmove_leaf_items(leaf, slot + 1, slot, nritems - slot); } btrfs_cpu_key_to_disk(&disk_key, new_key); @@ -3897,9 +3959,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) /* shift the data */ if (from_end) { - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data_start + new_size - data_end); + memmove_leaf_data(leaf, data_end + size_diff, data_end, + old_data_start + new_size - data_end); } else { struct btrfs_disk_key disk_key; u64 offset; @@ -3924,9 +3985,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end) } } - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end + size_diff, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data_start - data_end); + memmove_leaf_data(leaf, data_end + size_diff, data_end, + old_data_start - data_end); offset = btrfs_disk_key_offset(&disk_key); btrfs_set_disk_key_offset(&disk_key, offset + size_diff); @@ -3991,9 +4051,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) } /* shift the data */ - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end - data_size, BTRFS_LEAF_DATA_OFFSET + - data_end, old_data - data_end); + memmove_leaf_data(leaf, data_end - data_size, data_end, + old_data - data_end); data_end = old_data; old_size = btrfs_item_size(leaf, slot); @@ -4077,16 +4136,11 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p ioff - batch->total_data_size); } /* shift the items */ - memmove_extent_buffer(leaf, - btrfs_item_nr_offset(leaf, slot + batch->nr), - 
btrfs_item_nr_offset(leaf, slot), - (nritems - slot) * sizeof(struct btrfs_item)); + memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot); /* shift the data */ - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end - batch->total_data_size, - BTRFS_LEAF_DATA_OFFSET + data_end, - old_data - data_end); + memmove_leaf_data(leaf, data_end - batch->total_data_size, + data_end, old_data - data_end); data_end = old_data; } @@ -4321,10 +4375,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, for (i = 0; i < nr; i++) dsize += btrfs_item_size(leaf, slot + i); - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + - data_end + dsize, - BTRFS_LEAF_DATA_OFFSET + data_end, - last_off - data_end); + memmove_leaf_data(leaf, data_end + dsize, data_end, + last_off - data_end); btrfs_init_map_token(&token, leaf); for (i = slot + nr; i < nritems; i++) { @@ -4334,10 +4386,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_token_item_offset(&token, i, ioff + dsize); } - memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, slot), - btrfs_item_nr_offset(leaf, slot + nr), - sizeof(struct btrfs_item) * - (nritems - slot - nr)); + memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr); } btrfs_set_header_nritems(leaf, nritems - nr); nritems -= nr; From 8009adf306452e9b43db36f2d02fedfe5eca1b5e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:18 -0500 Subject: [PATCH 237/249] btrfs: remove BTRFS_LEAF_DATA_OFFSET This is simply the same thing as btrfs_item_nr_offset(leaf, 0), so remove this helper and replace its usage with the above statement. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 6 ++---- fs/btrfs/ctree.c | 8 ++++---- fs/btrfs/extent_io.c | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 88eea44fdd7f..e6228ff73c81 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -9,8 +9,6 @@ struct btrfs_map_token { unsigned long offset; }; -#define BTRFS_LEAF_DATA_OFFSET offsetof(struct btrfs_leaf, items) - void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb); /* @@ -1028,9 +1026,9 @@ BTRFS_SETGET_STACK_FUNCS(stack_verity_descriptor_size, /* Cast into the data area of the leaf.
*/ #define btrfs_item_ptr(leaf, slot, type) \ - ((type *)(BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset(leaf, slot))) + ((type *)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) #define btrfs_item_ptr_offset(leaf, slot) \ - ((unsigned long)(BTRFS_LEAF_DATA_OFFSET + btrfs_item_offset(leaf, slot))) + ((unsigned long)(btrfs_item_nr_offset(leaf, 0) + btrfs_item_offset(leaf, slot))) #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index b5d203045dc7..76b99bcc849d 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -83,8 +83,8 @@ static inline void memmove_leaf_data(const struct extent_buffer *leaf, unsigned long src_offset, unsigned long len) { - memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + dst_offset, - BTRFS_LEAF_DATA_OFFSET + src_offset, len); + memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset, + btrfs_item_nr_offset(leaf, 0) + src_offset, len); } /* @@ -106,8 +106,8 @@ static inline void copy_leaf_data(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - copy_extent_buffer(dst, src, BTRFS_LEAF_DATA_OFFSET + dst_offset, - BTRFS_LEAF_DATA_OFFSET + src_offset, len); + copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset, + btrfs_item_nr_offset(src, 0) + src_offset, len); } /* diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f1d5a851968f..83dd3aa59663 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2537,7 +2537,7 @@ static void prepare_eb_write(struct extent_buffer *eb) * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0 */ start = btrfs_item_nr_offset(eb, nritems); - end = BTRFS_LEAF_DATA_OFFSET; + end = btrfs_item_nr_offset(eb, 0); if (nritems == 0) end += BTRFS_LEAF_DATA_SIZE(eb->fs_info); else From 0c7030038e6106711c5d0b237c980905dd3244ec Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:19 -0500 Subject: [PATCH 238/249] btrfs: add nr_global_roots to the super block definition We already have this defined in btrfs-progs, add it to the kernel to make it easier to sync these files into btrfs-progs. 
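For readers following along, here is a simplified sketch of roughly what the stack accessor pair generated by BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block, nr_global_roots, 64) expands to. This is illustrative only; the real macro in accessors.h uses the unaligned little-endian helpers rather than plain le64_to_cpu()/cpu_to_le64():

static inline u64 btrfs_super_nr_global_roots(const struct btrfs_super_block *s)
{
	/* Stack accessors operate on an in-memory copy of the on-disk
	 * structure, converting from little-endian disk byte order. */
	return le64_to_cpu(s->nr_global_roots);
}

static inline void btrfs_set_super_nr_global_roots(struct btrfs_super_block *s,
						   u64 val)
{
	s->nr_global_roots = cpu_to_le64(val);
}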
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 2 ++ include/uapi/linux/btrfs_tree.h | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index e6228ff73c81..75c181b579eb 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -889,6 +889,8 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_nr_global_roots, struct btrfs_super_block, + nr_global_roots, 64); /* struct btrfs_file_extent_item */ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_type, struct btrfs_file_extent_item, diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 29895ffa470d..ab38d0f411fa 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -688,8 +688,9 @@ struct btrfs_super_block { /* The UUID written into btree blocks */ __u8 metadata_uuid[BTRFS_FSID_SIZE]; + __u64 nr_global_roots; + /* Future expansion */ - __u8 reserved8[8]; __le64 reserved[27]; __u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; From 054056bd0a8da2747e51646a6ac5af6ffbae5b69 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 15 Nov 2022 11:16:20 -0500 Subject: [PATCH 239/249] btrfs: add stack helpers for a few btrfs items We don't have these defined in the kernel because we don't have any users of these helpers. However we do use them in btrfs-progs, so define them to make keeping accessors.h in sync between progs and the kernel easier. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/accessors.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 75c181b579eb..ceadfc5d6c66 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -221,12 +221,26 @@ static inline u64 btrfs_stripe_offset_nr(const struct extent_buffer *eb, return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); } +static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val); +} + static inline u64 btrfs_stripe_devid_nr(const struct extent_buffer *eb, struct btrfs_chunk *c, int nr) { return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); } +static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val); +} + /* struct btrfs_block_group_item */ BTRFS_SETGET_STACK_FUNCS(stack_block_group_used, struct btrfs_block_group_item, used, 64); @@ -248,6 +262,8 @@ BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); /* struct btrfs_inode_ref */ BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); +BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_inode_ref_index, struct btrfs_inode_ref, index, 64); /* struct btrfs_inode_extref */ BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref, @@ -297,6 +313,14 @@ BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, 
BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, chunk_offset, 64); BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_tree, struct btrfs_dev_extent, + chunk_tree, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_STACK_FUNCS(stack_dev_extent_length, struct btrfs_dev_extent, length, 64); + BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64); BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item, generation, 64); BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64); @@ -479,6 +503,9 @@ BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_STACK_FUNCS(stack_root_ref_name_len, struct btrfs_root_ref, name_len, 16); /* struct btrfs_dir_item */ BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); @@ -972,6 +999,16 @@ BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item, rsv_rfer, 64); BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item, rsv_excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_flags, + struct btrfs_qgroup_limit_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_rfer, + struct btrfs_qgroup_limit_item, max_rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_max_excl, + struct btrfs_qgroup_limit_item, max_excl, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_rfer, + struct btrfs_qgroup_limit_item, rsv_rfer, 64); +BTRFS_SETGET_STACK_FUNCS(stack_qgroup_limit_rsv_excl, + struct btrfs_qgroup_limit_item, rsv_excl, 64); /* btrfs_dev_replace_item */ BTRFS_SETGET_FUNCS(dev_replace_src_devid, From a4c853af0c511d7e0f7cb306bbc8a4f1dbdb64ca Mon Sep 17 00:00:00 2001 From: ChenXiaoSong Date: Wed, 16 Nov 2022 22:23:53 +0800 Subject: [PATCH 240/249] btrfs: add might_sleep() annotations Add annotations to functions that might sleep due to allocations or IO and could be called from various contexts. 
In case of btrfs_search_slot it's not obvious why it would sleep: btrfs_search_slot setup_nodes_for_search reada_for_balance btrfs_readahead_node_child btrfs_readahead_tree_block btrfs_find_create_tree_block alloc_extent_buffer kmem_cache_zalloc /* allocate memory non-atomically, might sleep */ kmem_cache_alloc(GFP_NOFS|__GFP_NOFAIL|__GFP_ZERO) read_extent_buffer_pages submit_extent_page /* disk IO, might sleep */ submit_one_bio Another example is update_qgroup_limit_item(), which might sleep in three places, one of which is shown below: update_qgroup_limit_item btrfs_alloc_path /* allocate memory non-atomically, might sleep */ kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS) Signed-off-by: ChenXiaoSong Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 76b99bcc849d..4754c9101a4c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -184,6 +184,8 @@ size_t __attribute_const__ btrfs_get_num_csums(void) struct btrfs_path *btrfs_alloc_path(void) { + might_sleep(); + return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); } @@ -2046,6 +2048,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, int min_write_lock_level; int prev_cmp; + might_sleep(); + lowest_level = p->lowest_level; WARN_ON(lowest_level && ins_len > 0); WARN_ON(p->nodes[0] != NULL); From d7c9e1be2876f63fb2178a24e0c1d5733ff98d47 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Nov 2022 15:06:09 -0500 Subject: [PATCH 241/249] btrfs: fix uninitialized parent in insert_state I don't know how this isn't caught when we build this in the kernel, but while syncing extent-io-tree.c into btrfs-progs I got an error because parent could potentially be uninitialized when we link in a new node, specifically when the extent_io_tree is empty. This means we could have garbage in the parent color. I don't know what the ramifications are of that, but it's probably not great, so fix this by initializing parent to NULL. I spot checked all of our other usages in btrfs and we appear to be doing the correct thing everywhere else. Fixes: c7e118cf98c7 ("btrfs: open code rbtree search in insert_state") CC: stable@vger.kernel.org # 6.0+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 21fa15123af8..82ca6a11e11a 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -395,7 +395,7 @@ static int insert_state(struct extent_io_tree *tree, u32 bits, struct extent_changeset *changeset) { struct rb_node **node; - struct rb_node *parent; + struct rb_node *parent = NULL; const u64 end = state->end; set_state_bits(tree, state, bits, changeset); From 26df39a9e5a8985674e814f0b27b25e8b4eb9ba7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Nov 2022 15:09:42 -0500 Subject: [PATCH 242/249] btrfs: fix uninitialized variable in find_first_clear_extent_bit This was caught when syncing extent-io-tree.c into btrfs-progs. This however isn't really a problem: the only way next could be uninitialized is if we found the range we were looking for, and in that case we don't care about next. However it's a compile error, so fix it up.
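The insert_state() fix above follows the classic open-coded rbtree insertion pattern; here is a minimal generic sketch (not btrfs code; the less() comparator is a hypothetical placeholder) of why parent must start as NULL:

/* If the tree is empty the descent loop never runs, so without the
 * "= NULL" rb_link_node() would consume an uninitialized pointer. */
static void insert_sketch(struct rb_root *root, struct rb_node *new)
{
	struct rb_node **link = &root->rb_node;
	struct rb_node *parent = NULL;	/* the fix: covers the empty-tree case */

	while (*link) {
		parent = *link;
		link = less(new, parent) ? &parent->rb_left : &parent->rb_right;
	}
	rb_link_node(new, parent, link);	/* parent is read here */
	rb_insert_color(new, root);
}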
Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 82ca6a11e11a..9ae9cd1e7035 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -1425,7 +1425,7 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; - struct extent_state *prev = NULL, *next; + struct extent_state *prev = NULL, *next = NULL; spin_lock(&tree->lock); From 63d5429f68a3d4c4aa27e65a05196c17f86c41d6 Mon Sep 17 00:00:00 2001 From: Artem Chernyshev Date: Sat, 19 Nov 2022 11:13:29 +0300 Subject: [PATCH 243/249] btrfs: replace strncpy() with strscpy() Using strncpy() on NUL-terminated strings is deprecated. To avoid possibly forming a non-terminated string, strscpy() should be used. Found by Linux Verification Center (linuxtesting.org) with SVACE. CC: stable@vger.kernel.org # 4.9+ Signed-off-by: Artem Chernyshev Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 9 +++------ fs/btrfs/rcu-string.h | 6 +++++- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index bed74a3ff574..4fd6b61b06a4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2859,13 +2859,10 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, di_args->bytes_used = btrfs_device_get_bytes_used(dev); di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); - if (dev->name) { - strncpy(di_args->path, btrfs_dev_name(dev), - sizeof(di_args->path) - 1); - di_args->path[sizeof(di_args->path) - 1] = 0; - } else { + if (dev->name) + strscpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path)); + else di_args->path[0] = '\0'; - } out: rcu_read_unlock(); diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h index 5c1a617eb25d..5c2b66d155ef 100644 --- a/fs/btrfs/rcu-string.h +++ b/fs/btrfs/rcu-string.h @@ -18,7 +18,11 @@ static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask) (len * sizeof(char)), mask); if (!ret) return ret; - strncpy(ret->str, src, len); + /* Warn if the source got unexpectedly truncated. */ + if (WARN_ON(strscpy(ret->str, src, len) < 0)) { + kfree(ret); + return NULL; + } return ret; } From 3a8d1db341b93e65fa76051ab833b3640842b6eb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Nov 2022 10:23:23 +0000 Subject: [PATCH 244/249] btrfs: unify overwrite_item() and do_overwrite_item() After commit 193df6245704 ("btrfs: search for last logged dir index if it's not cached in the inode"), there are no more callers of do_overwrite_item() except overwrite_item(). Originally both used to be the same function, but they were split in commit 086dcbfa50d3 ("btrfs: insert items in batches when logging a directory when possible"), as there was a need to execute all the logic of overwrite_item() but skip the tree search, since in the context of directory logging we already had a path with a leaf to copy data from. So unify them again as there is no more need to have them split.
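With the two functions unified, overwrite_item() always performs the tree search itself; a sketch of the btrfs_search_slot() return convention it relies on (error handling trimmed; the two handlers are hypothetical placeholders, not btrfs functions):

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)			/* error */
		return ret;
	if (ret == 0)			/* exact key found, path->slots[0] points at it */
		update_existing_item(path);	/* hypothetical */
	else				/* ret > 0: not found, slot is the insert position */
		insert_new_item(path);		/* hypothetical */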
Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 76 ++++++++++++++------------------------------- 1 file changed, 24 insertions(+), 52 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4387cd2ade97..bcc8717f51fd 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -365,11 +365,25 @@ static int process_one_buffer(struct btrfs_root *log, return ret; } -static int do_overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) +/* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. + */ +static int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) { int ret; u32 item_size; @@ -386,22 +400,10 @@ static int do_overwrite_item(struct btrfs_trans_handle *trans, item_size = btrfs_item_size(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); - /* Our caller must have done a search for the key for us. */ - ASSERT(path->nodes[0] != NULL); - - /* - * And the slot must point to the exact key or the slot where the key - * should be at (the first item with a key greater than 'key') - */ - if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) { - struct btrfs_key found_key; - - btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - ret = btrfs_comp_cpu_keys(&found_key, key); - ASSERT(ret >= 0); - } else { - ret = 1; - } + /* Look for the key in the destination tree. */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + return ret; if (ret == 0) { char *src_copy; @@ -579,36 +581,6 @@ no_copy: return 0; } -/* - * Item overwrite used by replay and tree logging. eb, slot and key all refer - * to the src data we are copying out. - * - * root is the tree we are copying into, and path is a scratch - * path for use in this function (it should be released on entry and - * will be released on exit). - * - * If the key is already in the destination tree the existing item is - * overwritten. If the existing item isn't big enough, it is extended. - * If it is too large, it is truncated. - * - * If the key isn't in the destination yet, a new item is inserted. - */ -static int overwrite_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct extent_buffer *eb, int slot, - struct btrfs_key *key) -{ - int ret; - - /* Look for the key in the destination tree. 
*/ - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - return ret; - - return do_overwrite_item(trans, root, path, eb, slot, key); -} - static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, struct fscrypt_str *name) { @@ -5406,7 +5378,7 @@ struct btrfs_dir_list { * has a size that doesn't match the sum of the lengths of all the logged * names - this is ok, not a problem, because at log replay time we set the * directory's i_size to the correct value (see replay_one_name() and - * do_overwrite_item()). + * overwrite_item()). */ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, struct btrfs_inode *start_inode, From 3eb423442483d454937cdc8853e58c399ffe5514 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Nov 2022 10:23:24 +0000 Subject: [PATCH 245/249] btrfs: remove outdated logic from overwrite_item() and add assertion As of commit 193df6245704 ("btrfs: search for last logged dir index if it's not cached in the inode"), the overwrite_item() function is always called for a root that is from a fs/subvolume tree. In other words, now it's only used during log replay to modify a fs/subvolume tree. Therefore we can remove the logic that checks if we are dealing with a log tree at overwrite_item(). So remove that logic, replacing it with an assertion and document that if we ever need to support a log root there, we will need to clone the leaf from the fs/subvolume tree and then release it before modifying the log tree, which is needed to avoid a potential deadlock, similar to the one recently fixed by a patch with the subject: "btrfs: do not modify log tree while holding a leaf from fs tree locked" Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index bcc8717f51fd..a3c43f0b1c95 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -391,11 +391,16 @@ static int overwrite_item(struct btrfs_trans_handle *trans, int save_old_i_size = 0; unsigned long src_ptr; unsigned long dst_ptr; - int overwrite_root = 0; bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; - if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) - overwrite_root = 1; + /* + * This is only used during log replay, so the root is always from a + * fs/subvolume tree. In case we ever need to support a log root, then + * we'll have to clone the leaf in the path, release the path and use + * the leaf before writing into the log tree. See the comments at + * copy_items() for more details. + */ + ASSERT(root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID); item_size = btrfs_item_size(eb, slot); src_ptr = btrfs_item_ptr_offset(eb, slot); @@ -548,8 +553,7 @@ insert: goto no_copy; } - if (overwrite_root && - S_ISDIR(btrfs_inode_mode(eb, src_item)) && + if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { save_old_i_size = 1; saved_i_size = btrfs_inode_size(path->nodes[0], From 1742e1c90c3da344f3bb9b1f1309b3f47482756a Mon Sep 17 00:00:00 2001 From: void0red Date: Wed, 23 Nov 2022 22:39:45 +0800 Subject: [PATCH 246/249] btrfs: fix extent map use-after-free when handling missing device in read_one_chunk Store the error code before freeing the extent_map. Though it's a reference-counted structure, in that function it's the first and last allocation, so this would lead to a potential use-after-free, as the sketch below illustrates.
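A minimal sketch of this bug class with a generic kref-counted object (hypothetical names, not btrfs code): anything still needed from the object must be copied out before what may be the final reference drop.

struct widget {
	struct kref ref;
	int err;
};

static void widget_release(struct kref *ref)
{
	kfree(container_of(ref, struct widget, ref));
}

static int widget_fail(struct widget *w)
{
	int err = w->err;		/* read while a reference is still held */

	kref_put(&w->ref, widget_release);	/* may free *w */
	return err;			/* safe: *w is not touched after the put */
}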
The error can happen e.g. when a chunk is stored on a missing device and the degraded mount option is missing. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216721 Reported-by: eriri <1527030098@qq.com> Fixes: adfb69af7d8c ("btrfs: add_missing_dev() should return the actual error") CC: stable@vger.kernel.org # 4.9+ Signed-off-by: void0red Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index acab20f2863d..aa25fa335d3e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6976,8 +6976,9 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->stripes[i].dev = handle_missing_device(fs_info, devid, uuid); if (IS_ERR(map->stripes[i].dev)) { + ret = PTR_ERR(map->stripes[i].dev); free_extent_map(em); - return PTR_ERR(map->stripes[i].dev); + return ret; } } From 162d053e15fe985f754ef495a96eb3db970c43ed Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 28 Nov 2022 15:07:30 +0000 Subject: [PATCH 247/249] btrfs: do not BUG_ON() on ENOMEM when dropping extent items for a range If we get -ENOMEM while dropping file extent items in a given range, at btrfs_drop_extents(), due to failure to allocate memory when attempting to increment or drop the reference count for an extent, we handle it with a BUG_ON(). This is excessive; instead we can simply abort the transaction and return the error to the caller. In fact most callers of btrfs_drop_extents(), directly or indirectly, already abort the transaction if btrfs_drop_extents() returns any error. Also, we already have error paths at btrfs_drop_extents() that may return -ENOMEM and in those cases we abort the transaction, like for example anything that changes the b+tree may return -ENOMEM due to a failure to allocate a new extent buffer when COWing an existing extent buffer, such as a call to btrfs_duplicate_item() for example. So replace the BUG_ON() calls with proper logic to abort the transaction and return the error. Reported-by: syzbot+0b1fb6b0108c27419f9f@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/00000000000089773e05ee4b9cb4@google.com/ CC: stable@vger.kernel.org # 5.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 448b143a5cb2..91b00eb2440e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -380,7 +380,10 @@ next_slot: args->start - extent_offset, 0, false); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } } key.offset = args->start; } @@ -467,7 +470,10 @@ delete_extent_item: key.offset - extent_offset, 0, false); ret = btrfs_free_extent(trans, &ref); - BUG_ON(ret); /* -ENOMEM */ + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } args->bytes_found += extent_end - key.offset; } From a28135303a669917002f569aecebd5758263e4aa Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 28 Nov 2022 14:26:38 -0500 Subject: [PATCH 248/249] btrfs: sync some cleanups from progs into uapi/btrfs.h When syncing this code into btrfs-progs Dave noticed there are some things we were losing in the sync that are needed.
This syncs those changes into the kernel, which include a few comments that weren't in the kernel, some whitespace changes, an attribute, and the cplusplus bit. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- include/uapi/linux/btrfs.h | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index 5655e89b962b..b4f0f9531119 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -19,8 +19,14 @@ #ifndef _UAPI_LINUX_BTRFS_H #define _UAPI_LINUX_BTRFS_H + +#ifdef __cplusplus +extern "C" { +#endif + #include <linux/types.h> #include <linux/ioctl.h> +#include <linux/fs.h> #define BTRFS_IOCTL_MAGIC 0x94 #define BTRFS_VOL_NAME_MAX 255 @@ -333,6 +339,12 @@ struct btrfs_ioctl_feature_flags { */ struct btrfs_balance_args { __u64 profiles; + + /* + * usage filter + * BTRFS_BALANCE_ARGS_USAGE with a single value means '0..N' + * BTRFS_BALANCE_ARGS_USAGE_RANGE - range syntax, min..max + */ union { __u64 usage; struct { @@ -549,7 +561,7 @@ struct btrfs_ioctl_search_header { __u64 offset; __u32 type; __u32 len; -}; +} __attribute__ ((__may_alias__)); #define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key)) /* @@ -562,6 +574,10 @@ struct btrfs_ioctl_search_args { char buf[BTRFS_SEARCH_ARGS_BUFSIZE]; }; +/* + * Extended version of TREE_SEARCH ioctl that can return more than 4k of bytes. + * The allocated size of the buffer is set in buf_size. + */ struct btrfs_ioctl_search_args_v2 { struct btrfs_ioctl_search_key key; /* in/out - search parameters */ __u64 buf_size; /* in - size of buffer @@ -570,10 +586,11 @@ struct btrfs_ioctl_search_args_v2 { __u64 buf[]; /* out - found items */ }; +/* With a @src_length of zero, the range from @src_offset->EOF is cloned! */ struct btrfs_ioctl_clone_range_args { - __s64 src_fd; - __u64 src_offset, src_length; - __u64 dest_offset; + __s64 src_fd; + __u64 src_offset, src_length; + __u64 dest_offset; }; /* @@ -677,8 +694,11 @@ struct btrfs_ioctl_logical_ino_args { /* struct btrfs_data_container *inodes; out */ __u64 inodes; }; -/* Return every ref to the extent, not just those containing logical block. - * Requires logical == extent bytenr. */ + +/* + * Return every ref to the extent, not just those containing logical block. + * Requires logical == extent bytenr. + */ #define BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET (1ULL << 0) enum btrfs_dev_stat_values { @@ -1144,4 +1164,8 @@ enum btrfs_err_code { #define BTRFS_IOC_ENCODED_WRITE _IOW(BTRFS_IOCTL_MAGIC, 64, \ struct btrfs_ioctl_encoded_io_args) +#ifdef __cplusplus +} +#endif + #endif /* _UAPI_LINUX_BTRFS_H */ From b7af0635c87ff78d6bd523298ab7471f9ffd3ce5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 30 Nov 2022 15:51:20 +0000 Subject: [PATCH 249/249] btrfs: print transaction aborted messages with an error level Currently we print the transaction aborted message with a debug level, but a transaction abort is an exceptional event that indicates something went wrong and it's useful to have it printed with an error level as it helps analysing problems in a production environment, where debug level messages are typically not logged. For example reports from syzbot never include the transaction aborted message, since the log level on the test machines is above the debug level. So change the log level from debug to error.
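To illustrate the practical difference, a hedged sketch using plain printk (not the btrfs message macros themselves): the default console loglevel on most systems filters out KERN_DEBUG messages but keeps KERN_ERR ones.

	printk(KERN_DEBUG "btrfs: dropped unless the loglevel is raised\n");
	printk(KERN_ERR "btrfs: visible in dmesg/syslog at default settings\n");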
Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/messages.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 5ac410e313e4..190af1f698d9 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -197,13 +197,13 @@ do { \ &((trans)->fs_info->fs_state))) { \ first = true; \ if (WARN(abort_should_print_stack(errno), \ - KERN_DEBUG \ + KERN_ERR \ "BTRFS: Transaction aborted (error %d)\n", \ (errno))) { \ /* Stack trace printed. */ \ } else { \ - btrfs_debug((trans)->fs_info, \ - "Transaction aborted (error %d)", \ + btrfs_err((trans)->fs_info, \ + "Transaction aborted (error %d)", \ (errno)); \ } \ } \