for-6.3-rc4-tag

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmQptV0ACgkQxWXV+ddt
 WDuZ/g/8CAu7WKhj/aLsYB/xRcOcloeoUZXMhb6NUxZC14ZHrSc9rWMPF7S8T4qK
 PwoNfhROdox+laAYX2WcOgo6yZ4Rhd+yDdyqLgQIbc0q3cWfOJ/vzSkeREdNCvNW
 qTicdB59Mka0YT+BOC9em29bsxHLpEMKmg1o5tao8LCdc17jPFyPN6BYgxFfeenQ
 aetKUyosqllEBxlpJHaLG1+gKZrI2VaCyhrCEw66Mbtri5WbwN3cTJOXqNSkySDB
 JKEs3y4yMo3Xiz+UhCaq614EzX1SR15n/WP7ZvjxvlXXJ0iHp4f11zSlUnm2u+jI
 JN5lkfBorSRMowgnLWGDn5zQDKXJOk1aAWv5YgqTqpWKg6X/fHxTdt4wdCSZ08m9
 dwVWqWN2BD7jS0UT45IPsniwGI9bkLRcNUFNgbFtRD9X52U2ie/PSv9qdz9gsDLW
 5FSXv65gD+kWdkpyw7NLRtXO1FPe6wfPm5ZqecEChIQmWUiisOnJwjKlewQUdRsy
 zki4wRGxiqKgSlrxrCLs24r9291EwjR9FcBTZLrYRNbCBf32xIGG2CUhPBapx4kB
 xgMHCn5NdP/cHPxqzQNeq8z8NI4F648qr6Z2KS03rmWZv9/1xsB39NFS4qLjrOM7
 YqpNDtCGVG5HpMWzardbcZ2FdoKj+o1qCCW851y8tDCdimPhSfk=
 =v7ZW
 -----END PGP SIGNATURE-----

Merge tag 'for-6.3-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:

 - scan block devices in non-exclusive mode to avoid temporary mkfs
   failures

 - fix race between quota disable and quota assign ioctls

 - fix deadlock when aborting transaction during relocation with scrub

 - ignore fiemap path cache when there are multiple paths for a node

* tag 'for-6.3-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: ignore fiemap path cache when there are multiple paths for a node
  btrfs: fix deadlock when aborting transaction during relocation with scrub
  btrfs: scan device in non-exclusive mode
  btrfs: fix race between quota disable and quota assign ioctls
This commit is contained in:
Linus Torvalds 2023-04-02 10:57:12 -07:00
commit 6ab608fe85
5 changed files with 107 additions and 26 deletions

View File

@ -1921,8 +1921,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
level = -1;
ULIST_ITER_INIT(&uiter);
while (1) {
bool is_shared;
bool cached;
const unsigned long prev_ref_count = ctx->refs.nnodes;
walk_ctx.bytenr = bytenr;
ret = find_parent_nodes(&walk_ctx, &shared);
@ -1940,21 +1939,36 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
ret = 0;
/*
* If our data extent was not directly shared (without multiple
* reference items), then it might have a single reference item
* with a count > 1 for the same offset, which means there are 2
* (or more) file extent items that point to the data extent -
* this happens when a file extent item needs to be split and
* then one item gets moved to another leaf due to a b+tree leaf
* split when inserting some item. In this case the file extent
* items may be located in different leaves and therefore some
* of the leaves may be referenced through shared subtrees while
* others are not. Since our extent buffer cache only works for
* a single path (by far the most common case and simpler to
* deal with), we can not use it if we have multiple leaves
* (which implies multiple paths).
* More than one extent buffer (bytenr) may have been added to
* the ctx->refs ulist, in which case we have to check multiple
* tree paths in case the first one is not shared, so we can not
* use the path cache which is made for a single path. Multiple
* extent buffers at the current level happen when:
*
* 1) level -1, the data extent: If our data extent was not
* directly shared (without multiple reference items), then
* it might have a single reference item with a count > 1 for
* the same offset, which means there are 2 (or more) file
* extent items that point to the data extent - this happens
* when a file extent item needs to be split and then one
* item gets moved to another leaf due to a b+tree leaf split
* when inserting some item. In this case the file extent
* items may be located in different leaves and therefore
* some of the leaves may be referenced through shared
* subtrees while others are not. Since our extent buffer
* cache only works for a single path (by far the most common
* case and simpler to deal with), we can not use it if we
* have multiple leaves (which implies multiple paths).
*
* 2) level >= 0, a tree node/leaf: We can have a mix of direct
* and indirect references on a b+tree node/leaf, so we have
* to check multiple paths, and the extent buffer (the
* current bytenr) may be shared or not. One example is
* during relocation as we may get a shared tree block ref
* (direct ref) and a non-shared tree block ref (indirect
* ref) for the same node/leaf.
*/
if (level == -1 && ctx->refs.nnodes > 1)
if ((ctx->refs.nnodes - prev_ref_count) > 1)
ctx->use_path_cache = false;
if (level >= 0)
@ -1964,18 +1978,45 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
if (!node)
break;
bytenr = node->val;
level++;
cached = lookup_backref_shared_cache(ctx, root, bytenr, level,
&is_shared);
if (cached) {
ret = (is_shared ? 1 : 0);
break;
if (ctx->use_path_cache) {
bool is_shared;
bool cached;
level++;
cached = lookup_backref_shared_cache(ctx, root, bytenr,
level, &is_shared);
if (cached) {
ret = (is_shared ? 1 : 0);
break;
}
}
shared.share_count = 0;
shared.have_delayed_delete_refs = false;
cond_resched();
}
/*
* If the path cache is disabled, then it means at some tree level we
* got multiple parents due to a mix of direct and indirect backrefs or
* multiple leaves with file extent items pointing to the same data
* extent. We have to invalidate the cache and cache only the sharedness
* result for the levels where we got only one node/reference.
*/
if (!ctx->use_path_cache) {
int i = 0;
level--;
if (ret >= 0 && level >= 0) {
bytenr = ctx->path_cache_entries[level].bytenr;
ctx->use_path_cache = true;
store_backref_shared_cache(ctx, root, bytenr, level, ret);
i = level + 1;
}
for ( ; i < BTRFS_MAX_LEVEL; i++)
ctx->path_cache_entries[i].bytenr = 0;
}
/*
* Cache the sharedness result for the data extent if we know our inode
* has more than 1 file extent item that refers to the data extent.

View File

@ -3732,7 +3732,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
}
/* update qgroup status and info */
mutex_lock(&fs_info->qgroup_ioctl_lock);
err = btrfs_run_qgroups(trans);
mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (err < 0)
btrfs_handle_fs_error(fs_info, err,
"failed to update qgroup status and info");

View File

@ -2828,13 +2828,22 @@ cleanup:
}
/*
* called from commit_transaction. Writes all changed qgroups to disk.
* Writes all changed qgroups to disk.
* Called by the transaction commit path and the qgroup assign ioctl.
*/
int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret = 0;
/*
* In case we are called from the qgroup assign ioctl, assert that we
* are holding the qgroup_ioctl_lock, otherwise we can race with a quota
* disable operation (ioctl) and access a freed quota root.
*/
if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
lockdep_assert_held(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
return ret;

View File

@ -2035,7 +2035,20 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
if (current->journal_info == trans)
current->journal_info = NULL;
btrfs_scrub_cancel(fs_info);
/*
* If relocation is running, we can't cancel scrub because that will
* result in a deadlock. Before relocating a block group, relocation
* pauses scrub, then starts and commits a transaction before unpausing
* scrub. If the transaction commit is being done by the relocation
* task or triggered by another task and the relocation task is waiting
* for the commit, and we end up here due to an error in the commit
* path, then calling btrfs_scrub_cancel() will deadlock, as we are
* asking for scrub to stop while having it asked to be paused higher
* above in relocation code.
*/
if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
btrfs_scrub_cancel(fs_info);
kmem_cache_free(btrfs_trans_handle_cachep, trans);
}

View File

@ -1366,8 +1366,17 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
* So, we need to add a special mount option to scan for
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
flags |= FMODE_EXCL;
/*
* Avoid using flags |= FMODE_EXCL here, as systemd-udev may
* initiate the device scan which may race with the user's mount
* or mkfs command, resulting in failure.
* Since the device scan is solely for reading purposes, there is
* no need for FMODE_EXCL. Additionally, the devices are read again
* during the mount process. It is ok to get some inconsistent
* values temporarily, as the device paths of the fsid are the only
* required information for assembling the volume.
*/
bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
@ -3266,8 +3275,15 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
btrfs_scrub_pause(fs_info);
ret = btrfs_relocate_block_group(fs_info, chunk_offset);
btrfs_scrub_continue(fs_info);
if (ret)
if (ret) {
/*
* If we had a transaction abort, stop all running scrubs.
* See transaction.c:cleanup_transaction() for why we do it here.
*/
if (BTRFS_FS_ERROR(fs_info))
btrfs_scrub_cancel(fs_info);
return ret;
}
block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
if (!block_group)