for-6.1-rc6-tag
-----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmOBCJEACgkQxWXV+ddt WDu5Nw/+P59ARfAm/4HRId4iL6UKozSMc+blWLeP9KkjcytdAfek0oGe3gZ7NJVK 8VYa93yNneCTkNFLIEpqEduGQjN04dr0odRUXD/kIR8EEtjbgDrH9ZmL47An5wVH qE8ILlh2+DXk/QLTpjo8n4mm+MJDJYzfz/jVV9vl8ehMahjj1M0/KmO/vNvDbP2s owWU1FBjX7TV6kHa+SQGqd1HfXS1YUx203I4SDmPj8vSXtysvSOWClT3HO6i6O5S MSS3Me+rx9eMFMISNghL8I466+lPlGxK14DmLUE4l0kfoKyd4eHQw+ft76D6Twuz JqjegAGA1nzqDO0XDXb4WPjrPKG8r8Ven2eInF3kncku9GyeEafL+L+nmj7PHsE7 dixWo2TQ9z1Wm/n1NWlU02ZSLdbetUtYTvZczUhevtNzuYUtILihcFZO3+Cp7V4p R2WwJ5XXdfS8g8Q9kJCOuVd9fZ+3hQvEF1IwWCP9ZZfmIC6/4/uGGFB6TJu7HmZC trpQYn9l5aP9L9Uq8t+9j+XoDEzQW0tZGpiYI9ypAa5Q5xbw3Ez2JNTbF7YVqQE2 iFDwuuy/X1iNvifniQgdodKVQLK/PcNrlcNb/gPG6cGCWjlTj3SKT9SlrwAgSDZW pFWFb9NtN3ORjLeCiONo/ZGpZzM9/XQplub+4WuXQXGNJasRIoE= =Q4JA -----END PGP SIGNATURE----- Merge tag 'for-6.1-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux Pull btrfs fixes from David Sterba: - fix a regression in nowait + buffered write - in zoned mode fix endianness when comparing super block generation - locking and lockdep fixes: - fix potential sleeping under spinlock when setting qgroup limit - lockdep warning fixes when btrfs_path is freed after copy_to_user - do not modify log tree while holding a leaf from fs tree locked - fix freeing of sysfs files of static features on error - use kv.alloc for zone map allocation as a fallback to avoid warnings due to high order allocation - send, avoid unaligned encoded writes when attempting to clone range * tag 'for-6.1-rc6-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: btrfs: sysfs: normalize the error handling branch in btrfs_init_sysfs() btrfs: do not modify log tree while holding a leaf from fs tree locked btrfs: use kvcalloc in btrfs_get_dev_zone_info btrfs: qgroup: fix sleep from invalid context bug in btrfs_qgroup_inherit() btrfs: send: avoid unaligned encoded writes when attempting to clone range btrfs: zoned: fix missing endianness 
conversion in sb_write_pointer btrfs: free btrfs_path before copying subvol info to userspace btrfs: free btrfs_path before copying fspath to userspace btrfs: free btrfs_path before copying inodes to userspace btrfs: free btrfs_path before copying root refs to userspace btrfs: fix assertion failure and blocking during nowait buffered write
This commit is contained in:
commit
3eaea0db25
|
@ -4663,7 +4663,12 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
|
|||
int ret;
|
||||
int i;
|
||||
|
||||
ASSERT(!path->nowait);
|
||||
/*
|
||||
* The nowait semantics are used only for write paths, where we don't
|
||||
* use the tree mod log and sequence numbers.
|
||||
*/
|
||||
if (time_seq)
|
||||
ASSERT(!path->nowait);
|
||||
|
||||
nritems = btrfs_header_nritems(path->nodes[0]);
|
||||
if (nritems == 0)
|
||||
|
@ -4683,7 +4688,14 @@ again:
|
|||
if (path->need_commit_sem) {
|
||||
path->need_commit_sem = 0;
|
||||
need_commit_sem = true;
|
||||
down_read(&fs_info->commit_root_sem);
|
||||
if (path->nowait) {
|
||||
if (!down_read_trylock(&fs_info->commit_root_sem)) {
|
||||
ret = -EAGAIN;
|
||||
goto done;
|
||||
}
|
||||
} else {
|
||||
down_read(&fs_info->commit_root_sem);
|
||||
}
|
||||
}
|
||||
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
||||
}
|
||||
|
@ -4759,7 +4771,7 @@ again:
|
|||
next = c;
|
||||
ret = read_block_for_search(root, path, &next, level,
|
||||
slot, &key);
|
||||
if (ret == -EAGAIN)
|
||||
if (ret == -EAGAIN && !path->nowait)
|
||||
goto again;
|
||||
|
||||
if (ret < 0) {
|
||||
|
@ -4769,6 +4781,10 @@ again:
|
|||
|
||||
if (!path->skip_locking) {
|
||||
ret = btrfs_try_tree_read_lock(next);
|
||||
if (!ret && path->nowait) {
|
||||
ret = -EAGAIN;
|
||||
goto done;
|
||||
}
|
||||
if (!ret && time_seq) {
|
||||
/*
|
||||
* If we don't get the lock, we may be racing
|
||||
|
@ -4799,7 +4815,7 @@ again:
|
|||
|
||||
ret = read_block_for_search(root, path, &next, level,
|
||||
0, &key);
|
||||
if (ret == -EAGAIN)
|
||||
if (ret == -EAGAIN && !path->nowait)
|
||||
goto again;
|
||||
|
||||
if (ret < 0) {
|
||||
|
@ -4807,8 +4823,16 @@ again:
|
|||
goto done;
|
||||
}
|
||||
|
||||
if (!path->skip_locking)
|
||||
btrfs_tree_read_lock(next);
|
||||
if (!path->skip_locking) {
|
||||
if (path->nowait) {
|
||||
if (!btrfs_try_tree_read_lock(next)) {
|
||||
ret = -EAGAIN;
|
||||
goto done;
|
||||
}
|
||||
} else {
|
||||
btrfs_tree_read_lock(next);
|
||||
}
|
||||
}
|
||||
}
|
||||
ret = 0;
|
||||
done:
|
||||
|
|
|
@ -3105,6 +3105,8 @@ static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
|
|||
}
|
||||
}
|
||||
|
||||
btrfs_free_path(path);
|
||||
path = NULL;
|
||||
if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
|
||||
ret = -EFAULT;
|
||||
|
||||
|
@ -3194,6 +3196,8 @@ static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
|
|||
}
|
||||
|
||||
out:
|
||||
btrfs_free_path(path);
|
||||
|
||||
if (!ret || ret == -EOVERFLOW) {
|
||||
rootrefs->num_items = found;
|
||||
/* update min_treeid for next search */
|
||||
|
@ -3205,7 +3209,6 @@ out:
|
|||
}
|
||||
|
||||
kfree(rootrefs);
|
||||
btrfs_free_path(path);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -4231,6 +4234,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
|
|||
ipath->fspath->val[i] = rel_ptr;
|
||||
}
|
||||
|
||||
btrfs_free_path(path);
|
||||
path = NULL;
|
||||
ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
|
||||
ipath->fspath, size);
|
||||
if (ret) {
|
||||
|
@ -4281,21 +4286,20 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
|
|||
size = min_t(u32, loi->size, SZ_16M);
|
||||
}
|
||||
|
||||
inodes = init_data_container(size);
|
||||
if (IS_ERR(inodes)) {
|
||||
ret = PTR_ERR(inodes);
|
||||
goto out_loi;
|
||||
}
|
||||
|
||||
path = btrfs_alloc_path();
|
||||
if (!path) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
|
||||
inodes = init_data_container(size);
|
||||
if (IS_ERR(inodes)) {
|
||||
ret = PTR_ERR(inodes);
|
||||
inodes = NULL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
|
||||
inodes, ignore_offset);
|
||||
btrfs_free_path(path);
|
||||
if (ret == -EINVAL)
|
||||
ret = -ENOENT;
|
||||
if (ret < 0)
|
||||
|
@ -4307,7 +4311,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
|
|||
ret = -EFAULT;
|
||||
|
||||
out:
|
||||
btrfs_free_path(path);
|
||||
kvfree(inodes);
|
||||
out_loi:
|
||||
kfree(loi);
|
||||
|
|
|
@ -2951,14 +2951,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
|
|||
dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
|
||||
dstgroup->rsv_excl = inherit->lim.rsv_excl;
|
||||
|
||||
ret = update_qgroup_limit_item(trans, dstgroup);
|
||||
if (ret) {
|
||||
qgroup_mark_inconsistent(fs_info);
|
||||
btrfs_info(fs_info,
|
||||
"unable to update quota limit for %llu",
|
||||
dstgroup->qgroupid);
|
||||
goto unlock;
|
||||
}
|
||||
qgroup_dirty(fs_info, dstgroup);
|
||||
}
|
||||
|
||||
if (srcid) {
|
||||
|
|
|
@ -5702,6 +5702,7 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
|
|||
u64 ext_len;
|
||||
u64 clone_len;
|
||||
u64 clone_data_offset;
|
||||
bool crossed_src_i_size = false;
|
||||
|
||||
if (slot >= btrfs_header_nritems(leaf)) {
|
||||
ret = btrfs_next_leaf(clone_root->root, path);
|
||||
|
@ -5759,8 +5760,10 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
|
|||
if (key.offset >= clone_src_i_size)
|
||||
break;
|
||||
|
||||
if (key.offset + ext_len > clone_src_i_size)
|
||||
if (key.offset + ext_len > clone_src_i_size) {
|
||||
ext_len = clone_src_i_size - key.offset;
|
||||
crossed_src_i_size = true;
|
||||
}
|
||||
|
||||
clone_data_offset = btrfs_file_extent_offset(leaf, ei);
|
||||
if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
|
||||
|
@ -5821,6 +5824,25 @@ static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
|
|||
ret = send_clone(sctx, offset, clone_len,
|
||||
clone_root);
|
||||
}
|
||||
} else if (crossed_src_i_size && clone_len < len) {
|
||||
/*
|
||||
* If we are at i_size of the clone source inode and we
|
||||
* can not clone from it, terminate the loop. This is
|
||||
* to avoid sending two write operations, one with a
|
||||
* length matching clone_len and the final one after
|
||||
* this loop with a length of len - clone_len.
|
||||
*
|
||||
* When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
|
||||
* was passed to the send ioctl), this helps avoid
|
||||
* sending an encoded write for an offset that is not
|
||||
* sector size aligned, in case the i_size of the source
|
||||
* inode is not sector size aligned. That will make the
|
||||
* receiver fall back to decompression of the data and
|
||||
* writing it using regular buffered IO, therefore while
|
||||
* not incorrect, it's not optimal due to decompression and
|
||||
* possible re-compression at the receiver.
|
||||
*/
|
||||
break;
|
||||
} else {
|
||||
ret = send_extent_data(sctx, dst_path, offset,
|
||||
clone_len);
|
||||
|
|
|
@ -2321,8 +2321,11 @@ int __init btrfs_init_sysfs(void)
|
|||
|
||||
#ifdef CONFIG_BTRFS_DEBUG
|
||||
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group);
|
||||
if (ret)
|
||||
goto out2;
|
||||
if (ret) {
|
||||
sysfs_unmerge_group(&btrfs_kset->kobj,
|
||||
&btrfs_static_feature_attr_group);
|
||||
goto out_remove_group;
|
||||
}
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -3694,15 +3694,29 @@ static int process_dir_items_leaf(struct btrfs_trans_handle *trans,
|
|||
u64 *last_old_dentry_offset)
|
||||
{
|
||||
struct btrfs_root *log = inode->root->log_root;
|
||||
struct extent_buffer *src = path->nodes[0];
|
||||
const int nritems = btrfs_header_nritems(src);
|
||||
struct extent_buffer *src;
|
||||
const int nritems = btrfs_header_nritems(path->nodes[0]);
|
||||
const u64 ino = btrfs_ino(inode);
|
||||
bool last_found = false;
|
||||
int batch_start = 0;
|
||||
int batch_size = 0;
|
||||
int i;
|
||||
|
||||
for (i = path->slots[0]; i < nritems; i++) {
|
||||
/*
|
||||
* We need to clone the leaf, release the read lock on it, and use the
|
||||
* clone before modifying the log tree. See the comment at copy_items()
|
||||
* about why we need to do this.
|
||||
*/
|
||||
src = btrfs_clone_extent_buffer(path->nodes[0]);
|
||||
if (!src)
|
||||
return -ENOMEM;
|
||||
|
||||
i = path->slots[0];
|
||||
btrfs_release_path(path);
|
||||
path->nodes[0] = src;
|
||||
path->slots[0] = i;
|
||||
|
||||
for (; i < nritems; i++) {
|
||||
struct btrfs_dir_item *di;
|
||||
struct btrfs_key key;
|
||||
int ret;
|
||||
|
@ -4303,7 +4317,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
|
|||
{
|
||||
struct btrfs_root *log = inode->root->log_root;
|
||||
struct btrfs_file_extent_item *extent;
|
||||
struct extent_buffer *src = src_path->nodes[0];
|
||||
struct extent_buffer *src;
|
||||
int ret = 0;
|
||||
struct btrfs_key *ins_keys;
|
||||
u32 *ins_sizes;
|
||||
|
@ -4314,6 +4328,43 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
|
|||
const bool skip_csum = (inode->flags & BTRFS_INODE_NODATASUM);
|
||||
const u64 i_size = i_size_read(&inode->vfs_inode);
|
||||
|
||||
/*
|
||||
* To keep lockdep happy and avoid deadlocks, clone the source leaf and
|
||||
* use the clone. This is because otherwise we would be changing the log
|
||||
* tree, to insert items from the subvolume tree or insert csum items,
|
||||
* while holding a read lock on a leaf from the subvolume tree, which
|
||||
* creates a nasty lock dependency when COWing log tree nodes/leaves:
|
||||
*
|
||||
* 1) Modifying the log tree triggers an extent buffer allocation while
|
||||
* holding a write lock on a parent extent buffer from the log tree.
|
||||
* Allocating the pages for an extent buffer, or the extent buffer
|
||||
* struct, can trigger inode eviction and finally the inode eviction
|
||||
* will trigger a release/remove of a delayed node, which requires
|
||||
* taking the delayed node's mutex;
|
||||
*
|
||||
* 2) Allocating a metadata extent for a log tree can trigger the async
|
||||
* reclaim thread and make us wait for it to release enough space and
|
||||
* unblock our reservation ticket. The reclaim thread can start
|
||||
* flushing delayed items, and that in turn results in the need to
|
||||
* lock delayed node mutexes and in the need to write lock extent
|
||||
* buffers of a subvolume tree - all this while holding a write lock
|
||||
* on the parent extent buffer in the log tree.
|
||||
*
|
||||
* So one task in scenario 1) running in parallel with another task in
|
||||
* scenario 2) could lead to a deadlock, one wanting to lock a delayed
|
||||
* node mutex while having a read lock on a leaf from the subvolume,
|
||||
* while the other is holding the delayed node's mutex and wants to
|
||||
* write lock the same subvolume leaf for flushing delayed items.
|
||||
*/
|
||||
src = btrfs_clone_extent_buffer(src_path->nodes[0]);
|
||||
if (!src)
|
||||
return -ENOMEM;
|
||||
|
||||
i = src_path->slots[0];
|
||||
btrfs_release_path(src_path);
|
||||
src_path->nodes[0] = src;
|
||||
src_path->slots[0] = i;
|
||||
|
||||
ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
|
||||
nr * sizeof(u32), GFP_NOFS);
|
||||
if (!ins_data)
|
||||
|
|
|
@ -134,7 +134,8 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
|
|||
super[i] = page_address(page[i]);
|
||||
}
|
||||
|
||||
if (super[0]->generation > super[1]->generation)
|
||||
if (btrfs_super_generation(super[0]) >
|
||||
btrfs_super_generation(super[1]))
|
||||
sector = zones[1].start;
|
||||
else
|
||||
sector = zones[0].start;
|
||||
|
@ -466,7 +467,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
|
|||
goto out;
|
||||
}
|
||||
|
||||
zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
|
||||
zones = kvcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
|
||||
if (!zones) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
|
@ -585,7 +586,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
|
|||
}
|
||||
|
||||
|
||||
kfree(zones);
|
||||
kvfree(zones);
|
||||
|
||||
switch (bdev_zoned_model(bdev)) {
|
||||
case BLK_ZONED_HM:
|
||||
|
@ -617,7 +618,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
|
|||
return 0;
|
||||
|
||||
out:
|
||||
kfree(zones);
|
||||
kvfree(zones);
|
||||
out_free_zone_info:
|
||||
btrfs_destroy_dev_zone_info(device);
|
||||
|
||||
|
|
Loading…
Reference in New Issue