2018-04-04 01:23:33 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2012-11-06 00:33:06 +08:00
|
|
|
/*
|
|
|
|
* Copyright (C) STRATO AG 2012. All rights reserved.
|
|
|
|
*/
|
2018-04-04 01:23:33 +08:00
|
|
|
|
2012-11-06 00:33:06 +08:00
|
|
|
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/math64.h>
#include <linux/string.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "zoned.h"
#include "block-group.h"
|
2012-11-06 00:33:06 +08:00
|
|
|
|
2020-01-23 15:44:50 +08:00
|
|
|
/*
|
|
|
|
* Device replace overview
|
|
|
|
*
|
|
|
|
* [Objective]
|
|
|
|
* To copy all extents (both new and on-disk) from source device to target
|
|
|
|
* device, while still keeping the filesystem read-write.
|
|
|
|
*
|
|
|
|
* [Method]
|
|
|
|
* There are two main methods involved:
|
|
|
|
*
|
|
|
|
* - Write duplication
|
|
|
|
*
|
|
|
|
* All new writes will be written to both target and source devices, so even
|
2021-05-21 23:42:23 +08:00
|
|
|
* if replace gets canceled, the source device still contains up-to-date data.
|
2020-01-23 15:44:50 +08:00
|
|
|
*
|
|
|
|
* Location: handle_ops_on_dev_replace() from __btrfs_map_block()
|
|
|
|
* Start: btrfs_dev_replace_start()
|
|
|
|
* End: btrfs_dev_replace_finishing()
|
|
|
|
* Content: Latest data/metadata
|
|
|
|
*
|
|
|
|
* - Copy existing extents
|
|
|
|
*
|
|
|
|
* This happens by re-using scrub facility, as scrub also iterates through
|
|
|
|
* existing extents from commit root.
|
|
|
|
*
|
|
|
|
* Location: scrub_write_block_to_dev_replace() from
|
|
|
|
* scrub_block_complete()
|
|
|
|
* Content: Data/meta from commit root.
|
|
|
|
*
|
|
|
|
* Due to the content difference, we need to avoid nocow write when dev-replace
|
|
|
|
* is happening. This is done by marking the block group read-only and waiting
|
|
|
|
* for NOCOW writes.
|
|
|
|
*
|
|
|
|
* After replace is done, the finishing part is done by swapping the target and
|
|
|
|
* source devices.
|
|
|
|
*
|
|
|
|
* Location: btrfs_dev_replace_update_device_in_mapping_tree() from
|
|
|
|
* btrfs_dev_replace_finishing()
|
|
|
|
*/
|
|
|
|
|
2012-11-06 00:33:06 +08:00
|
|
|
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
|
|
|
|
int scrub_ret);
|
|
|
|
static int btrfs_dev_replace_kthread(void *data);
|
|
|
|
|
|
|
|
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_root *dev_root = fs_info->dev_root;
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
int slot;
|
|
|
|
int ret = 0;
|
|
|
|
struct btrfs_path *path = NULL;
|
|
|
|
int item_size;
|
|
|
|
struct btrfs_dev_replace_item *ptr;
|
|
|
|
u64 src_devid;
|
|
|
|
|
2021-03-12 00:23:16 +08:00
|
|
|
if (!dev_root)
|
|
|
|
return 0;
|
|
|
|
|
2012-11-06 00:33:06 +08:00
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_DEV_REPLACE_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
|
|
|
|
if (ret) {
|
|
|
|
no_valid_dev_replace_entry_found:
|
2020-10-30 06:53:56 +08:00
|
|
|
/*
|
|
|
|
* We don't have a replace item or it's corrupted. If there is
|
|
|
|
* a replace target, fail the mount.
|
|
|
|
*/
|
|
|
|
if (btrfs_find_device(fs_info->fs_devices,
|
2020-11-03 13:49:43 +08:00
|
|
|
BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
|
2020-10-30 06:53:56 +08:00
|
|
|
btrfs_err(fs_info,
|
|
|
|
"found replace target device without a valid replace item");
|
|
|
|
ret = -EUCLEAN;
|
|
|
|
goto out;
|
|
|
|
}
|
2012-11-06 00:33:06 +08:00
|
|
|
ret = 0;
|
|
|
|
dev_replace->replace_state =
|
2019-08-08 12:32:44 +08:00
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
|
2012-11-06 00:33:06 +08:00
|
|
|
dev_replace->cont_reading_from_srcdev_mode =
|
|
|
|
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
|
|
|
|
dev_replace->time_started = 0;
|
|
|
|
dev_replace->time_stopped = 0;
|
|
|
|
atomic64_set(&dev_replace->num_write_errors, 0);
|
|
|
|
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
|
|
|
|
dev_replace->cursor_left = 0;
|
|
|
|
dev_replace->committed_cursor_left = 0;
|
|
|
|
dev_replace->cursor_left_last_write_of_item = 0;
|
|
|
|
dev_replace->cursor_right = 0;
|
|
|
|
dev_replace->srcdev = NULL;
|
|
|
|
dev_replace->tgtdev = NULL;
|
|
|
|
dev_replace->is_valid = 0;
|
|
|
|
dev_replace->item_needs_writeback = 0;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
slot = path->slots[0];
|
|
|
|
eb = path->nodes[0];
|
|
|
|
item_size = btrfs_item_size_nr(eb, slot);
|
|
|
|
ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
|
|
|
|
|
|
|
|
if (item_size != sizeof(struct btrfs_dev_replace_item)) {
|
2013-12-21 00:37:06 +08:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"dev_replace entry found has unexpected size, ignore entry");
|
2012-11-06 00:33:06 +08:00
|
|
|
goto no_valid_dev_replace_entry_found;
|
|
|
|
}
|
|
|
|
|
|
|
|
src_devid = btrfs_dev_replace_src_devid(eb, ptr);
|
|
|
|
dev_replace->cont_reading_from_srcdev_mode =
|
|
|
|
btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
|
|
|
|
dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
|
|
|
|
dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
|
|
|
|
dev_replace->time_stopped =
|
|
|
|
btrfs_dev_replace_time_stopped(eb, ptr);
|
|
|
|
atomic64_set(&dev_replace->num_write_errors,
|
|
|
|
btrfs_dev_replace_num_write_errors(eb, ptr));
|
|
|
|
atomic64_set(&dev_replace->num_uncorrectable_read_errors,
|
|
|
|
btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
|
|
|
|
dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
|
|
|
|
dev_replace->committed_cursor_left = dev_replace->cursor_left;
|
|
|
|
dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
|
|
|
|
dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
|
|
|
|
dev_replace->is_valid = 1;
|
|
|
|
|
|
|
|
dev_replace->item_needs_writeback = 0;
|
|
|
|
switch (dev_replace->replace_state) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
|
2020-10-30 06:53:56 +08:00
|
|
|
/*
|
|
|
|
* We don't have an active replace item but if there is a
|
|
|
|
* replace target, fail the mount.
|
|
|
|
*/
|
|
|
|
if (btrfs_find_device(fs_info->fs_devices,
|
2020-11-03 13:49:43 +08:00
|
|
|
BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
|
2020-10-30 06:53:56 +08:00
|
|
|
btrfs_err(fs_info,
|
|
|
|
"replace devid present without an active replace item");
|
|
|
|
ret = -EUCLEAN;
|
|
|
|
} else {
|
|
|
|
dev_replace->srcdev = NULL;
|
|
|
|
dev_replace->tgtdev = NULL;
|
|
|
|
}
|
2012-11-06 00:33:06 +08:00
|
|
|
break;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
|
2019-01-17 23:32:31 +08:00
|
|
|
dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
|
2020-11-03 13:49:43 +08:00
|
|
|
src_devid, NULL, NULL);
|
2019-01-17 23:32:31 +08:00
|
|
|
dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
|
2012-11-06 00:33:06 +08:00
|
|
|
BTRFS_DEV_REPLACE_DEVID,
|
2020-11-03 13:49:43 +08:00
|
|
|
NULL, NULL);
|
2012-11-06 00:33:06 +08:00
|
|
|
/*
|
|
|
|
* allow 'btrfs dev replace_cancel' if src/tgt device is
|
|
|
|
* missing
|
|
|
|
*/
|
|
|
|
if (!dev_replace->srcdev &&
|
2016-06-23 06:54:23 +08:00
|
|
|
!btrfs_test_opt(fs_info, DEGRADED)) {
|
2012-11-06 00:33:06 +08:00
|
|
|
ret = -EIO;
|
2013-12-21 00:37:06 +08:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"cannot mount because device replace operation is ongoing and");
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
|
|
|
|
src_devid);
|
2012-11-06 00:33:06 +08:00
|
|
|
}
|
|
|
|
if (!dev_replace->tgtdev &&
|
2016-06-23 06:54:23 +08:00
|
|
|
!btrfs_test_opt(fs_info, DEGRADED)) {
|
2012-11-06 00:33:06 +08:00
|
|
|
ret = -EIO;
|
2013-12-21 00:37:06 +08:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"cannot mount because device replace operation is ongoing and");
|
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
|
2013-08-20 19:20:08 +08:00
|
|
|
BTRFS_DEV_REPLACE_DEVID);
|
2012-11-06 00:33:06 +08:00
|
|
|
}
|
|
|
|
if (dev_replace->tgtdev) {
|
|
|
|
if (dev_replace->srcdev) {
|
|
|
|
dev_replace->tgtdev->total_bytes =
|
|
|
|
dev_replace->srcdev->total_bytes;
|
|
|
|
dev_replace->tgtdev->disk_total_bytes =
|
|
|
|
dev_replace->srcdev->disk_total_bytes;
|
2014-09-03 21:35:33 +08:00
|
|
|
dev_replace->tgtdev->commit_total_bytes =
|
|
|
|
dev_replace->srcdev->commit_total_bytes;
|
2012-11-06 00:33:06 +08:00
|
|
|
dev_replace->tgtdev->bytes_used =
|
|
|
|
dev_replace->srcdev->bytes_used;
|
2014-09-03 21:35:34 +08:00
|
|
|
dev_replace->tgtdev->commit_bytes_used =
|
|
|
|
dev_replace->srcdev->commit_bytes_used;
|
2012-11-06 00:33:06 +08:00
|
|
|
}
|
2017-12-04 12:54:55 +08:00
|
|
|
set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
|
|
|
|
&dev_replace->tgtdev->dev_state);
|
2018-02-12 23:36:25 +08:00
|
|
|
|
|
|
|
WARN_ON(fs_info->fs_devices->rw_devices == 0);
|
|
|
|
dev_replace->tgtdev->io_width = fs_info->sectorsize;
|
|
|
|
dev_replace->tgtdev->io_align = fs_info->sectorsize;
|
|
|
|
dev_replace->tgtdev->sector_size = fs_info->sectorsize;
|
|
|
|
dev_replace->tgtdev->fs_info = fs_info;
|
|
|
|
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
|
|
|
|
&dev_replace->tgtdev->dev_state);
|
2012-11-06 00:33:06 +08:00
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
2015-08-19 13:55:00 +08:00
|
|
|
btrfs_free_path(path);
|
2012-11-06 00:33:06 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-03-20 23:09:48 +08:00
|
|
|
/*
|
|
|
|
* Initialize a new device for device replace target from a given source dev
|
|
|
|
* and path.
|
|
|
|
*
|
|
|
|
* Return 0 and new device in @device_out, otherwise return < 0
|
|
|
|
*/
|
|
|
|
static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
|
|
|
|
const char *device_path,
|
|
|
|
struct btrfs_device *srcdev,
|
|
|
|
struct btrfs_device **device_out)
|
|
|
|
{
|
|
|
|
struct btrfs_device *device;
|
|
|
|
struct block_device *bdev;
|
|
|
|
struct rcu_string *name;
|
|
|
|
u64 devid = BTRFS_DEV_REPLACE_DEVID;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
*device_out = NULL;
|
btrfs: fix replace of seed device
If you replace a seed device in a sprouted fs, it appears to have
successfully replaced the seed device, but if you look closely, it
didn't. Here is an example.
$ mkfs.btrfs /dev/sda
$ btrfstune -S1 /dev/sda
$ mount /dev/sda /btrfs
$ btrfs device add /dev/sdb /btrfs
$ umount /btrfs
$ btrfs device scan --forget
$ mount -o device=/dev/sda /dev/sdb /btrfs
$ btrfs replace start -f /dev/sda /dev/sdc /btrfs
$ echo $?
0
BTRFS info (device sdb): dev_replace from /dev/sda (devid 1) to /dev/sdc started
BTRFS info (device sdb): dev_replace from /dev/sda (devid 1) to /dev/sdc finished
$ btrfs fi show
Label: none uuid: ab2c88b7-be81-4a7e-9849-c3666e7f9f4f
Total devices 2 FS bytes used 256.00KiB
devid 1 size 3.00GiB used 520.00MiB path /dev/sdc
devid 2 size 3.00GiB used 896.00MiB path /dev/sdb
Label: none uuid: 10bd3202-0415-43af-96a8-d5409f310a7e
Total devices 1 FS bytes used 128.00KiB
devid 1 size 3.00GiB used 536.00MiB path /dev/sda
So as per the replace start command and kernel log replace was successful.
Now let's try to clean mount.
$ umount /btrfs
$ btrfs device scan --forget
$ mount -o device=/dev/sdc /dev/sdb /btrfs
mount: /btrfs: wrong fs type, bad option, bad superblock on /dev/sdb, missing codepage or helper program, or other error.
[ 636.157517] BTRFS error (device sdc): failed to read chunk tree: -2
[ 636.180177] BTRFS error (device sdc): open_ctree failed
That's because per dev items it is still looking for the original seed
device.
$ btrfs inspect-internal dump-tree -d /dev/sdb
item 0 key (DEV_ITEMS DEV_ITEM 1) itemoff 16185 itemsize 98
devid 1 total_bytes 3221225472 bytes_used 545259520
io_align 4096 io_width 4096 sector_size 4096 type 0
generation 6 start_offset 0 dev_group 0
seek_speed 0 bandwidth 0
uuid 59368f50-9af2-4b17-91da-8a783cc418d4 <--- seed uuid
fsid 10bd3202-0415-43af-96a8-d5409f310a7e <--- seed fsid
item 1 key (DEV_ITEMS DEV_ITEM 2) itemoff 16087 itemsize 98
devid 2 total_bytes 3221225472 bytes_used 939524096
io_align 4096 io_width 4096 sector_size 4096 type 0
generation 0 start_offset 0 dev_group 0
seek_speed 0 bandwidth 0
uuid 56a0a6bc-4630-4998-8daf-3c3030c4256a <- sprout uuid
fsid ab2c88b7-be81-4a7e-9849-c3666e7f9f4f <- sprout fsid
But the replaced target has the following uuid+fsid in its superblock
which doesn't match with the expected uuid+fsid in its devitem.
$ btrfs in dump-super /dev/sdc | egrep '^generation|dev_item.uuid|dev_item.fsid|devid'
generation 20
dev_item.uuid 59368f50-9af2-4b17-91da-8a783cc418d4
dev_item.fsid ab2c88b7-be81-4a7e-9849-c3666e7f9f4f [match]
dev_item.devid 1
So if you provide the original seed device the mount shall be
successful. Which so long happening in the test case btrfs/163.
$ btrfs device scan --forget
$ mount -o device=/dev/sda /dev/sdb /btrfs
Fix in this patch:
If a seed is not sprouted then there is no replacement of it, because of
its read-only filesystem with a read-only device. Similarly, in the case
of a sprouted filesystem, the seed device is still read only. So, mark
it as you can't replace a seed device, you can only add a new device and
then delete the seed device. If replace is attempted then returns
-EINVAL.
Signed-off-by: Anand Jain <anand.jain@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-05 01:34:22 +08:00
|
|
|
if (srcdev->fs_devices->seeding) {
|
2018-03-20 23:09:48 +08:00
|
|
|
btrfs_err(fs_info, "the filesystem is a seed filesystem!");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
|
|
|
|
fs_info->bdev_holder);
|
|
|
|
if (IS_ERR(bdev)) {
|
|
|
|
btrfs_err(fs_info, "target device %s is invalid!", device_path);
|
|
|
|
return PTR_ERR(bdev);
|
|
|
|
}
|
|
|
|
|
2020-11-10 19:26:08 +08:00
|
|
|
if (!btrfs_check_device_zone_type(fs_info, bdev)) {
|
|
|
|
btrfs_err(fs_info,
|
|
|
|
"dev-replace: zoned type of target device mismatch with filesystem");
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
2019-05-14 18:54:38 +08:00
|
|
|
sync_blockdev(bdev);
|
2018-03-20 23:09:48 +08:00
|
|
|
|
2020-09-05 01:34:32 +08:00
|
|
|
list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
|
2018-03-20 23:09:48 +08:00
|
|
|
if (device->bdev == bdev) {
|
|
|
|
btrfs_err(fs_info,
|
|
|
|
"target device is in the filesystem!");
|
|
|
|
ret = -EEXIST;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (i_size_read(bdev->bd_inode) <
|
|
|
|
btrfs_device_get_total_bytes(srcdev)) {
|
|
|
|
btrfs_err(fs_info,
|
|
|
|
"target device is smaller than source device!");
|
|
|
|
ret = -EINVAL;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
device = btrfs_alloc_device(NULL, &devid, NULL);
|
|
|
|
if (IS_ERR(device)) {
|
|
|
|
ret = PTR_ERR(device);
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
|
|
|
|
name = rcu_string_strdup(device_path, GFP_KERNEL);
|
|
|
|
if (!name) {
|
|
|
|
btrfs_free_device(device);
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto error;
|
|
|
|
}
|
|
|
|
rcu_assign_pointer(device->name, name);
|
|
|
|
|
|
|
|
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
|
|
|
|
device->generation = 0;
|
|
|
|
device->io_width = fs_info->sectorsize;
|
|
|
|
device->io_align = fs_info->sectorsize;
|
|
|
|
device->sector_size = fs_info->sectorsize;
|
|
|
|
device->total_bytes = btrfs_device_get_total_bytes(srcdev);
|
|
|
|
device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
|
|
|
|
device->bytes_used = btrfs_device_get_bytes_used(srcdev);
|
|
|
|
device->commit_total_bytes = srcdev->commit_total_bytes;
|
|
|
|
device->commit_bytes_used = device->bytes_used;
|
|
|
|
device->fs_info = fs_info;
|
|
|
|
device->bdev = bdev;
|
|
|
|
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
|
|
|
|
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
|
|
|
|
device->mode = FMODE_EXCL;
|
|
|
|
device->dev_stats_valid = 1;
|
|
|
|
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
|
|
|
|
device->fs_devices = fs_info->fs_devices;
|
2019-05-14 18:54:39 +08:00
|
|
|
|
2020-11-10 19:26:07 +08:00
|
|
|
ret = btrfs_get_dev_zone_info(device);
|
|
|
|
if (ret)
|
|
|
|
goto error;
|
|
|
|
|
2019-05-14 18:54:39 +08:00
|
|
|
mutex_lock(&fs_info->fs_devices->device_list_mutex);
|
2018-03-20 23:09:48 +08:00
|
|
|
list_add(&device->dev_list, &fs_info->fs_devices->devices);
|
|
|
|
fs_info->fs_devices->num_devices++;
|
|
|
|
fs_info->fs_devices->open_devices++;
|
|
|
|
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
|
|
|
|
|
|
|
|
*device_out = device;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
error:
|
|
|
|
blkdev_put(bdev, FMODE_EXCL);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-11-06 00:33:06 +08:00
|
|
|
/*
|
|
|
|
* called from commit_transaction. Writes changed device replace state to
|
|
|
|
* disk.
|
|
|
|
*/
|
2019-03-20 23:51:44 +08:00
|
|
|
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
|
2012-11-06 00:33:06 +08:00
|
|
|
{
|
2019-03-20 23:51:44 +08:00
|
|
|
struct btrfs_fs_info *fs_info = trans->fs_info;
|
2012-11-06 00:33:06 +08:00
|
|
|
int ret;
|
|
|
|
struct btrfs_root *dev_root = fs_info->dev_root;
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct extent_buffer *eb;
|
|
|
|
struct btrfs_dev_replace_item *ptr;
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
|
2018-09-07 22:11:23 +08:00
|
|
|
down_read(&dev_replace->rwsem);
|
2012-11-06 00:33:06 +08:00
|
|
|
if (!dev_replace->is_valid ||
|
|
|
|
!dev_replace->item_needs_writeback) {
|
2018-09-07 22:11:23 +08:00
|
|
|
up_read(&dev_replace->rwsem);
|
2012-11-06 00:33:06 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2018-09-07 22:11:23 +08:00
|
|
|
up_read(&dev_replace->rwsem);
|
2012-11-06 00:33:06 +08:00
|
|
|
|
|
|
|
key.objectid = 0;
|
|
|
|
key.type = BTRFS_DEV_REPLACE_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
|
|
|
|
if (ret < 0) {
|
2016-09-20 22:05:00 +08:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"error %d while searching for dev_replace item!",
|
|
|
|
ret);
|
2012-11-06 00:33:06 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret == 0 &&
|
|
|
|
btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
|
|
|
|
/*
|
|
|
|
* need to delete old one and insert a new one.
|
|
|
|
* Since no attempt is made to recover any old state, if the
|
|
|
|
* dev_replace state is 'running', the data on the target
|
|
|
|
* drive is lost.
|
|
|
|
* It would be possible to recover the state: just make sure
|
|
|
|
* that the beginning of the item is never changed and always
|
|
|
|
* contains all the essential information. Then read this
|
|
|
|
* minimal set of information and use it as a base for the
|
|
|
|
* new state.
|
|
|
|
*/
|
|
|
|
ret = btrfs_del_item(trans, dev_root, path);
|
|
|
|
if (ret != 0) {
|
2016-09-20 22:05:00 +08:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"delete too small dev_replace item failed %d!",
|
|
|
|
ret);
|
2012-11-06 00:33:06 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (ret == 1) {
|
|
|
|
/* need to insert a new item */
|
|
|
|
btrfs_release_path(path);
|
|
|
|
ret = btrfs_insert_empty_item(trans, dev_root, path,
|
|
|
|
&key, sizeof(*ptr));
|
|
|
|
if (ret < 0) {
|
2016-09-20 22:05:00 +08:00
|
|
|
btrfs_warn(fs_info,
|
|
|
|
"insert dev_replace item failed %d!", ret);
|
2012-11-06 00:33:06 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
eb = path->nodes[0];
|
|
|
|
ptr = btrfs_item_ptr(eb, path->slots[0],
|
|
|
|
struct btrfs_dev_replace_item);
|
|
|
|
|
2018-09-07 22:11:23 +08:00
|
|
|
down_write(&dev_replace->rwsem);
|
2012-11-06 00:33:06 +08:00
|
|
|
if (dev_replace->srcdev)
|
|
|
|
btrfs_set_dev_replace_src_devid(eb, ptr,
|
|
|
|
dev_replace->srcdev->devid);
|
|
|
|
else
|
|
|
|
btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
|
|
|
|
btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
|
|
|
|
dev_replace->cont_reading_from_srcdev_mode);
|
|
|
|
btrfs_set_dev_replace_replace_state(eb, ptr,
|
|
|
|
dev_replace->replace_state);
|
|
|
|
btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
|
|
|
|
btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
|
|
|
|
btrfs_set_dev_replace_num_write_errors(eb, ptr,
|
|
|
|
atomic64_read(&dev_replace->num_write_errors));
|
|
|
|
btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
|
|
|
|
atomic64_read(&dev_replace->num_uncorrectable_read_errors));
|
|
|
|
dev_replace->cursor_left_last_write_of_item =
|
|
|
|
dev_replace->cursor_left;
|
|
|
|
btrfs_set_dev_replace_cursor_left(eb, ptr,
|
|
|
|
dev_replace->cursor_left_last_write_of_item);
|
|
|
|
btrfs_set_dev_replace_cursor_right(eb, ptr,
|
|
|
|
dev_replace->cursor_right);
|
|
|
|
dev_replace->item_needs_writeback = 0;
|
2018-09-07 22:11:23 +08:00
|
|
|
up_write(&dev_replace->rwsem);
|
2012-11-06 00:33:06 +08:00
|
|
|
|
|
|
|
btrfs_mark_buffer_dirty(eb);
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2017-11-28 10:43:10 +08:00
|
|
|
static char* btrfs_dev_name(struct btrfs_device *device)
|
|
|
|
{
|
2018-02-24 19:43:56 +08:00
|
|
|
if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
|
2017-11-28 10:43:10 +08:00
|
|
|
return "<missing disk>";
|
|
|
|
else
|
|
|
|
return rcu_str_deref(device->name);
|
|
|
|
}
|
|
|
|
|
2021-02-04 18:22:11 +08:00
|
|
|
static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_device *src_dev)
|
|
|
|
{
|
|
|
|
struct btrfs_path *path;
|
|
|
|
struct btrfs_key key;
|
|
|
|
struct btrfs_key found_key;
|
|
|
|
struct btrfs_root *root = fs_info->dev_root;
|
|
|
|
struct btrfs_dev_extent *dev_extent = NULL;
|
|
|
|
struct btrfs_block_group *cache;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
int ret = 0;
|
|
|
|
u64 chunk_offset;
|
|
|
|
|
|
|
|
/* Do not use "to_copy" on non zoned filesystem for now */
|
|
|
|
if (!btrfs_is_zoned(fs_info))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
mutex_lock(&fs_info->chunk_mutex);
|
|
|
|
|
|
|
|
/* Ensure we don't have pending new block group */
|
|
|
|
spin_lock(&fs_info->trans_lock);
|
|
|
|
while (fs_info->running_transaction &&
|
|
|
|
!list_empty(&fs_info->running_transaction->dev_update_list)) {
|
|
|
|
spin_unlock(&fs_info->trans_lock);
|
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
|
|
|
trans = btrfs_attach_transaction(root);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
mutex_lock(&fs_info->chunk_mutex);
|
|
|
|
if (ret == -ENOENT) {
|
|
|
|
spin_lock(&fs_info->trans_lock);
|
|
|
|
continue;
|
|
|
|
} else {
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = btrfs_commit_transaction(trans);
|
|
|
|
mutex_lock(&fs_info->chunk_mutex);
|
|
|
|
if (ret)
|
|
|
|
goto unlock;
|
|
|
|
|
|
|
|
spin_lock(&fs_info->trans_lock);
|
|
|
|
}
|
|
|
|
spin_unlock(&fs_info->trans_lock);
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
path->reada = READA_FORWARD;
|
|
|
|
path->search_commit_root = 1;
|
|
|
|
path->skip_locking = 1;
|
|
|
|
|
|
|
|
key.objectid = src_dev->devid;
|
|
|
|
key.type = BTRFS_DEV_EXTENT_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto free_path;
|
|
|
|
if (ret > 0) {
|
|
|
|
if (path->slots[0] >=
|
|
|
|
btrfs_header_nritems(path->nodes[0])) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto free_path;
|
|
|
|
if (ret > 0) {
|
|
|
|
ret = 0;
|
|
|
|
goto free_path;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
ret = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
struct extent_buffer *leaf = path->nodes[0];
|
|
|
|
int slot = path->slots[0];
|
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &found_key, slot);
|
|
|
|
|
|
|
|
if (found_key.objectid != src_dev->devid)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (found_key.type != BTRFS_DEV_EXTENT_KEY)
|
|
|
|
break;
|
|
|
|
|
|
|
|
if (found_key.offset < key.offset)
|
|
|
|
break;
|
|
|
|
|
|
|
|
dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
|
|
|
|
|
|
|
|
chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
|
|
|
|
|
|
|
|
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
|
|
|
|
if (!cache)
|
|
|
|
goto skip;
|
|
|
|
|
|
|
|
spin_lock(&cache->lock);
|
|
|
|
cache->to_copy = 1;
|
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
|
|
|
|
btrfs_put_block_group(cache);
|
|
|
|
|
|
|
|
skip:
|
|
|
|
ret = btrfs_next_item(root, path);
|
|
|
|
if (ret != 0) {
|
|
|
|
if (ret > 0)
|
|
|
|
ret = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
free_path:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
unlock:
|
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
|
|
|
|
struct btrfs_block_group *cache,
|
|
|
|
u64 physical)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = cache->fs_info;
|
|
|
|
struct extent_map *em;
|
|
|
|
struct map_lookup *map;
|
|
|
|
u64 chunk_offset = cache->start;
|
|
|
|
int num_extents, cur_extent;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
/* Do not use "to_copy" on non zoned filesystem for now */
|
|
|
|
if (!btrfs_is_zoned(fs_info))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
spin_lock(&cache->lock);
|
|
|
|
if (cache->removed) {
|
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
|
|
|
|
em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
|
|
|
|
ASSERT(!IS_ERR(em));
|
|
|
|
map = em->map_lookup;
|
|
|
|
|
|
|
|
num_extents = cur_extent = 0;
|
|
|
|
for (i = 0; i < map->num_stripes; i++) {
|
|
|
|
/* We have more device extent to copy */
|
|
|
|
if (srcdev != map->stripes[i].dev)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
num_extents++;
|
|
|
|
if (physical == map->stripes[i].physical)
|
|
|
|
cur_extent = i;
|
|
|
|
}
|
|
|
|
|
|
|
|
free_extent_map(em);
|
|
|
|
|
|
|
|
if (num_extents > 1 && cur_extent < num_extents - 1) {
|
|
|
|
/*
|
|
|
|
* Has more stripes on this device. Keep this block group
|
|
|
|
* readonly until we finish all the stripes.
|
|
|
|
*/
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Last stripe on this device */
|
|
|
|
spin_lock(&cache->lock);
|
|
|
|
cache->to_copy = 0;
|
|
|
|
spin_unlock(&cache->lock);
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2018-11-11 22:22:16 +08:00
|
|
|
/*
 * Start replacing a source device with the device at @tgtdev_name.
 *
 * @fs_info:     filesystem the replace runs on
 * @tgtdev_name: path of the target block device
 * @srcdevid:    devid of the source device, passed to
 *               btrfs_find_device_by_devspec() together with @srcdev_name
 * @srcdev_name: path of the source device
 * @read_src:    stored as the cont_reading_from_srcdev_mode of the replace
 *
 * The target device is set up, the replace state is switched to STARTED and
 * committed, then the copy is run through the scrub code and finished with
 * btrfs_dev_replace_finishing().
 *
 * Return the result of btrfs_dev_replace_finishing() (with -EINPROGRESS
 * mapped to BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS),
 * BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED if a replace is already
 * started or suspended, or < 0 on error.  Once the target device has been
 * set up, every failure before the copy starts tears it down again.
 */
static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
		const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
		int read_src)
{
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_trans_handle *trans;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int ret;
	struct btrfs_device *tgt_device = NULL;
	struct btrfs_device *src_device = NULL;

	src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
						  srcdev_name);
	if (IS_ERR(src_device))
		return PTR_ERR(src_device);

	/*
	 * Device replace is unsafe if it affects an active swapfile, whose
	 * extents must not be moved.
	 */
	if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
		btrfs_warn_in_rcu(fs_info,
	  "cannot replace device %s (devid %llu) due to active swapfile",
			btrfs_dev_name(src_device), src_device->devid);
		return -ETXTBSY;
	}

	/*
	 * Here we commit the transaction to make sure commit_total_bytes
	 * of all the devices are updated.
	 */
	trans = btrfs_attach_transaction(root);
	if (!IS_ERR(trans)) {
		ret = btrfs_commit_transaction(trans);
		if (ret)
			return ret;
	} else if (PTR_ERR(trans) != -ENOENT) {
		return PTR_ERR(trans);
	}

	ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
					    src_device, &tgt_device);
	if (ret)
		return ret;

	/*
	 * From here on the target device is linked into the device list and
	 * holds the block device open, so errors must go through 'leave' to
	 * tear it down instead of returning directly.
	 */
	ret = mark_block_group_to_copy(fs_info, src_device);
	if (ret)
		goto leave;

	down_write(&dev_replace->rwsem);
	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
		break;
	case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
		ASSERT(0);
		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
		up_write(&dev_replace->rwsem);
		goto leave;
	}

	dev_replace->cont_reading_from_srcdev_mode = read_src;
	dev_replace->srcdev = src_device;
	dev_replace->tgtdev = tgt_device;

	btrfs_info_in_rcu(fs_info,
		      "dev_replace from %s (devid %llu) to %s started",
		      btrfs_dev_name(src_device),
		      src_device->devid,
		      rcu_str_deref(tgt_device->name));

	/*
	 * from now on, the writes to the srcdev are all duplicated to
	 * go to the tgtdev as well (refer to btrfs_map_block()).
	 */
	dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
	dev_replace->time_started = ktime_get_real_seconds();
	dev_replace->cursor_left = 0;
	dev_replace->committed_cursor_left = 0;
	dev_replace->cursor_left_last_write_of_item = 0;
	dev_replace->cursor_right = 0;
	dev_replace->is_valid = 1;
	dev_replace->item_needs_writeback = 1;
	atomic64_set(&dev_replace->num_write_errors, 0);
	atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
	up_write(&dev_replace->rwsem);

	/* A sysfs failure is logged but does not abort the replace. */
	ret = btrfs_sysfs_add_device(tgt_device);
	if (ret)
		btrfs_err(fs_info, "kobj add dev failed %d", ret);

	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);

	/* Commit dev_replace state and reserve 1 item for it. */
	trans = btrfs_start_transaction(root, 1);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		/* Roll the in-memory state back before dropping the target. */
		down_write(&dev_replace->rwsem);
		dev_replace->replace_state =
			BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
		dev_replace->srcdev = NULL;
		dev_replace->tgtdev = NULL;
		up_write(&dev_replace->rwsem);
		goto leave;
	}

	ret = btrfs_commit_transaction(trans);
	WARN_ON(ret);

	/* the disk copy procedure reuses the scrub code */
	ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
			      btrfs_device_get_total_bytes(src_device),
			      &dev_replace->scrub_progress, 0, 1);

	ret = btrfs_dev_replace_finishing(fs_info, ret);
	if (ret == -EINPROGRESS)
		ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;

	return ret;

leave:
	btrfs_destroy_dev_replace_tgtdev(tgt_device);
	return ret;
}
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
|
2016-03-24 18:48:14 +08:00
|
|
|
struct btrfs_ioctl_dev_replace_args *args)
|
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
switch (args->start.cont_reading_from_srcdev_mode) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
|
|
|
|
args->start.tgtdev_name[0] == '\0')
|
|
|
|
return -EINVAL;
|
|
|
|
|
2016-06-23 06:54:24 +08:00
|
|
|
ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
|
2016-03-24 18:48:14 +08:00
|
|
|
args->start.srcdevid,
|
|
|
|
args->start.srcdev_name,
|
|
|
|
args->start.cont_reading_from_srcdev_mode);
|
|
|
|
args->result = ret;
|
|
|
|
/* don't warn if EINPROGRESS, someone else might be running scrub */
|
2018-11-11 22:22:24 +08:00
|
|
|
if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
|
|
|
|
ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
|
|
|
|
return 0;
|
2016-03-24 18:48:14 +08:00
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During a device replace test, we hit a null pointer dereference (it was very easy
to reproduce by running xfstests' btrfs/011 on devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get mapping information that includes the source device
before the mapping information is updated, and then submit a bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need to make sure all in-flight bios are finished and
no new bios are produced while we are removing the source device. To fix
this problem, we introduced a global @bio_counter; we not only inc/dec
@bio_counter outside of map_blocks, but also inc it before submitting a bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace doesn't support raid56
yet, it is not addressed in this patch and I added comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
/*
|
2016-05-20 09:18:45 +08:00
|
|
|
* blocked until all in-flight bios operations are finished.
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
*/
|
|
|
|
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
|
2018-04-05 07:04:49 +08:00
|
|
|
wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
|
|
|
|
&fs_info->dev_replace.bio_counter));
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* we have removed target device, it is safe to allow new bios request.
|
|
|
|
*/
|
|
|
|
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
|
2018-04-05 07:04:49 +08:00
|
|
|
wake_up(&fs_info->dev_replace.replace_wait);
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
}
|
|
|
|
|
btrfs: fix filesystem corruption after a device replace
We use a device's allocation state tree to track ranges in a device used
for allocated chunks, and we set ranges in this tree when allocating a new
chunk. However after a device replace operation, we were not setting the
allocated ranges in the new device's allocation state tree, so that tree
is empty after a device replace.
This means that a fitrim operation after a device replace will trim the
device ranges that have allocated chunks and extents, as we trim every
range for which there is not a range marked in the device's allocation
state tree. It is also important during chunk allocation, since the
device's allocation state is used to determine if a range is already
allocated when allocating a new chunk.
This is trivial to reproduce and the following script triggers the bug:
$ cat reproducer.sh
#!/bin/bash
DEV1="/dev/sdg"
DEV2="/dev/sdh"
DEV3="/dev/sdi"
wipefs -a $DEV1 $DEV2 $DEV3 &> /dev/null
# Create a raid1 test fs on 2 devices.
mkfs.btrfs -f -m raid1 -d raid1 $DEV1 $DEV2 > /dev/null
mount $DEV1 /mnt/btrfs
xfs_io -f -c "pwrite -S 0xab 0 10M" /mnt/btrfs/foo
echo "Starting to replace $DEV1 with $DEV3"
btrfs replace start -B $DEV1 $DEV3 /mnt/btrfs
echo
echo "Running fstrim"
fstrim /mnt/btrfs
echo
echo "Unmounting filesystem"
umount /mnt/btrfs
echo "Mounting filesystem in degraded mode using $DEV3 only"
wipefs -a $DEV1 $DEV2 &> /dev/null
mount -o degraded $DEV3 /mnt/btrfs
if [ $? -ne 0 ]; then
dmesg | tail
echo
echo "Failed to mount in degraded mode"
exit 1
fi
echo
echo "File foo data (expected all bytes = 0xab):"
od -A d -t x1 /mnt/btrfs/foo
umount /mnt/btrfs
When running the reproducer:
$ ./replace-test.sh
wrote 10485760/10485760 bytes at offset 0
10 MiB, 2560 ops; 0.0901 sec (110.877 MiB/sec and 28384.5216 ops/sec)
Starting to replace /dev/sdg with /dev/sdi
Running fstrim
Unmounting filesystem
Mounting filesystem in degraded mode using /dev/sdi only
mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/sdi, missing codepage or helper program, or other error.
[19581.748641] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi started
[19581.803842] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi finished
[19582.208293] BTRFS info (device sdi): allowing degraded mounts
[19582.208298] BTRFS info (device sdi): disk space caching is enabled
[19582.208301] BTRFS info (device sdi): has skinny extents
[19582.212853] BTRFS warning (device sdi): devid 2 uuid 1f731f47-e1bb-4f00-bfbb-9e5a0cb4ba9f is missing
[19582.213904] btree_readpage_end_io_hook: 25839 callbacks suppressed
[19582.213907] BTRFS error (device sdi): bad tree block start, want 30490624 have 0
[19582.214780] BTRFS warning (device sdi): failed to read root (objectid=7): -5
[19582.231576] BTRFS error (device sdi): open_ctree failed
Failed to mount in degraded mode
So fix by setting all allocated ranges in the replace target device when
the replace operation is finishing, when we are holding the chunk mutex
and we can not race with new chunk allocations.
A test case for fstests follows soon.
Fixes: 1c11b63eff2a67 ("btrfs: replace pending/pinned chunks lists with io tree")
CC: stable@vger.kernel.org # 5.2+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-23 22:30:16 +08:00
|
|
|
/*
|
|
|
|
* When finishing the device replace, before swapping the source device with the
|
|
|
|
* target device we must update the chunk allocation state in the target device,
|
|
|
|
* as it is empty because replace works by directly copying the chunks and not
|
|
|
|
* through the normal chunk allocation path.
|
|
|
|
*/
|
|
|
|
static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
|
|
|
|
struct btrfs_device *tgtdev)
|
|
|
|
{
|
|
|
|
struct extent_state *cached_state = NULL;
|
|
|
|
u64 start = 0;
|
|
|
|
u64 found_start;
|
|
|
|
u64 found_end;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
|
|
|
|
|
|
|
|
while (!find_first_extent_bit(&srcdev->alloc_state, start,
|
|
|
|
&found_start, &found_end,
|
|
|
|
CHUNK_ALLOCATED, &cached_state)) {
|
|
|
|
ret = set_extent_bits(&tgtdev->alloc_state, found_start,
|
|
|
|
found_end, CHUNK_ALLOCATED);
|
|
|
|
if (ret)
|
|
|
|
break;
|
|
|
|
start = found_end + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
free_extent_state(cached_state);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-09-05 01:34:36 +08:00
|
|
|
static void btrfs_dev_replace_update_device_in_mapping_tree(
|
|
|
|
struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_device *srcdev,
|
|
|
|
struct btrfs_device *tgtdev)
|
|
|
|
{
|
|
|
|
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
|
|
|
|
struct extent_map *em;
|
|
|
|
struct map_lookup *map;
|
|
|
|
u64 start = 0;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
write_lock(&em_tree->lock);
|
|
|
|
do {
|
|
|
|
em = lookup_extent_mapping(em_tree, start, (u64)-1);
|
|
|
|
if (!em)
|
|
|
|
break;
|
|
|
|
map = em->map_lookup;
|
|
|
|
for (i = 0; i < map->num_stripes; i++)
|
|
|
|
if (srcdev == map->stripes[i].dev)
|
|
|
|
map->stripes[i].dev = tgtdev;
|
|
|
|
start = em->start + em->len;
|
|
|
|
free_extent_map(em);
|
|
|
|
} while (start);
|
|
|
|
write_unlock(&em_tree->lock);
|
|
|
|
}
|
|
|
|
|
2012-11-06 00:33:06 +08:00
|
|
|
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
|
|
|
|
int scrub_ret)
|
|
|
|
{
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
struct btrfs_device *tgt_device;
|
|
|
|
struct btrfs_device *src_device;
|
|
|
|
struct btrfs_root *root = fs_info->tree_root;
|
|
|
|
u8 uuid_tmp[BTRFS_UUID_SIZE];
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
int ret = 0;
|
|
|
|
|
|
|
|
/* don't allow cancel or unmount to disturb the finishing procedure */
|
|
|
|
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
|
2018-09-07 22:11:23 +08:00
|
|
|
down_read(&dev_replace->rwsem);
|
2012-11-06 00:33:06 +08:00
|
|
|
/* was the operation canceled, or is it finished? */
|
|
|
|
if (dev_replace->replace_state !=
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
|
2018-09-07 22:11:23 +08:00
|
|
|
up_read(&dev_replace->rwsem);
|
2012-11-06 00:33:06 +08:00
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
tgt_device = dev_replace->tgtdev;
|
|
|
|
src_device = dev_replace->srcdev;
|
2018-09-07 22:11:23 +08:00
|
|
|
up_read(&dev_replace->rwsem);
|
2012-11-06 00:33:06 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* flush all outstanding I/O and inode extent mappings before the
|
|
|
|
* copy operation is declared as being finished
|
|
|
|
*/
|
2021-01-11 18:58:11 +08:00
|
|
|
ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
|
2013-01-22 18:49:33 +08:00
|
|
|
if (ret) {
|
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
return ret;
|
|
|
|
}
|
2017-06-24 00:48:21 +08:00
|
|
|
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
|
2012-11-06 00:33:06 +08:00
|
|
|
|
btrfs: fix readahead hang and use-after-free after removing a device
Very sporadically I had test case btrfs/069 from fstests hanging (for
years, it is not a recent regression), with the following traces in
dmesg/syslog:
[162301.160628] BTRFS info (device sdc): dev_replace from /dev/sdd (devid 2) to /dev/sdg started
[162301.181196] BTRFS info (device sdc): scrub: finished on devid 4 with status: 0
[162301.287162] BTRFS info (device sdc): dev_replace from /dev/sdd (devid 2) to /dev/sdg finished
[162513.513792] INFO: task btrfs-transacti:1356167 blocked for more than 120 seconds.
[162513.514318] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.514522] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.514747] task:btrfs-transacti state:D stack: 0 pid:1356167 ppid: 2 flags:0x00004000
[162513.514751] Call Trace:
[162513.514761] __schedule+0x5ce/0xd00
[162513.514765] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.514771] schedule+0x46/0xf0
[162513.514844] wait_current_trans+0xde/0x140 [btrfs]
[162513.514850] ? finish_wait+0x90/0x90
[162513.514864] start_transaction+0x37c/0x5f0 [btrfs]
[162513.514879] transaction_kthread+0xa4/0x170 [btrfs]
[162513.514891] ? btrfs_cleanup_transaction+0x660/0x660 [btrfs]
[162513.514894] kthread+0x153/0x170
[162513.514897] ? kthread_stop+0x2c0/0x2c0
[162513.514902] ret_from_fork+0x22/0x30
[162513.514916] INFO: task fsstress:1356184 blocked for more than 120 seconds.
[162513.515192] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.515431] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.515680] task:fsstress state:D stack: 0 pid:1356184 ppid:1356177 flags:0x00004000
[162513.515682] Call Trace:
[162513.515688] __schedule+0x5ce/0xd00
[162513.515691] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.515697] schedule+0x46/0xf0
[162513.515712] wait_current_trans+0xde/0x140 [btrfs]
[162513.515716] ? finish_wait+0x90/0x90
[162513.515729] start_transaction+0x37c/0x5f0 [btrfs]
[162513.515743] btrfs_attach_transaction_barrier+0x1f/0x50 [btrfs]
[162513.515753] btrfs_sync_fs+0x61/0x1c0 [btrfs]
[162513.515758] ? __ia32_sys_fdatasync+0x20/0x20
[162513.515761] iterate_supers+0x87/0xf0
[162513.515765] ksys_sync+0x60/0xb0
[162513.515768] __do_sys_sync+0xa/0x10
[162513.515771] do_syscall_64+0x33/0x80
[162513.515774] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[162513.515781] RIP: 0033:0x7f5238f50bd7
[162513.515782] Code: Bad RIP value.
[162513.515784] RSP: 002b:00007fff67b978e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a2
[162513.515786] RAX: ffffffffffffffda RBX: 000055b1fad2c560 RCX: 00007f5238f50bd7
[162513.515788] RDX: 00000000ffffffff RSI: 000000000daf0e74 RDI: 000000000000003a
[162513.515789] RBP: 0000000000000032 R08: 000000000000000a R09: 00007f5239019be0
[162513.515791] R10: fffffffffffff24f R11: 0000000000000206 R12: 000000000000003a
[162513.515792] R13: 00007fff67b97950 R14: 00007fff67b97906 R15: 000055b1fad1a340
[162513.515804] INFO: task fsstress:1356185 blocked for more than 120 seconds.
[162513.516064] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.516329] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.516617] task:fsstress state:D stack: 0 pid:1356185 ppid:1356177 flags:0x00000000
[162513.516620] Call Trace:
[162513.516625] __schedule+0x5ce/0xd00
[162513.516628] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.516634] schedule+0x46/0xf0
[162513.516647] wait_current_trans+0xde/0x140 [btrfs]
[162513.516650] ? finish_wait+0x90/0x90
[162513.516662] start_transaction+0x4d7/0x5f0 [btrfs]
[162513.516679] btrfs_setxattr_trans+0x3c/0x100 [btrfs]
[162513.516686] __vfs_setxattr+0x66/0x80
[162513.516691] __vfs_setxattr_noperm+0x70/0x200
[162513.516697] vfs_setxattr+0x6b/0x120
[162513.516703] setxattr+0x125/0x240
[162513.516709] ? lock_acquire+0xb1/0x480
[162513.516712] ? mnt_want_write+0x20/0x50
[162513.516721] ? rcu_read_lock_any_held+0x8e/0xb0
[162513.516723] ? preempt_count_add+0x49/0xa0
[162513.516725] ? __sb_start_write+0x19b/0x290
[162513.516727] ? preempt_count_add+0x49/0xa0
[162513.516732] path_setxattr+0xba/0xd0
[162513.516739] __x64_sys_setxattr+0x27/0x30
[162513.516741] do_syscall_64+0x33/0x80
[162513.516743] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[162513.516745] RIP: 0033:0x7f5238f56d5a
[162513.516746] Code: Bad RIP value.
[162513.516748] RSP: 002b:00007fff67b97868 EFLAGS: 00000202 ORIG_RAX: 00000000000000bc
[162513.516750] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f5238f56d5a
[162513.516751] RDX: 000055b1fbb0d5a0 RSI: 00007fff67b978a0 RDI: 000055b1fbb0d470
[162513.516753] RBP: 000055b1fbb0d5a0 R08: 0000000000000001 R09: 00007fff67b97700
[162513.516754] R10: 0000000000000004 R11: 0000000000000202 R12: 0000000000000004
[162513.516756] R13: 0000000000000024 R14: 0000000000000001 R15: 00007fff67b978a0
[162513.516767] INFO: task fsstress:1356196 blocked for more than 120 seconds.
[162513.517064] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.517365] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.517763] task:fsstress state:D stack: 0 pid:1356196 ppid:1356177 flags:0x00004000
[162513.517780] Call Trace:
[162513.517786] __schedule+0x5ce/0xd00
[162513.517789] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.517796] schedule+0x46/0xf0
[162513.517810] wait_current_trans+0xde/0x140 [btrfs]
[162513.517814] ? finish_wait+0x90/0x90
[162513.517829] start_transaction+0x37c/0x5f0 [btrfs]
[162513.517845] btrfs_attach_transaction_barrier+0x1f/0x50 [btrfs]
[162513.517857] btrfs_sync_fs+0x61/0x1c0 [btrfs]
[162513.517862] ? __ia32_sys_fdatasync+0x20/0x20
[162513.517865] iterate_supers+0x87/0xf0
[162513.517869] ksys_sync+0x60/0xb0
[162513.517872] __do_sys_sync+0xa/0x10
[162513.517875] do_syscall_64+0x33/0x80
[162513.517878] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[162513.517881] RIP: 0033:0x7f5238f50bd7
[162513.517883] Code: Bad RIP value.
[162513.517885] RSP: 002b:00007fff67b978e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a2
[162513.517887] RAX: ffffffffffffffda RBX: 000055b1fad2c560 RCX: 00007f5238f50bd7
[162513.517889] RDX: 0000000000000000 RSI: 000000007660add2 RDI: 0000000000000053
[162513.517891] RBP: 0000000000000032 R08: 0000000000000067 R09: 00007f5239019be0
[162513.517893] R10: fffffffffffff24f R11: 0000000000000206 R12: 0000000000000053
[162513.517895] R13: 00007fff67b97950 R14: 00007fff67b97906 R15: 000055b1fad1a340
[162513.517908] INFO: task fsstress:1356197 blocked for more than 120 seconds.
[162513.518298] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.518672] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.519157] task:fsstress state:D stack: 0 pid:1356197 ppid:1356177 flags:0x00000000
[162513.519160] Call Trace:
[162513.519165] __schedule+0x5ce/0xd00
[162513.519168] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.519174] schedule+0x46/0xf0
[162513.519190] wait_current_trans+0xde/0x140 [btrfs]
[162513.519193] ? finish_wait+0x90/0x90
[162513.519206] start_transaction+0x4d7/0x5f0 [btrfs]
[162513.519222] btrfs_create+0x57/0x200 [btrfs]
[162513.519230] lookup_open+0x522/0x650
[162513.519246] path_openat+0x2b8/0xa50
[162513.519270] do_filp_open+0x91/0x100
[162513.519275] ? find_held_lock+0x32/0x90
[162513.519280] ? lock_acquired+0x33b/0x470
[162513.519285] ? do_raw_spin_unlock+0x4b/0xc0
[162513.519287] ? _raw_spin_unlock+0x29/0x40
[162513.519295] do_sys_openat2+0x20d/0x2d0
[162513.519300] do_sys_open+0x44/0x80
[162513.519304] do_syscall_64+0x33/0x80
[162513.519307] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[162513.519309] RIP: 0033:0x7f5238f4a903
[162513.519310] Code: Bad RIP value.
[162513.519312] RSP: 002b:00007fff67b97758 EFLAGS: 00000246 ORIG_RAX: 0000000000000055
[162513.519314] RAX: ffffffffffffffda RBX: 00000000ffffffff RCX: 00007f5238f4a903
[162513.519316] RDX: 0000000000000000 RSI: 00000000000001b6 RDI: 000055b1fbb0d470
[162513.519317] RBP: 00007fff67b978c0 R08: 0000000000000001 R09: 0000000000000002
[162513.519319] R10: 00007fff67b974f7 R11: 0000000000000246 R12: 0000000000000013
[162513.519320] R13: 00000000000001b6 R14: 00007fff67b97906 R15: 000055b1fad1c620
[162513.519332] INFO: task btrfs:1356211 blocked for more than 120 seconds.
[162513.519727] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.520115] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.520508] task:btrfs state:D stack: 0 pid:1356211 ppid:1356178 flags:0x00004002
[162513.520511] Call Trace:
[162513.520516] __schedule+0x5ce/0xd00
[162513.520519] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.520525] schedule+0x46/0xf0
[162513.520544] btrfs_scrub_pause+0x11f/0x180 [btrfs]
[162513.520548] ? finish_wait+0x90/0x90
[162513.520562] btrfs_commit_transaction+0x45a/0xc30 [btrfs]
[162513.520574] ? start_transaction+0xe0/0x5f0 [btrfs]
[162513.520596] btrfs_dev_replace_finishing+0x6d8/0x711 [btrfs]
[162513.520619] btrfs_dev_replace_by_ioctl.cold+0x1cc/0x1fd [btrfs]
[162513.520639] btrfs_ioctl+0x2a25/0x36f0 [btrfs]
[162513.520643] ? do_sigaction+0xf3/0x240
[162513.520645] ? find_held_lock+0x32/0x90
[162513.520648] ? do_sigaction+0xf3/0x240
[162513.520651] ? lock_acquired+0x33b/0x470
[162513.520655] ? _raw_spin_unlock_irq+0x24/0x50
[162513.520657] ? lockdep_hardirqs_on+0x7d/0x100
[162513.520660] ? _raw_spin_unlock_irq+0x35/0x50
[162513.520662] ? do_sigaction+0xf3/0x240
[162513.520671] ? __x64_sys_ioctl+0x83/0xb0
[162513.520672] __x64_sys_ioctl+0x83/0xb0
[162513.520677] do_syscall_64+0x33/0x80
[162513.520679] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[162513.520681] RIP: 0033:0x7fc3cd307d87
[162513.520682] Code: Bad RIP value.
[162513.520684] RSP: 002b:00007ffe30a56bb8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[162513.520686] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007fc3cd307d87
[162513.520687] RDX: 00007ffe30a57a30 RSI: 00000000ca289435 RDI: 0000000000000003
[162513.520689] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
[162513.520690] R10: 0000000000000008 R11: 0000000000000202 R12: 0000000000000003
[162513.520692] R13: 0000557323a212e0 R14: 00007ffe30a5a520 R15: 0000000000000001
[162513.520703]
Showing all locks held in the system:
[162513.520712] 1 lock held by khungtaskd/54:
[162513.520713] #0: ffffffffb40a91a0 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x15/0x197
[162513.520728] 1 lock held by in:imklog/596:
[162513.520729] #0: ffff8f3f0d781400 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x4d/0x60
[162513.520782] 1 lock held by btrfs-transacti/1356167:
[162513.520784] #0: ffff8f3d810cc848 (&fs_info->transaction_kthread_mutex){+.+.}-{3:3}, at: transaction_kthread+0x4a/0x170 [btrfs]
[162513.520798] 1 lock held by btrfs/1356190:
[162513.520800] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write_file+0x22/0x60
[162513.520805] 1 lock held by fsstress/1356184:
[162513.520806] #0: ffff8f3d576440e8 (&type->s_umount_key#62){++++}-{3:3}, at: iterate_supers+0x6f/0xf0
[162513.520811] 3 locks held by fsstress/1356185:
[162513.520812] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write+0x20/0x50
[162513.520815] #1: ffff8f3d80a650b8 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: vfs_setxattr+0x50/0x120
[162513.520820] #2: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs]
[162513.520833] 1 lock held by fsstress/1356196:
[162513.520834] #0: ffff8f3d576440e8 (&type->s_umount_key#62){++++}-{3:3}, at: iterate_supers+0x6f/0xf0
[162513.520838] 3 locks held by fsstress/1356197:
[162513.520839] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write+0x20/0x50
[162513.520843] #1: ffff8f3d506465e8 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: path_openat+0x2a7/0xa50
[162513.520846] #2: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs]
[162513.520858] 2 locks held by btrfs/1356211:
[162513.520859] #0: ffff8f3d810cde30 (&fs_info->dev_replace.lock_finishing_cancel_unmount){+.+.}-{3:3}, at: btrfs_dev_replace_finishing+0x52/0x711 [btrfs]
[162513.520877] #1: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs]
This was weird because the stack traces show that a transaction commit,
triggered by a device replace operation, is blocking trying to pause any
running scrubs but there are no stack traces of blocked tasks doing a
scrub.
After poking around with drgn, I noticed there was a scrub task that was
constantly running and blocking for short periods of time:
>>> t = find_task(prog, 1356190)
>>> prog.stack_trace(t)
#0 __schedule+0x5ce/0xcfc
#1 schedule+0x46/0xe4
#2 schedule_timeout+0x1df/0x475
#3 btrfs_reada_wait+0xda/0x132
#4 scrub_stripe+0x2a8/0x112f
#5 scrub_chunk+0xcd/0x134
#6 scrub_enumerate_chunks+0x29e/0x5ee
#7 btrfs_scrub_dev+0x2d5/0x91b
#8 btrfs_ioctl+0x7f5/0x36e7
#9 __x64_sys_ioctl+0x83/0xb0
#10 do_syscall_64+0x33/0x77
#11 entry_SYSCALL_64+0x7c/0x156
Which corresponds to:
int btrfs_reada_wait(void *handle)
{
struct reada_control *rc = handle;
struct btrfs_fs_info *fs_info = rc->fs_info;
while (atomic_read(&rc->elems)) {
if (!atomic_read(&fs_info->reada_works_cnt))
reada_start_machine(fs_info);
wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
(HZ + 9) / 10);
}
(...)
So the counter "rc->elems" was set to 1 and never decreased to 0, causing
the scrub task to loop forever in that function. Then I used the following
script for drgn to check the readahead requests:
$ cat dump_reada.py
import sys
import drgn
from drgn import NULL, Object, cast, container_of, execscript, \
reinterpret, sizeof
from drgn.helpers.linux import *
mnt_path = b"/home/fdmanana/btrfs-tests/scratch_1"
mnt = None
for mnt in for_each_mount(prog, dst = mnt_path):
pass
if mnt is None:
sys.stderr.write(f'Error: mount point {mnt_path} not found\n')
sys.exit(1)
fs_info = cast('struct btrfs_fs_info *', mnt.mnt.mnt_sb.s_fs_info)
def dump_re(re):
nzones = re.nzones.value_()
print(f're at {hex(re.value_())}')
print(f'\t logical {re.logical.value_()}')
print(f'\t refcnt {re.refcnt.value_()}')
print(f'\t nzones {nzones}')
for i in range(nzones):
dev = re.zones[i].device
name = dev.name.str.string_()
print(f'\t\t dev id {dev.devid.value_()} name {name}')
print()
for _, e in radix_tree_for_each(fs_info.reada_tree):
re = cast('struct reada_extent *', e)
dump_re(re)
$ drgn dump_reada.py
re at 0xffff8f3da9d25ad8
logical 38928384
refcnt 1
nzones 1
dev id 0 name b'/dev/sdd'
$
So there was one readahead extent with a single zone corresponding to the
source device of that last device replace operation logged in dmesg/syslog.
Also the ID of that zone's device was 0 which is a special value set in
the source device of a device replace operation when the operation finishes
(constant BTRFS_DEV_REPLACE_DEVID set at btrfs_dev_replace_finishing()),
confirming again that device /dev/sdd was the source of a device replace
operation.
Normally there should be as many zones in the readahead extent as there are
devices, and I wasn't expecting the extent to be in a block group with a
'single' profile, so I went and confirmed with the following drgn script
that there weren't any single profile block groups:
$ cat dump_block_groups.py
import sys
import drgn
from drgn import NULL, Object, cast, container_of, execscript, \
reinterpret, sizeof
from drgn.helpers.linux import *
mnt_path = b"/home/fdmanana/btrfs-tests/scratch_1"
mnt = None
for mnt in for_each_mount(prog, dst = mnt_path):
pass
if mnt is None:
sys.stderr.write(f'Error: mount point {mnt_path} not found\n')
sys.exit(1)
fs_info = cast('struct btrfs_fs_info *', mnt.mnt.mnt_sb.s_fs_info)
BTRFS_BLOCK_GROUP_DATA = (1 << 0)
BTRFS_BLOCK_GROUP_SYSTEM = (1 << 1)
BTRFS_BLOCK_GROUP_METADATA = (1 << 2)
BTRFS_BLOCK_GROUP_RAID0 = (1 << 3)
BTRFS_BLOCK_GROUP_RAID1 = (1 << 4)
BTRFS_BLOCK_GROUP_DUP = (1 << 5)
BTRFS_BLOCK_GROUP_RAID10 = (1 << 6)
BTRFS_BLOCK_GROUP_RAID5 = (1 << 7)
BTRFS_BLOCK_GROUP_RAID6 = (1 << 8)
BTRFS_BLOCK_GROUP_RAID1C3 = (1 << 9)
BTRFS_BLOCK_GROUP_RAID1C4 = (1 << 10)
def bg_flags_string(bg):
flags = bg.flags.value_()
ret = ''
if flags & BTRFS_BLOCK_GROUP_DATA:
ret = 'data'
if flags & BTRFS_BLOCK_GROUP_METADATA:
if len(ret) > 0:
ret += '|'
ret += 'meta'
if flags & BTRFS_BLOCK_GROUP_SYSTEM:
if len(ret) > 0:
ret += '|'
ret += 'system'
if flags & BTRFS_BLOCK_GROUP_RAID0:
ret += ' raid0'
elif flags & BTRFS_BLOCK_GROUP_RAID1:
ret += ' raid1'
elif flags & BTRFS_BLOCK_GROUP_DUP:
ret += ' dup'
elif flags & BTRFS_BLOCK_GROUP_RAID10:
ret += ' raid10'
elif flags & BTRFS_BLOCK_GROUP_RAID5:
ret += ' raid5'
elif flags & BTRFS_BLOCK_GROUP_RAID6:
ret += ' raid6'
elif flags & BTRFS_BLOCK_GROUP_RAID1C3:
ret += ' raid1c3'
elif flags & BTRFS_BLOCK_GROUP_RAID1C4:
ret += ' raid1c4'
else:
ret += ' single'
return ret
def dump_bg(bg):
print()
print(f'block group at {hex(bg.value_())}')
print(f'\t start {bg.start.value_()} length {bg.length.value_()}')
print(f'\t flags {bg.flags.value_()} - {bg_flags_string(bg)}')
bg_root = fs_info.block_group_cache_tree.address_of_()
for bg in rbtree_inorder_for_each_entry('struct btrfs_block_group', bg_root, 'cache_node'):
dump_bg(bg)
$ drgn dump_block_groups.py
block group at 0xffff8f3d673b0400
start 22020096 length 16777216
flags 258 - system raid6
block group at 0xffff8f3d53ddb400
start 38797312 length 536870912
flags 260 - meta raid6
block group at 0xffff8f3d5f4d9c00
start 575668224 length 2147483648
flags 257 - data raid6
block group at 0xffff8f3d08189000
start 2723151872 length 67108864
flags 258 - system raid6
block group at 0xffff8f3db70ff000
start 2790260736 length 1073741824
flags 260 - meta raid6
block group at 0xffff8f3d5f4dd800
start 3864002560 length 67108864
flags 258 - system raid6
block group at 0xffff8f3d67037000
start 3931111424 length 2147483648
flags 257 - data raid6
$
So there were only 2 reasons left for having a readahead extent with a
single zone: reada_find_zone(), called when creating a readahead extent,
returned NULL either because we failed to find the corresponding block
group or because a memory allocation failed. With some additional and
custom tracing I figured out that on every further occurrence of the
problem the block group had just been deleted when we were looping to
create the zones for the readahead extent (at reada_find_extent()), so we
ended up with only one zone in the readahead extent, corresponding to a
device that ends up getting replaced.
So after figuring that out it became obvious why the hang happens:
1) Task A starts a scrub on any device of the filesystem, except for
device /dev/sdd;
2) Task B starts a device replace with /dev/sdd as the source device;
3) Task A calls btrfs_reada_add() from scrub_stripe() and it is currently
starting to scrub a stripe from block group X. This call to
btrfs_reada_add() is the one for the extent tree. When btrfs_reada_add()
calls reada_add_block(), it passes the logical address of the extent
tree's root node as its 'logical' argument - a value of 38928384;
4) Task A then enters reada_find_extent(), called from reada_add_block().
It finds there isn't any existing readahead extent for the logical
address 38928384, so it proceeds to the path of creating a new one.
It calls btrfs_map_block() to find out which stripes exist for the block
group X. On the first iteration of the for loop that iterates over the
stripes, it finds the stripe for device /dev/sdd, so it creates one
zone for that device and adds it to the readahead extent. Before getting
into the second iteration of the loop, the cleanup kthread deletes block
group X because it was empty. So in the iterations for the remaining
stripes it does not add more zones to the readahead extent, because the
calls to reada_find_zone() returned NULL because they couldn't find
block group X anymore.
As a result the new readahead extent has a single zone, corresponding to
the device /dev/sdd;
5) Before task A returns to btrfs_reada_add() and queues the readahead job
for the readahead work queue, task B finishes the device replace and at
btrfs_dev_replace_finishing() swaps the device /dev/sdd with the new
device /dev/sdg;
6) Task A returns to reada_add_block(), which increments the counter
"->elems" of the reada_control structure allocated at btrfs_reada_add().
Then it returns back to btrfs_reada_add() and calls
reada_start_machine(). This queues a job in the readahead work queue to
run the function reada_start_machine_worker(), which calls
__reada_start_machine().
At __reada_start_machine() we take the device list mutex and for each
device found in the current device list, we call
reada_start_machine_dev() to start the readahead work. However at this
point the device /dev/sdd was already freed and is not in the device
list anymore.
This means the corresponding readahead for the extent at 38928384 is
never started, and therefore the "->elems" counter of the reada_control
structure allocated at btrfs_reada_add() never goes down to 0, causing
the call to btrfs_reada_wait(), done by the scrub task, to wait forever.
Note that the readahead request can be made either after the device replace
started or before it started, however in practice it is very unlikely that a
device replace is able to start after a readahead request is made and is
able to complete before the readahead request completes - maybe only on a
very small and nearly empty filesystem.
This hang however is not the only problem we can have with readahead and
device removals. When the readahead extent has zones other than the
one corresponding to the device that is being removed (either by a device
replace or a device remove operation), we risk having a use-after-free on
the device when dropping the last reference of the readahead extent.
For example if we create a readahead extent with two zones, one for the
device /dev/sdd and one for the device /dev/sde:
1) Before the readahead worker starts, the device /dev/sdd is removed,
and the corresponding btrfs_device structure is freed. However the
readahead extent still has the zone pointing to the device structure;
2) When the readahead worker starts, it only finds device /dev/sde in the
current device list of the filesystem;
3) It starts the readahead work, at reada_start_machine_dev(), using the
device /dev/sde;
4) Then when it finishes reading the extent from device /dev/sde, it calls
__readahead_hook() which ends up dropping the last reference on the
readahead extent through the last call to reada_extent_put();
5) At reada_extent_put() it iterates over each zone of the readahead extent
and attempts to delete an element from the device's 'reada_extents'
radix tree, resulting in a use-after-free, as the device pointer of the
zone for /dev/sdd is now stale. We can also access the device after
dropping the last reference of a zone, through reada_zone_release(),
also called by reada_extent_put().
And a device remove suffers the same problem, however since it shrinks the
device size down to zero before removing the device, it is very unlikely to
still have readahead requests not completed by the time we free the device,
the only possibility is if the device has very little space allocated.
While the hang problem is exclusive to scrub, since it is currently the
only user of btrfs_reada_add() and btrfs_reada_wait(), the use-after-free
problem affects any path that triggers readahead, which includes
btree_readahead_hook() and __readahead_hook() (a readahead worker can
trigger readahead for the children of a node) for example - any path that
ends up calling reada_add_block() can trigger the use-after-free after a
device is removed.
So fix this by waiting for any readahead requests for a device to complete
before removing a device, ensuring that while waiting for existing ones no
new ones can be made.
This problem has been around for a very long time - the readahead code was
added in 2011, device remove exists since 2008 and device replace was
introduced in 2013, hard to pick a specific commit for a git Fixes tag.
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-10-12 18:55:24 +08:00
|
|
|
if (!scrub_ret)
|
|
|
|
btrfs_reada_remove_dev(src_device);
|
|
|
|
|
2019-05-17 15:44:25 +08:00
|
|
|
/*
|
|
|
|
* We have to use this loop approach because at this point src_device
|
|
|
|
* has to be available for transaction commit to complete, yet new
|
|
|
|
* chunks shouldn't be allocated on the device.
|
|
|
|
*/
|
|
|
|
while (1) {
|
|
|
|
trans = btrfs_start_transaction(root, 0);
|
|
|
|
if (IS_ERR(trans)) {
|
btrfs: fix readahead hang and use-after-free after removing a device
Very sporadically I had test case btrfs/069 from fstests hanging (for
years, it is not a recent regression), with the following traces in
dmesg/syslog:
[162301.160628] BTRFS info (device sdc): dev_replace from /dev/sdd (devid 2) to /dev/sdg started
[162301.181196] BTRFS info (device sdc): scrub: finished on devid 4 with status: 0
[162301.287162] BTRFS info (device sdc): dev_replace from /dev/sdd (devid 2) to /dev/sdg finished
[162513.513792] INFO: task btrfs-transacti:1356167 blocked for more than 120 seconds.
[162513.514318] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.514522] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.514747] task:btrfs-transacti state:D stack: 0 pid:1356167 ppid: 2 flags:0x00004000
[162513.514751] Call Trace:
[162513.514761] __schedule+0x5ce/0xd00
[162513.514765] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.514771] schedule+0x46/0xf0
[162513.514844] wait_current_trans+0xde/0x140 [btrfs]
[162513.514850] ? finish_wait+0x90/0x90
[162513.514864] start_transaction+0x37c/0x5f0 [btrfs]
[162513.514879] transaction_kthread+0xa4/0x170 [btrfs]
[162513.514891] ? btrfs_cleanup_transaction+0x660/0x660 [btrfs]
[162513.514894] kthread+0x153/0x170
[162513.514897] ? kthread_stop+0x2c0/0x2c0
[162513.514902] ret_from_fork+0x22/0x30
[162513.514916] INFO: task fsstress:1356184 blocked for more than 120 seconds.
[162513.515192] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.515431] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.515680] task:fsstress state:D stack: 0 pid:1356184 ppid:1356177 flags:0x00004000
[162513.515682] Call Trace:
[162513.515688] __schedule+0x5ce/0xd00
[162513.515691] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.515697] schedule+0x46/0xf0
[162513.515712] wait_current_trans+0xde/0x140 [btrfs]
[162513.515716] ? finish_wait+0x90/0x90
[162513.515729] start_transaction+0x37c/0x5f0 [btrfs]
[162513.515743] btrfs_attach_transaction_barrier+0x1f/0x50 [btrfs]
[162513.515753] btrfs_sync_fs+0x61/0x1c0 [btrfs]
[162513.515758] ? __ia32_sys_fdatasync+0x20/0x20
[162513.515761] iterate_supers+0x87/0xf0
[162513.515765] ksys_sync+0x60/0xb0
[162513.515768] __do_sys_sync+0xa/0x10
[162513.515771] do_syscall_64+0x33/0x80
[162513.515774] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[162513.515781] RIP: 0033:0x7f5238f50bd7
[162513.515782] Code: Bad RIP value.
[162513.515784] RSP: 002b:00007fff67b978e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a2
[162513.515786] RAX: ffffffffffffffda RBX: 000055b1fad2c560 RCX: 00007f5238f50bd7
[162513.515788] RDX: 00000000ffffffff RSI: 000000000daf0e74 RDI: 000000000000003a
[162513.515789] RBP: 0000000000000032 R08: 000000000000000a R09: 00007f5239019be0
[162513.515791] R10: fffffffffffff24f R11: 0000000000000206 R12: 000000000000003a
[162513.515792] R13: 00007fff67b97950 R14: 00007fff67b97906 R15: 000055b1fad1a340
[162513.515804] INFO: task fsstress:1356185 blocked for more than 120 seconds.
[162513.516064] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.516329] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.516617] task:fsstress state:D stack: 0 pid:1356185 ppid:1356177 flags:0x00000000
[162513.516620] Call Trace:
[162513.516625] __schedule+0x5ce/0xd00
[162513.516628] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.516634] schedule+0x46/0xf0
[162513.516647] wait_current_trans+0xde/0x140 [btrfs]
[162513.516650] ? finish_wait+0x90/0x90
[162513.516662] start_transaction+0x4d7/0x5f0 [btrfs]
[162513.516679] btrfs_setxattr_trans+0x3c/0x100 [btrfs]
[162513.516686] __vfs_setxattr+0x66/0x80
[162513.516691] __vfs_setxattr_noperm+0x70/0x200
[162513.516697] vfs_setxattr+0x6b/0x120
[162513.516703] setxattr+0x125/0x240
[162513.516709] ? lock_acquire+0xb1/0x480
[162513.516712] ? mnt_want_write+0x20/0x50
[162513.516721] ? rcu_read_lock_any_held+0x8e/0xb0
[162513.516723] ? preempt_count_add+0x49/0xa0
[162513.516725] ? __sb_start_write+0x19b/0x290
[162513.516727] ? preempt_count_add+0x49/0xa0
[162513.516732] path_setxattr+0xba/0xd0
[162513.516739] __x64_sys_setxattr+0x27/0x30
[162513.516741] do_syscall_64+0x33/0x80
[162513.516743] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[162513.516745] RIP: 0033:0x7f5238f56d5a
[162513.516746] Code: Bad RIP value.
[162513.516748] RSP: 002b:00007fff67b97868 EFLAGS: 00000202 ORIG_RAX: 00000000000000bc
[162513.516750] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f5238f56d5a
[162513.516751] RDX: 000055b1fbb0d5a0 RSI: 00007fff67b978a0 RDI: 000055b1fbb0d470
[162513.516753] RBP: 000055b1fbb0d5a0 R08: 0000000000000001 R09: 00007fff67b97700
[162513.516754] R10: 0000000000000004 R11: 0000000000000202 R12: 0000000000000004
[162513.516756] R13: 0000000000000024 R14: 0000000000000001 R15: 00007fff67b978a0
[162513.516767] INFO: task fsstress:1356196 blocked for more than 120 seconds.
[162513.517064] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.517365] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.517763] task:fsstress state:D stack: 0 pid:1356196 ppid:1356177 flags:0x00004000
[162513.517780] Call Trace:
[162513.517786] __schedule+0x5ce/0xd00
[162513.517789] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.517796] schedule+0x46/0xf0
[162513.517810] wait_current_trans+0xde/0x140 [btrfs]
[162513.517814] ? finish_wait+0x90/0x90
[162513.517829] start_transaction+0x37c/0x5f0 [btrfs]
[162513.517845] btrfs_attach_transaction_barrier+0x1f/0x50 [btrfs]
[162513.517857] btrfs_sync_fs+0x61/0x1c0 [btrfs]
[162513.517862] ? __ia32_sys_fdatasync+0x20/0x20
[162513.517865] iterate_supers+0x87/0xf0
[162513.517869] ksys_sync+0x60/0xb0
[162513.517872] __do_sys_sync+0xa/0x10
[162513.517875] do_syscall_64+0x33/0x80
[162513.517878] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[162513.517881] RIP: 0033:0x7f5238f50bd7
[162513.517883] Code: Bad RIP value.
[162513.517885] RSP: 002b:00007fff67b978e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a2
[162513.517887] RAX: ffffffffffffffda RBX: 000055b1fad2c560 RCX: 00007f5238f50bd7
[162513.517889] RDX: 0000000000000000 RSI: 000000007660add2 RDI: 0000000000000053
[162513.517891] RBP: 0000000000000032 R08: 0000000000000067 R09: 00007f5239019be0
[162513.517893] R10: fffffffffffff24f R11: 0000000000000206 R12: 0000000000000053
[162513.517895] R13: 00007fff67b97950 R14: 00007fff67b97906 R15: 000055b1fad1a340
[162513.517908] INFO: task fsstress:1356197 blocked for more than 120 seconds.
[162513.518298] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.518672] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.519157] task:fsstress state:D stack: 0 pid:1356197 ppid:1356177 flags:0x00000000
[162513.519160] Call Trace:
[162513.519165] __schedule+0x5ce/0xd00
[162513.519168] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.519174] schedule+0x46/0xf0
[162513.519190] wait_current_trans+0xde/0x140 [btrfs]
[162513.519193] ? finish_wait+0x90/0x90
[162513.519206] start_transaction+0x4d7/0x5f0 [btrfs]
[162513.519222] btrfs_create+0x57/0x200 [btrfs]
[162513.519230] lookup_open+0x522/0x650
[162513.519246] path_openat+0x2b8/0xa50
[162513.519270] do_filp_open+0x91/0x100
[162513.519275] ? find_held_lock+0x32/0x90
[162513.519280] ? lock_acquired+0x33b/0x470
[162513.519285] ? do_raw_spin_unlock+0x4b/0xc0
[162513.519287] ? _raw_spin_unlock+0x29/0x40
[162513.519295] do_sys_openat2+0x20d/0x2d0
[162513.519300] do_sys_open+0x44/0x80
[162513.519304] do_syscall_64+0x33/0x80
[162513.519307] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[162513.519309] RIP: 0033:0x7f5238f4a903
[162513.519310] Code: Bad RIP value.
[162513.519312] RSP: 002b:00007fff67b97758 EFLAGS: 00000246 ORIG_RAX: 0000000000000055
[162513.519314] RAX: ffffffffffffffda RBX: 00000000ffffffff RCX: 00007f5238f4a903
[162513.519316] RDX: 0000000000000000 RSI: 00000000000001b6 RDI: 000055b1fbb0d470
[162513.519317] RBP: 00007fff67b978c0 R08: 0000000000000001 R09: 0000000000000002
[162513.519319] R10: 00007fff67b974f7 R11: 0000000000000246 R12: 0000000000000013
[162513.519320] R13: 00000000000001b6 R14: 00007fff67b97906 R15: 000055b1fad1c620
[162513.519332] INFO: task btrfs:1356211 blocked for more than 120 seconds.
[162513.519727] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.520115] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.520508] task:btrfs state:D stack: 0 pid:1356211 ppid:1356178 flags:0x00004002
[162513.520511] Call Trace:
[162513.520516] __schedule+0x5ce/0xd00
[162513.520519] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.520525] schedule+0x46/0xf0
[162513.520544] btrfs_scrub_pause+0x11f/0x180 [btrfs]
[162513.520548] ? finish_wait+0x90/0x90
[162513.520562] btrfs_commit_transaction+0x45a/0xc30 [btrfs]
[162513.520574] ? start_transaction+0xe0/0x5f0 [btrfs]
[162513.520596] btrfs_dev_replace_finishing+0x6d8/0x711 [btrfs]
[162513.520619] btrfs_dev_replace_by_ioctl.cold+0x1cc/0x1fd [btrfs]
[162513.520639] btrfs_ioctl+0x2a25/0x36f0 [btrfs]
[162513.520643] ? do_sigaction+0xf3/0x240
[162513.520645] ? find_held_lock+0x32/0x90
[162513.520648] ? do_sigaction+0xf3/0x240
[162513.520651] ? lock_acquired+0x33b/0x470
[162513.520655] ? _raw_spin_unlock_irq+0x24/0x50
[162513.520657] ? lockdep_hardirqs_on+0x7d/0x100
[162513.520660] ? _raw_spin_unlock_irq+0x35/0x50
[162513.520662] ? do_sigaction+0xf3/0x240
[162513.520671] ? __x64_sys_ioctl+0x83/0xb0
[162513.520672] __x64_sys_ioctl+0x83/0xb0
[162513.520677] do_syscall_64+0x33/0x80
[162513.520679] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[162513.520681] RIP: 0033:0x7fc3cd307d87
[162513.520682] Code: Bad RIP value.
[162513.520684] RSP: 002b:00007ffe30a56bb8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[162513.520686] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007fc3cd307d87
[162513.520687] RDX: 00007ffe30a57a30 RSI: 00000000ca289435 RDI: 0000000000000003
[162513.520689] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
[162513.520690] R10: 0000000000000008 R11: 0000000000000202 R12: 0000000000000003
[162513.520692] R13: 0000557323a212e0 R14: 00007ffe30a5a520 R15: 0000000000000001
[162513.520703]
Showing all locks held in the system:
[162513.520712] 1 lock held by khungtaskd/54:
[162513.520713] #0: ffffffffb40a91a0 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x15/0x197
[162513.520728] 1 lock held by in:imklog/596:
[162513.520729] #0: ffff8f3f0d781400 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x4d/0x60
[162513.520782] 1 lock held by btrfs-transacti/1356167:
[162513.520784] #0: ffff8f3d810cc848 (&fs_info->transaction_kthread_mutex){+.+.}-{3:3}, at: transaction_kthread+0x4a/0x170 [btrfs]
[162513.520798] 1 lock held by btrfs/1356190:
[162513.520800] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write_file+0x22/0x60
[162513.520805] 1 lock held by fsstress/1356184:
[162513.520806] #0: ffff8f3d576440e8 (&type->s_umount_key#62){++++}-{3:3}, at: iterate_supers+0x6f/0xf0
[162513.520811] 3 locks held by fsstress/1356185:
[162513.520812] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write+0x20/0x50
[162513.520815] #1: ffff8f3d80a650b8 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: vfs_setxattr+0x50/0x120
[162513.520820] #2: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs]
[162513.520833] 1 lock held by fsstress/1356196:
[162513.520834] #0: ffff8f3d576440e8 (&type->s_umount_key#62){++++}-{3:3}, at: iterate_supers+0x6f/0xf0
[162513.520838] 3 locks held by fsstress/1356197:
[162513.520839] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write+0x20/0x50
[162513.520843] #1: ffff8f3d506465e8 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: path_openat+0x2a7/0xa50
[162513.520846] #2: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs]
[162513.520858] 2 locks held by btrfs/1356211:
[162513.520859] #0: ffff8f3d810cde30 (&fs_info->dev_replace.lock_finishing_cancel_unmount){+.+.}-{3:3}, at: btrfs_dev_replace_finishing+0x52/0x711 [btrfs]
[162513.520877] #1: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs]
This was weird because the stack traces show that a transaction commit,
triggered by a device replace operation, is blocking trying to pause any
running scrubs but there are no stack traces of blocked tasks doing a
scrub.
After poking around with drgn, I noticed there was a scrub task that was
constantly running and blocking for short periods of time:
>>> t = find_task(prog, 1356190)
>>> prog.stack_trace(t)
#0 __schedule+0x5ce/0xcfc
#1 schedule+0x46/0xe4
#2 schedule_timeout+0x1df/0x475
#3 btrfs_reada_wait+0xda/0x132
#4 scrub_stripe+0x2a8/0x112f
#5 scrub_chunk+0xcd/0x134
#6 scrub_enumerate_chunks+0x29e/0x5ee
#7 btrfs_scrub_dev+0x2d5/0x91b
#8 btrfs_ioctl+0x7f5/0x36e7
#9 __x64_sys_ioctl+0x83/0xb0
#10 do_syscall_64+0x33/0x77
#11 entry_SYSCALL_64+0x7c/0x156
Which corresponds to:
int btrfs_reada_wait(void *handle)
{
struct reada_control *rc = handle;
struct btrfs_fs_info *fs_info = rc->fs_info;
while (atomic_read(&rc->elems)) {
if (!atomic_read(&fs_info->reada_works_cnt))
reada_start_machine(fs_info);
wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
(HZ + 9) / 10);
}
(...)
So the counter "rc->elems" was set to 1 and never decreased to 0, causing
the scrub task to loop forever in that function. Then I used the following
script for drgn to check the readahead requests:
$ cat dump_reada.py
import sys
import drgn
from drgn import NULL, Object, cast, container_of, execscript, \
reinterpret, sizeof
from drgn.helpers.linux import *
mnt_path = b"/home/fdmanana/btrfs-tests/scratch_1"
mnt = None
for mnt in for_each_mount(prog, dst = mnt_path):
pass
if mnt is None:
sys.stderr.write(f'Error: mount point {mnt_path} not found\n')
sys.exit(1)
fs_info = cast('struct btrfs_fs_info *', mnt.mnt.mnt_sb.s_fs_info)
def dump_re(re):
nzones = re.nzones.value_()
print(f're at {hex(re.value_())}')
print(f'\t logical {re.logical.value_()}')
print(f'\t refcnt {re.refcnt.value_()}')
print(f'\t nzones {nzones}')
for i in range(nzones):
dev = re.zones[i].device
name = dev.name.str.string_()
print(f'\t\t dev id {dev.devid.value_()} name {name}')
print()
for _, e in radix_tree_for_each(fs_info.reada_tree):
re = cast('struct reada_extent *', e)
dump_re(re)
$ drgn dump_reada.py
re at 0xffff8f3da9d25ad8
logical 38928384
refcnt 1
nzones 1
dev id 0 name b'/dev/sdd'
$
So there was one readahead extent with a single zone corresponding to the
source device of that last device replace operation logged in dmesg/syslog.
Also the ID of that zone's device was 0 which is a special value set in
the source device of a device replace operation when the operation finishes
(constant BTRFS_DEV_REPLACE_DEVID set at btrfs_dev_replace_finishing()),
confirming again that device /dev/sdd was the source of a device replace
operation.
Normally there should be as many zones in the readahead extent as there are
devices, and I wasn't expecting the extent to be in a block group with a
'single' profile, so I went and confirmed with the following drgn script
that there weren't any single profile block groups:
$ cat dump_block_groups.py
import sys
import drgn
from drgn import NULL, Object, cast, container_of, execscript, \
reinterpret, sizeof
from drgn.helpers.linux import *
mnt_path = b"/home/fdmanana/btrfs-tests/scratch_1"
mnt = None
for mnt in for_each_mount(prog, dst = mnt_path):
pass
if mnt is None:
sys.stderr.write(f'Error: mount point {mnt_path} not found\n')
sys.exit(1)
fs_info = cast('struct btrfs_fs_info *', mnt.mnt.mnt_sb.s_fs_info)
BTRFS_BLOCK_GROUP_DATA = (1 << 0)
BTRFS_BLOCK_GROUP_SYSTEM = (1 << 1)
BTRFS_BLOCK_GROUP_METADATA = (1 << 2)
BTRFS_BLOCK_GROUP_RAID0 = (1 << 3)
BTRFS_BLOCK_GROUP_RAID1 = (1 << 4)
BTRFS_BLOCK_GROUP_DUP = (1 << 5)
BTRFS_BLOCK_GROUP_RAID10 = (1 << 6)
BTRFS_BLOCK_GROUP_RAID5 = (1 << 7)
BTRFS_BLOCK_GROUP_RAID6 = (1 << 8)
BTRFS_BLOCK_GROUP_RAID1C3 = (1 << 9)
BTRFS_BLOCK_GROUP_RAID1C4 = (1 << 10)
def bg_flags_string(bg):
flags = bg.flags.value_()
ret = ''
if flags & BTRFS_BLOCK_GROUP_DATA:
ret = 'data'
if flags & BTRFS_BLOCK_GROUP_METADATA:
if len(ret) > 0:
ret += '|'
ret += 'meta'
if flags & BTRFS_BLOCK_GROUP_SYSTEM:
if len(ret) > 0:
ret += '|'
ret += 'system'
if flags & BTRFS_BLOCK_GROUP_RAID0:
ret += ' raid0'
elif flags & BTRFS_BLOCK_GROUP_RAID1:
ret += ' raid1'
elif flags & BTRFS_BLOCK_GROUP_DUP:
ret += ' dup'
elif flags & BTRFS_BLOCK_GROUP_RAID10:
ret += ' raid10'
elif flags & BTRFS_BLOCK_GROUP_RAID5:
ret += ' raid5'
elif flags & BTRFS_BLOCK_GROUP_RAID6:
ret += ' raid6'
elif flags & BTRFS_BLOCK_GROUP_RAID1C3:
ret += ' raid1c3'
elif flags & BTRFS_BLOCK_GROUP_RAID1C4:
ret += ' raid1c4'
else:
ret += ' single'
return ret
def dump_bg(bg):
print()
print(f'block group at {hex(bg.value_())}')
print(f'\t start {bg.start.value_()} length {bg.length.value_()}')
print(f'\t flags {bg.flags.value_()} - {bg_flags_string(bg)}')
bg_root = fs_info.block_group_cache_tree.address_of_()
for bg in rbtree_inorder_for_each_entry('struct btrfs_block_group', bg_root, 'cache_node'):
dump_bg(bg)
$ drgn dump_block_groups.py
block group at 0xffff8f3d673b0400
start 22020096 length 16777216
flags 258 - system raid6
block group at 0xffff8f3d53ddb400
start 38797312 length 536870912
flags 260 - meta raid6
block group at 0xffff8f3d5f4d9c00
start 575668224 length 2147483648
flags 257 - data raid6
block group at 0xffff8f3d08189000
start 2723151872 length 67108864
flags 258 - system raid6
block group at 0xffff8f3db70ff000
start 2790260736 length 1073741824
flags 260 - meta raid6
block group at 0xffff8f3d5f4dd800
start 3864002560 length 67108864
flags 258 - system raid6
block group at 0xffff8f3d67037000
start 3931111424 length 2147483648
flags 257 - data raid6
$
So there were only 2 reasons left for having a readahead extent with a
single zone: reada_find_zone(), called when creating a readahead extent,
returned NULL either because we failed to find the corresponding block
group or because a memory allocation failed. With some additional and
custom tracing I figured out that on every further occurrence of the
problem the block group had just been deleted when we were looping to
create the zones for the readahead extent (at reada_find_extent()), so we
ended up with only one zone in the readahead extent, corresponding to a
device that ends up getting replaced.
So after figuring that out it became obvious why the hang happens:
1) Task A starts a scrub on any device of the filesystem, except for
device /dev/sdd;
2) Task B starts a device replace with /dev/sdd as the source device;
3) Task A calls btrfs_reada_add() from scrub_stripe() and it is currently
starting to scrub a stripe from block group X. This call to
btrfs_reada_add() is the one for the extent tree. When btrfs_reada_add()
calls reada_add_block(), it passes the logical address of the extent
tree's root node as its 'logical' argument - a value of 38928384;
4) Task A then enters reada_find_extent(), called from reada_add_block().
It finds there isn't any existing readahead extent for the logical
address 38928384, so it proceeds to the path of creating a new one.
It calls btrfs_map_block() to find out which stripes exist for the block
group X. On the first iteration of the for loop that iterates over the
stripes, it finds the stripe for device /dev/sdd, so it creates one
zone for that device and adds it to the readahead extent. Before getting
into the second iteration of the loop, the cleanup kthread deletes block
group X because it was empty. So in the iterations for the remaining
stripes it does not add more zones to the readahead extent, because the
calls to reada_find_zone() returned NULL because they couldn't find
block group X anymore.
As a result the new readahead extent has a single zone, corresponding to
the device /dev/sdd;
5) Before task A returns to btrfs_reada_add() and queues the readahead job
for the readahead work queue, task B finishes the device replace and at
btrfs_dev_replace_finishing() swaps the device /dev/sdd with the new
device /dev/sdg;
6) Task A returns to reada_add_block(), which increments the counter
"->elems" of the reada_control structure allocated at btrfs_reada_add().
Then it returns back to btrfs_reada_add() and calls
reada_start_machine(). This queues a job in the readahead work queue to
run the function reada_start_machine_worker(), which calls
__reada_start_machine().
At __reada_start_machine() we take the device list mutex and for each
device found in the current device list, we call
reada_start_machine_dev() to start the readahead work. However at this
point the device /dev/sdd was already freed and is not in the device
list anymore.
This means the corresponding readahead for the extent at 38928384 is
never started, and therefore the "->elems" counter of the reada_control
structure allocated at btrfs_reada_add() never goes down to 0, causing
the call to btrfs_reada_wait(), done by the scrub task, to wait forever.
Note that the readahead request can be made either after the device replace
started or before it started, however in practice it is very unlikely that a
device replace is able to start after a readahead request is made and is
able to complete before the readahead request completes - maybe only on a
very small and nearly empty filesystem.
This hang however is not the only problem we can have with readahead and
device removals. When the readahead extent has zones other than the
one corresponding to the device that is being removed (either by a device
replace or a device remove operation), we risk having a use-after-free on
the device when dropping the last reference of the readahead extent.
For example if we create a readahead extent with two zones, one for the
device /dev/sdd and one for the device /dev/sde:
1) Before the readahead worker starts, the device /dev/sdd is removed,
and the corresponding btrfs_device structure is freed. However the
readahead extent still has the zone pointing to the device structure;
2) When the readahead worker starts, it only finds device /dev/sde in the
current device list of the filesystem;
3) It starts the readahead work, at reada_start_machine_dev(), using the
device /dev/sde;
4) Then when it finishes reading the extent from device /dev/sde, it calls
__readahead_hook() which ends up dropping the last reference on the
readahead extent through the last call to reada_extent_put();
5) At reada_extent_put() it iterates over each zone of the readahead extent
and attempts to delete an element from the device's 'reada_extents'
radix tree, resulting in a use-after-free, as the device pointer of the
zone for /dev/sdd is now stale. We can also access the device after
dropping the last reference of a zone, through reada_zone_release(),
also called by reada_extent_put().
And a device remove suffers the same problem, however since it shrinks the
device size down to zero before removing the device, it is very unlikely to
still have readahead requests not completed by the time we free the device;
the only possibility is if the device has very little space allocated.
While the hang problem is exclusive to scrub, since it is currently the
only user of btrfs_reada_add() and btrfs_reada_wait(), the use-after-free
problem affects any path that triggers readahead, which includes
btree_readahead_hook() and __readahead_hook() (a readahead worker can
trigger readahead for the children of a node) for example - any path that
ends up calling reada_add_block() can trigger the use-after-free after a
device is removed.
So fix this by waiting for any readahead requests for a device to complete
before removing a device, ensuring that while waiting for existing ones no
new ones can be made.
This problem has been around for a very long time - the readahead code was
added in 2011, device remove exists since 2008 and device replace was
introduced in 2013, hard to pick a specific commit for a git Fixes tag.
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-10-12 18:55:24 +08:00
|
|
|
btrfs_reada_undo_remove_dev(src_device);
|
2019-05-17 15:44:25 +08:00
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
return PTR_ERR(trans);
|
|
|
|
}
|
|
|
|
ret = btrfs_commit_transaction(trans);
|
|
|
|
WARN_ON(ret);
|
|
|
|
|
|
|
|
/* Prevent write_all_supers() during the finishing procedure */
|
|
|
|
mutex_lock(&fs_info->fs_devices->device_list_mutex);
|
|
|
|
/* Prevent new chunks being allocated on the source device */
|
|
|
|
mutex_lock(&fs_info->chunk_mutex);
|
|
|
|
|
|
|
|
if (!list_empty(&src_device->post_commit_list)) {
|
|
|
|
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
|
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
2012-11-06 00:33:06 +08:00
|
|
|
}
|
|
|
|
|
2018-09-07 22:11:23 +08:00
|
|
|
down_write(&dev_replace->rwsem);
|
2012-11-06 00:33:06 +08:00
|
|
|
dev_replace->replace_state =
|
|
|
|
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
|
|
|
|
: BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
|
|
|
|
dev_replace->tgtdev = NULL;
|
|
|
|
dev_replace->srcdev = NULL;
|
2018-06-12 19:48:25 +08:00
|
|
|
dev_replace->time_stopped = ktime_get_real_seconds();
|
2012-11-06 00:33:06 +08:00
|
|
|
dev_replace->item_needs_writeback = 1;
|
|
|
|
|
btrfs: fix filesystem corruption after a device replace
We use a device's allocation state tree to track ranges in a device used
for allocated chunks, and we set ranges in this tree when allocating a new
chunk. However after a device replace operation, we were not setting the
allocated ranges in the new device's allocation state tree, so that tree
is empty after a device replace.
This means that a fitrim operation after a device replace will trim the
device ranges that have allocated chunks and extents, as we trim every
range for which there is not a range marked in the device's allocation
state tree. It is also important during chunk allocation, since the
device's allocation state is used to determine if a range is already
allocated when allocating a new chunk.
This is trivial to reproduce and the following script triggers the bug:
$ cat reproducer.sh
#!/bin/bash
DEV1="/dev/sdg"
DEV2="/dev/sdh"
DEV3="/dev/sdi"
wipefs -a $DEV1 $DEV2 $DEV3 &> /dev/null
# Create a raid1 test fs on 2 devices.
mkfs.btrfs -f -m raid1 -d raid1 $DEV1 $DEV2 > /dev/null
mount $DEV1 /mnt/btrfs
xfs_io -f -c "pwrite -S 0xab 0 10M" /mnt/btrfs/foo
echo "Starting to replace $DEV1 with $DEV3"
btrfs replace start -B $DEV1 $DEV3 /mnt/btrfs
echo
echo "Running fstrim"
fstrim /mnt/btrfs
echo
echo "Unmounting filesystem"
umount /mnt/btrfs
echo "Mounting filesystem in degraded mode using $DEV3 only"
wipefs -a $DEV1 $DEV2 &> /dev/null
mount -o degraded $DEV3 /mnt/btrfs
if [ $? -ne 0 ]; then
dmesg | tail
echo
echo "Failed to mount in degraded mode"
exit 1
fi
echo
echo "File foo data (expected all bytes = 0xab):"
od -A d -t x1 /mnt/btrfs/foo
umount /mnt/btrfs
When running the reproducer:
$ ./replace-test.sh
wrote 10485760/10485760 bytes at offset 0
10 MiB, 2560 ops; 0.0901 sec (110.877 MiB/sec and 28384.5216 ops/sec)
Starting to replace /dev/sdg with /dev/sdi
Running fstrim
Unmounting filesystem
Mounting filesystem in degraded mode using /dev/sdi only
mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/sdi, missing codepage or helper program, or other error.
[19581.748641] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi started
[19581.803842] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi finished
[19582.208293] BTRFS info (device sdi): allowing degraded mounts
[19582.208298] BTRFS info (device sdi): disk space caching is enabled
[19582.208301] BTRFS info (device sdi): has skinny extents
[19582.212853] BTRFS warning (device sdi): devid 2 uuid 1f731f47-e1bb-4f00-bfbb-9e5a0cb4ba9f is missing
[19582.213904] btree_readpage_end_io_hook: 25839 callbacks suppressed
[19582.213907] BTRFS error (device sdi): bad tree block start, want 30490624 have 0
[19582.214780] BTRFS warning (device sdi): failed to read root (objectid=7): -5
[19582.231576] BTRFS error (device sdi): open_ctree failed
Failed to mount in degraded mode
So fix by setting all allocated ranges in the replace target device when
the replace operation is finishing, when we are holding the chunk mutex
and we can not race with new chunk allocations.
A test case for fstests follows soon.
Fixes: 1c11b63eff2a67 ("btrfs: replace pending/pinned chunks lists with io tree")
CC: stable@vger.kernel.org # 5.2+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-23 22:30:16 +08:00
|
|
|
/*
|
|
|
|
* Update allocation state in the new device and replace the old device
|
|
|
|
* with the new one in the mapping tree.
|
|
|
|
*/
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During a device replace test, we hit a null pointer dereference (it was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which includes the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need to make sure all in-flight bios are finished and
no new bios are produced while we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outside of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace doesn't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
if (!scrub_ret) {
|
btrfs: fix filesystem corruption after a device replace
We use a device's allocation state tree to track ranges in a device used
for allocated chunks, and we set ranges in this tree when allocating a new
chunk. However after a device replace operation, we were not setting the
allocated ranges in the new device's allocation state tree, so that tree
is empty after a device replace.
This means that a fitrim operation after a device replace will trim the
device ranges that have allocated chunks and extents, as we trim every
range for which there is not a range marked in the device's allocation
state tree. It is also important during chunk allocation, since the
device's allocation state is used to determine if a range is already
allocated when allocating a new chunk.
This is trivial to reproduce and the following script triggers the bug:
$ cat reproducer.sh
#!/bin/bash
DEV1="/dev/sdg"
DEV2="/dev/sdh"
DEV3="/dev/sdi"
wipefs -a $DEV1 $DEV2 $DEV3 &> /dev/null
# Create a raid1 test fs on 2 devices.
mkfs.btrfs -f -m raid1 -d raid1 $DEV1 $DEV2 > /dev/null
mount $DEV1 /mnt/btrfs
xfs_io -f -c "pwrite -S 0xab 0 10M" /mnt/btrfs/foo
echo "Starting to replace $DEV1 with $DEV3"
btrfs replace start -B $DEV1 $DEV3 /mnt/btrfs
echo
echo "Running fstrim"
fstrim /mnt/btrfs
echo
echo "Unmounting filesystem"
umount /mnt/btrfs
echo "Mounting filesystem in degraded mode using $DEV3 only"
wipefs -a $DEV1 $DEV2 &> /dev/null
mount -o degraded $DEV3 /mnt/btrfs
if [ $? -ne 0 ]; then
dmesg | tail
echo
echo "Failed to mount in degraded mode"
exit 1
fi
echo
echo "File foo data (expected all bytes = 0xab):"
od -A d -t x1 /mnt/btrfs/foo
umount /mnt/btrfs
When running the reproducer:
$ ./replace-test.sh
wrote 10485760/10485760 bytes at offset 0
10 MiB, 2560 ops; 0.0901 sec (110.877 MiB/sec and 28384.5216 ops/sec)
Starting to replace /dev/sdg with /dev/sdi
Running fstrim
Unmounting filesystem
Mounting filesystem in degraded mode using /dev/sdi only
mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/sdi, missing codepage or helper program, or other error.
[19581.748641] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi started
[19581.803842] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi finished
[19582.208293] BTRFS info (device sdi): allowing degraded mounts
[19582.208298] BTRFS info (device sdi): disk space caching is enabled
[19582.208301] BTRFS info (device sdi): has skinny extents
[19582.212853] BTRFS warning (device sdi): devid 2 uuid 1f731f47-e1bb-4f00-bfbb-9e5a0cb4ba9f is missing
[19582.213904] btree_readpage_end_io_hook: 25839 callbacks suppressed
[19582.213907] BTRFS error (device sdi): bad tree block start, want 30490624 have 0
[19582.214780] BTRFS warning (device sdi): failed to read root (objectid=7): -5
[19582.231576] BTRFS error (device sdi): open_ctree failed
Failed to mount in degraded mode
So fix by setting all allocated ranges in the replace target device when
the replace operation is finishing, when we are holding the chunk mutex
and we can not race with new chunk allocations.
A test case for fstests follows soon.
Fixes: 1c11b63eff2a67 ("btrfs: replace pending/pinned chunks lists with io tree")
CC: stable@vger.kernel.org # 5.2+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-23 22:30:16 +08:00
|
|
|
scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
|
|
|
|
if (scrub_ret)
|
|
|
|
goto error;
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During a device replace test, we hit a null pointer dereference (it was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which includes the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need to make sure all in-flight bios are finished and
no new bios are produced while we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outside of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace doesn't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
|
|
|
|
src_device,
|
|
|
|
tgt_device);
|
|
|
|
} else {
|
2018-11-20 19:56:16 +08:00
|
|
|
if (scrub_ret != -ECANCELED)
|
|
|
|
btrfs_err_in_rcu(fs_info,
|
2016-06-23 06:54:23 +08:00
|
|
|
"btrfs_scrub_dev(%s, %llu, %s) failed %d",
|
2017-11-28 10:43:10 +08:00
|
|
|
btrfs_dev_name(src_device),
|
2016-06-23 06:54:23 +08:00
|
|
|
src_device->devid,
|
|
|
|
rcu_str_deref(tgt_device->name), scrub_ret);
|
btrfs: fix filesystem corruption after a device replace
We use a device's allocation state tree to track ranges in a device used
for allocated chunks, and we set ranges in this tree when allocating a new
chunk. However after a device replace operation, we were not setting the
allocated ranges in the new device's allocation state tree, so that tree
is empty after a device replace.
This means that a fitrim operation after a device replace will trim the
device ranges that have allocated chunks and extents, as we trim every
range for which there is not a range marked in the device's allocation
state tree. It is also important during chunk allocation, since the
device's allocation state is used to determine if a range is already
allocated when allocating a new chunk.
This is trivial to reproduce and the following script triggers the bug:
$ cat reproducer.sh
#!/bin/bash
DEV1="/dev/sdg"
DEV2="/dev/sdh"
DEV3="/dev/sdi"
wipefs -a $DEV1 $DEV2 $DEV3 &> /dev/null
# Create a raid1 test fs on 2 devices.
mkfs.btrfs -f -m raid1 -d raid1 $DEV1 $DEV2 > /dev/null
mount $DEV1 /mnt/btrfs
xfs_io -f -c "pwrite -S 0xab 0 10M" /mnt/btrfs/foo
echo "Starting to replace $DEV1 with $DEV3"
btrfs replace start -B $DEV1 $DEV3 /mnt/btrfs
echo
echo "Running fstrim"
fstrim /mnt/btrfs
echo
echo "Unmounting filesystem"
umount /mnt/btrfs
echo "Mounting filesystem in degraded mode using $DEV3 only"
wipefs -a $DEV1 $DEV2 &> /dev/null
mount -o degraded $DEV3 /mnt/btrfs
if [ $? -ne 0 ]; then
dmesg | tail
echo
echo "Failed to mount in degraded mode"
exit 1
fi
echo
echo "File foo data (expected all bytes = 0xab):"
od -A d -t x1 /mnt/btrfs/foo
umount /mnt/btrfs
When running the reproducer:
$ ./replace-test.sh
wrote 10485760/10485760 bytes at offset 0
10 MiB, 2560 ops; 0.0901 sec (110.877 MiB/sec and 28384.5216 ops/sec)
Starting to replace /dev/sdg with /dev/sdi
Running fstrim
Unmounting filesystem
Mounting filesystem in degraded mode using /dev/sdi only
mount: /mnt/btrfs: wrong fs type, bad option, bad superblock on /dev/sdi, missing codepage or helper program, or other error.
[19581.748641] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi started
[19581.803842] BTRFS info (device sdg): dev_replace from /dev/sdg (devid 1) to /dev/sdi finished
[19582.208293] BTRFS info (device sdi): allowing degraded mounts
[19582.208298] BTRFS info (device sdi): disk space caching is enabled
[19582.208301] BTRFS info (device sdi): has skinny extents
[19582.212853] BTRFS warning (device sdi): devid 2 uuid 1f731f47-e1bb-4f00-bfbb-9e5a0cb4ba9f is missing
[19582.213904] btree_readpage_end_io_hook: 25839 callbacks suppressed
[19582.213907] BTRFS error (device sdi): bad tree block start, want 30490624 have 0
[19582.214780] BTRFS warning (device sdi): failed to read root (objectid=7): -5
[19582.231576] BTRFS error (device sdi): open_ctree failed
Failed to mount in degraded mode
So fix by setting all allocated ranges in the replace target device when
the replace operation is finishing, when we are holding the chunk mutex
and we can not race with new chunk allocations.
A test case for fstests follows soon.
Fixes: 1c11b63eff2a67 ("btrfs: replace pending/pinned chunks lists with io tree")
CC: stable@vger.kernel.org # 5.2+
Reviewed-by: Nikolay Borisov <nborisov@suse.com>
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-23 22:30:16 +08:00
|
|
|
error:
|
2018-09-07 22:11:23 +08:00
|
|
|
up_write(&dev_replace->rwsem);
|
2016-06-23 06:54:23 +08:00
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
|
|
|
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
|
btrfs: fix readahead hang and use-after-free after removing a device
Very sporadically I had test case btrfs/069 from fstests hanging (for
years, it is not a recent regression), with the following traces in
dmesg/syslog:
[162301.160628] BTRFS info (device sdc): dev_replace from /dev/sdd (devid 2) to /dev/sdg started
[162301.181196] BTRFS info (device sdc): scrub: finished on devid 4 with status: 0
[162301.287162] BTRFS info (device sdc): dev_replace from /dev/sdd (devid 2) to /dev/sdg finished
[162513.513792] INFO: task btrfs-transacti:1356167 blocked for more than 120 seconds.
[162513.514318] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.514522] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.514747] task:btrfs-transacti state:D stack: 0 pid:1356167 ppid: 2 flags:0x00004000
[162513.514751] Call Trace:
[162513.514761] __schedule+0x5ce/0xd00
[162513.514765] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.514771] schedule+0x46/0xf0
[162513.514844] wait_current_trans+0xde/0x140 [btrfs]
[162513.514850] ? finish_wait+0x90/0x90
[162513.514864] start_transaction+0x37c/0x5f0 [btrfs]
[162513.514879] transaction_kthread+0xa4/0x170 [btrfs]
[162513.514891] ? btrfs_cleanup_transaction+0x660/0x660 [btrfs]
[162513.514894] kthread+0x153/0x170
[162513.514897] ? kthread_stop+0x2c0/0x2c0
[162513.514902] ret_from_fork+0x22/0x30
[162513.514916] INFO: task fsstress:1356184 blocked for more than 120 seconds.
[162513.515192] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.515431] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.515680] task:fsstress state:D stack: 0 pid:1356184 ppid:1356177 flags:0x00004000
[162513.515682] Call Trace:
[162513.515688] __schedule+0x5ce/0xd00
[162513.515691] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.515697] schedule+0x46/0xf0
[162513.515712] wait_current_trans+0xde/0x140 [btrfs]
[162513.515716] ? finish_wait+0x90/0x90
[162513.515729] start_transaction+0x37c/0x5f0 [btrfs]
[162513.515743] btrfs_attach_transaction_barrier+0x1f/0x50 [btrfs]
[162513.515753] btrfs_sync_fs+0x61/0x1c0 [btrfs]
[162513.515758] ? __ia32_sys_fdatasync+0x20/0x20
[162513.515761] iterate_supers+0x87/0xf0
[162513.515765] ksys_sync+0x60/0xb0
[162513.515768] __do_sys_sync+0xa/0x10
[162513.515771] do_syscall_64+0x33/0x80
[162513.515774] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[162513.515781] RIP: 0033:0x7f5238f50bd7
[162513.515782] Code: Bad RIP value.
[162513.515784] RSP: 002b:00007fff67b978e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a2
[162513.515786] RAX: ffffffffffffffda RBX: 000055b1fad2c560 RCX: 00007f5238f50bd7
[162513.515788] RDX: 00000000ffffffff RSI: 000000000daf0e74 RDI: 000000000000003a
[162513.515789] RBP: 0000000000000032 R08: 000000000000000a R09: 00007f5239019be0
[162513.515791] R10: fffffffffffff24f R11: 0000000000000206 R12: 000000000000003a
[162513.515792] R13: 00007fff67b97950 R14: 00007fff67b97906 R15: 000055b1fad1a340
[162513.515804] INFO: task fsstress:1356185 blocked for more than 120 seconds.
[162513.516064] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.516329] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.516617] task:fsstress state:D stack: 0 pid:1356185 ppid:1356177 flags:0x00000000
[162513.516620] Call Trace:
[162513.516625] __schedule+0x5ce/0xd00
[162513.516628] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.516634] schedule+0x46/0xf0
[162513.516647] wait_current_trans+0xde/0x140 [btrfs]
[162513.516650] ? finish_wait+0x90/0x90
[162513.516662] start_transaction+0x4d7/0x5f0 [btrfs]
[162513.516679] btrfs_setxattr_trans+0x3c/0x100 [btrfs]
[162513.516686] __vfs_setxattr+0x66/0x80
[162513.516691] __vfs_setxattr_noperm+0x70/0x200
[162513.516697] vfs_setxattr+0x6b/0x120
[162513.516703] setxattr+0x125/0x240
[162513.516709] ? lock_acquire+0xb1/0x480
[162513.516712] ? mnt_want_write+0x20/0x50
[162513.516721] ? rcu_read_lock_any_held+0x8e/0xb0
[162513.516723] ? preempt_count_add+0x49/0xa0
[162513.516725] ? __sb_start_write+0x19b/0x290
[162513.516727] ? preempt_count_add+0x49/0xa0
[162513.516732] path_setxattr+0xba/0xd0
[162513.516739] __x64_sys_setxattr+0x27/0x30
[162513.516741] do_syscall_64+0x33/0x80
[162513.516743] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[162513.516745] RIP: 0033:0x7f5238f56d5a
[162513.516746] Code: Bad RIP value.
[162513.516748] RSP: 002b:00007fff67b97868 EFLAGS: 00000202 ORIG_RAX: 00000000000000bc
[162513.516750] RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007f5238f56d5a
[162513.516751] RDX: 000055b1fbb0d5a0 RSI: 00007fff67b978a0 RDI: 000055b1fbb0d470
[162513.516753] RBP: 000055b1fbb0d5a0 R08: 0000000000000001 R09: 00007fff67b97700
[162513.516754] R10: 0000000000000004 R11: 0000000000000202 R12: 0000000000000004
[162513.516756] R13: 0000000000000024 R14: 0000000000000001 R15: 00007fff67b978a0
[162513.516767] INFO: task fsstress:1356196 blocked for more than 120 seconds.
[162513.517064] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.517365] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.517763] task:fsstress state:D stack: 0 pid:1356196 ppid:1356177 flags:0x00004000
[162513.517780] Call Trace:
[162513.517786] __schedule+0x5ce/0xd00
[162513.517789] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.517796] schedule+0x46/0xf0
[162513.517810] wait_current_trans+0xde/0x140 [btrfs]
[162513.517814] ? finish_wait+0x90/0x90
[162513.517829] start_transaction+0x37c/0x5f0 [btrfs]
[162513.517845] btrfs_attach_transaction_barrier+0x1f/0x50 [btrfs]
[162513.517857] btrfs_sync_fs+0x61/0x1c0 [btrfs]
[162513.517862] ? __ia32_sys_fdatasync+0x20/0x20
[162513.517865] iterate_supers+0x87/0xf0
[162513.517869] ksys_sync+0x60/0xb0
[162513.517872] __do_sys_sync+0xa/0x10
[162513.517875] do_syscall_64+0x33/0x80
[162513.517878] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[162513.517881] RIP: 0033:0x7f5238f50bd7
[162513.517883] Code: Bad RIP value.
[162513.517885] RSP: 002b:00007fff67b978e8 EFLAGS: 00000206 ORIG_RAX: 00000000000000a2
[162513.517887] RAX: ffffffffffffffda RBX: 000055b1fad2c560 RCX: 00007f5238f50bd7
[162513.517889] RDX: 0000000000000000 RSI: 000000007660add2 RDI: 0000000000000053
[162513.517891] RBP: 0000000000000032 R08: 0000000000000067 R09: 00007f5239019be0
[162513.517893] R10: fffffffffffff24f R11: 0000000000000206 R12: 0000000000000053
[162513.517895] R13: 00007fff67b97950 R14: 00007fff67b97906 R15: 000055b1fad1a340
[162513.517908] INFO: task fsstress:1356197 blocked for more than 120 seconds.
[162513.518298] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.518672] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.519157] task:fsstress state:D stack: 0 pid:1356197 ppid:1356177 flags:0x00000000
[162513.519160] Call Trace:
[162513.519165] __schedule+0x5ce/0xd00
[162513.519168] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.519174] schedule+0x46/0xf0
[162513.519190] wait_current_trans+0xde/0x140 [btrfs]
[162513.519193] ? finish_wait+0x90/0x90
[162513.519206] start_transaction+0x4d7/0x5f0 [btrfs]
[162513.519222] btrfs_create+0x57/0x200 [btrfs]
[162513.519230] lookup_open+0x522/0x650
[162513.519246] path_openat+0x2b8/0xa50
[162513.519270] do_filp_open+0x91/0x100
[162513.519275] ? find_held_lock+0x32/0x90
[162513.519280] ? lock_acquired+0x33b/0x470
[162513.519285] ? do_raw_spin_unlock+0x4b/0xc0
[162513.519287] ? _raw_spin_unlock+0x29/0x40
[162513.519295] do_sys_openat2+0x20d/0x2d0
[162513.519300] do_sys_open+0x44/0x80
[162513.519304] do_syscall_64+0x33/0x80
[162513.519307] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[162513.519309] RIP: 0033:0x7f5238f4a903
[162513.519310] Code: Bad RIP value.
[162513.519312] RSP: 002b:00007fff67b97758 EFLAGS: 00000246 ORIG_RAX: 0000000000000055
[162513.519314] RAX: ffffffffffffffda RBX: 00000000ffffffff RCX: 00007f5238f4a903
[162513.519316] RDX: 0000000000000000 RSI: 00000000000001b6 RDI: 000055b1fbb0d470
[162513.519317] RBP: 00007fff67b978c0 R08: 0000000000000001 R09: 0000000000000002
[162513.519319] R10: 00007fff67b974f7 R11: 0000000000000246 R12: 0000000000000013
[162513.519320] R13: 00000000000001b6 R14: 00007fff67b97906 R15: 000055b1fad1c620
[162513.519332] INFO: task btrfs:1356211 blocked for more than 120 seconds.
[162513.519727] Not tainted 5.9.0-rc6-btrfs-next-69 #1
[162513.520115] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[162513.520508] task:btrfs state:D stack: 0 pid:1356211 ppid:1356178 flags:0x00004002
[162513.520511] Call Trace:
[162513.520516] __schedule+0x5ce/0xd00
[162513.520519] ? _raw_spin_unlock_irqrestore+0x3c/0x60
[162513.520525] schedule+0x46/0xf0
[162513.520544] btrfs_scrub_pause+0x11f/0x180 [btrfs]
[162513.520548] ? finish_wait+0x90/0x90
[162513.520562] btrfs_commit_transaction+0x45a/0xc30 [btrfs]
[162513.520574] ? start_transaction+0xe0/0x5f0 [btrfs]
[162513.520596] btrfs_dev_replace_finishing+0x6d8/0x711 [btrfs]
[162513.520619] btrfs_dev_replace_by_ioctl.cold+0x1cc/0x1fd [btrfs]
[162513.520639] btrfs_ioctl+0x2a25/0x36f0 [btrfs]
[162513.520643] ? do_sigaction+0xf3/0x240
[162513.520645] ? find_held_lock+0x32/0x90
[162513.520648] ? do_sigaction+0xf3/0x240
[162513.520651] ? lock_acquired+0x33b/0x470
[162513.520655] ? _raw_spin_unlock_irq+0x24/0x50
[162513.520657] ? lockdep_hardirqs_on+0x7d/0x100
[162513.520660] ? _raw_spin_unlock_irq+0x35/0x50
[162513.520662] ? do_sigaction+0xf3/0x240
[162513.520671] ? __x64_sys_ioctl+0x83/0xb0
[162513.520672] __x64_sys_ioctl+0x83/0xb0
[162513.520677] do_syscall_64+0x33/0x80
[162513.520679] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[162513.520681] RIP: 0033:0x7fc3cd307d87
[162513.520682] Code: Bad RIP value.
[162513.520684] RSP: 002b:00007ffe30a56bb8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
[162513.520686] RAX: ffffffffffffffda RBX: 0000000000000004 RCX: 00007fc3cd307d87
[162513.520687] RDX: 00007ffe30a57a30 RSI: 00000000ca289435 RDI: 0000000000000003
[162513.520689] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
[162513.520690] R10: 0000000000000008 R11: 0000000000000202 R12: 0000000000000003
[162513.520692] R13: 0000557323a212e0 R14: 00007ffe30a5a520 R15: 0000000000000001
[162513.520703]
Showing all locks held in the system:
[162513.520712] 1 lock held by khungtaskd/54:
[162513.520713] #0: ffffffffb40a91a0 (rcu_read_lock){....}-{1:2}, at: debug_show_all_locks+0x15/0x197
[162513.520728] 1 lock held by in:imklog/596:
[162513.520729] #0: ffff8f3f0d781400 (&f->f_pos_lock){+.+.}-{3:3}, at: __fdget_pos+0x4d/0x60
[162513.520782] 1 lock held by btrfs-transacti/1356167:
[162513.520784] #0: ffff8f3d810cc848 (&fs_info->transaction_kthread_mutex){+.+.}-{3:3}, at: transaction_kthread+0x4a/0x170 [btrfs]
[162513.520798] 1 lock held by btrfs/1356190:
[162513.520800] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write_file+0x22/0x60
[162513.520805] 1 lock held by fsstress/1356184:
[162513.520806] #0: ffff8f3d576440e8 (&type->s_umount_key#62){++++}-{3:3}, at: iterate_supers+0x6f/0xf0
[162513.520811] 3 locks held by fsstress/1356185:
[162513.520812] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write+0x20/0x50
[162513.520815] #1: ffff8f3d80a650b8 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: vfs_setxattr+0x50/0x120
[162513.520820] #2: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs]
[162513.520833] 1 lock held by fsstress/1356196:
[162513.520834] #0: ffff8f3d576440e8 (&type->s_umount_key#62){++++}-{3:3}, at: iterate_supers+0x6f/0xf0
[162513.520838] 3 locks held by fsstress/1356197:
[162513.520839] #0: ffff8f3d57644470 (sb_writers#15){.+.+}-{0:0}, at: mnt_want_write+0x20/0x50
[162513.520843] #1: ffff8f3d506465e8 (&type->i_mutex_dir_key#10){++++}-{3:3}, at: path_openat+0x2a7/0xa50
[162513.520846] #2: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs]
[162513.520858] 2 locks held by btrfs/1356211:
[162513.520859] #0: ffff8f3d810cde30 (&fs_info->dev_replace.lock_finishing_cancel_unmount){+.+.}-{3:3}, at: btrfs_dev_replace_finishing+0x52/0x711 [btrfs]
[162513.520877] #1: ffff8f3d57644690 (sb_internal#2){.+.+}-{0:0}, at: start_transaction+0x40e/0x5f0 [btrfs]
This was weird because the stack traces show that a transaction commit,
triggered by a device replace operation, is blocking trying to pause any
running scrubs but there are no stack traces of blocked tasks doing a
scrub.
After poking around with drgn, I noticed there was a scrub task that was
constantly running and blocking for short periods of time:
>>> t = find_task(prog, 1356190)
>>> prog.stack_trace(t)
#0 __schedule+0x5ce/0xcfc
#1 schedule+0x46/0xe4
#2 schedule_timeout+0x1df/0x475
#3 btrfs_reada_wait+0xda/0x132
#4 scrub_stripe+0x2a8/0x112f
#5 scrub_chunk+0xcd/0x134
#6 scrub_enumerate_chunks+0x29e/0x5ee
#7 btrfs_scrub_dev+0x2d5/0x91b
#8 btrfs_ioctl+0x7f5/0x36e7
#9 __x64_sys_ioctl+0x83/0xb0
#10 do_syscall_64+0x33/0x77
#11 entry_SYSCALL_64+0x7c/0x156
Which corresponds to:
int btrfs_reada_wait(void *handle)
{
struct reada_control *rc = handle;
struct btrfs_fs_info *fs_info = rc->fs_info;
while (atomic_read(&rc->elems)) {
if (!atomic_read(&fs_info->reada_works_cnt))
reada_start_machine(fs_info);
wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
(HZ + 9) / 10);
}
(...)
So the counter "rc->elems" was set to 1 and never decreased to 0, causing
the scrub task to loop forever in that function. Then I used the following
script for drgn to check the readahead requests:
$ cat dump_reada.py
import sys
import drgn
from drgn import NULL, Object, cast, container_of, execscript, \
reinterpret, sizeof
from drgn.helpers.linux import *
mnt_path = b"/home/fdmanana/btrfs-tests/scratch_1"
mnt = None
for mnt in for_each_mount(prog, dst = mnt_path):
pass
if mnt is None:
sys.stderr.write(f'Error: mount point {mnt_path} not found\n')
sys.exit(1)
fs_info = cast('struct btrfs_fs_info *', mnt.mnt.mnt_sb.s_fs_info)
def dump_re(re):
nzones = re.nzones.value_()
print(f're at {hex(re.value_())}')
print(f'\t logical {re.logical.value_()}')
print(f'\t refcnt {re.refcnt.value_()}')
print(f'\t nzones {nzones}')
for i in range(nzones):
dev = re.zones[i].device
name = dev.name.str.string_()
print(f'\t\t dev id {dev.devid.value_()} name {name}')
print()
for _, e in radix_tree_for_each(fs_info.reada_tree):
re = cast('struct reada_extent *', e)
dump_re(re)
$ drgn dump_reada.py
re at 0xffff8f3da9d25ad8
logical 38928384
refcnt 1
nzones 1
dev id 0 name b'/dev/sdd'
$
So there was one readahead extent with a single zone corresponding to the
source device of that last device replace operation logged in dmesg/syslog.
Also the ID of that zone's device was 0 which is a special value set in
the source device of a device replace operation when the operation finishes
(constant BTRFS_DEV_REPLACE_DEVID set at btrfs_dev_replace_finishing()),
confirming again that device /dev/sdd was the source of a device replace
operation.
Normally there should be as many zones in the readahead extent as there are
devices, and I wasn't expecting the extent to be in a block group with a
'single' profile, so I went and confirmed with the following drgn script
that there weren't any single profile block groups:
$ cat dump_block_groups.py
import sys
import drgn
from drgn import NULL, Object, cast, container_of, execscript, \
reinterpret, sizeof
from drgn.helpers.linux import *
mnt_path = b"/home/fdmanana/btrfs-tests/scratch_1"
mnt = None
for mnt in for_each_mount(prog, dst = mnt_path):
pass
if mnt is None:
sys.stderr.write(f'Error: mount point {mnt_path} not found\n')
sys.exit(1)
fs_info = cast('struct btrfs_fs_info *', mnt.mnt.mnt_sb.s_fs_info)
BTRFS_BLOCK_GROUP_DATA = (1 << 0)
BTRFS_BLOCK_GROUP_SYSTEM = (1 << 1)
BTRFS_BLOCK_GROUP_METADATA = (1 << 2)
BTRFS_BLOCK_GROUP_RAID0 = (1 << 3)
BTRFS_BLOCK_GROUP_RAID1 = (1 << 4)
BTRFS_BLOCK_GROUP_DUP = (1 << 5)
BTRFS_BLOCK_GROUP_RAID10 = (1 << 6)
BTRFS_BLOCK_GROUP_RAID5 = (1 << 7)
BTRFS_BLOCK_GROUP_RAID6 = (1 << 8)
BTRFS_BLOCK_GROUP_RAID1C3 = (1 << 9)
BTRFS_BLOCK_GROUP_RAID1C4 = (1 << 10)
def bg_flags_string(bg):
flags = bg.flags.value_()
ret = ''
if flags & BTRFS_BLOCK_GROUP_DATA:
ret = 'data'
if flags & BTRFS_BLOCK_GROUP_METADATA:
if len(ret) > 0:
ret += '|'
ret += 'meta'
if flags & BTRFS_BLOCK_GROUP_SYSTEM:
if len(ret) > 0:
ret += '|'
ret += 'system'
if flags & BTRFS_BLOCK_GROUP_RAID0:
ret += ' raid0'
elif flags & BTRFS_BLOCK_GROUP_RAID1:
ret += ' raid1'
elif flags & BTRFS_BLOCK_GROUP_DUP:
ret += ' dup'
elif flags & BTRFS_BLOCK_GROUP_RAID10:
ret += ' raid10'
elif flags & BTRFS_BLOCK_GROUP_RAID5:
ret += ' raid5'
elif flags & BTRFS_BLOCK_GROUP_RAID6:
ret += ' raid6'
elif flags & BTRFS_BLOCK_GROUP_RAID1C3:
ret += ' raid1c3'
elif flags & BTRFS_BLOCK_GROUP_RAID1C4:
ret += ' raid1c4'
else:
ret += ' single'
return ret
def dump_bg(bg):
print()
print(f'block group at {hex(bg.value_())}')
print(f'\t start {bg.start.value_()} length {bg.length.value_()}')
print(f'\t flags {bg.flags.value_()} - {bg_flags_string(bg)}')
bg_root = fs_info.block_group_cache_tree.address_of_()
for bg in rbtree_inorder_for_each_entry('struct btrfs_block_group', bg_root, 'cache_node'):
dump_bg(bg)
$ drgn dump_block_groups.py
block group at 0xffff8f3d673b0400
start 22020096 length 16777216
flags 258 - system raid6
block group at 0xffff8f3d53ddb400
start 38797312 length 536870912
flags 260 - meta raid6
block group at 0xffff8f3d5f4d9c00
start 575668224 length 2147483648
flags 257 - data raid6
block group at 0xffff8f3d08189000
start 2723151872 length 67108864
flags 258 - system raid6
block group at 0xffff8f3db70ff000
start 2790260736 length 1073741824
flags 260 - meta raid6
block group at 0xffff8f3d5f4dd800
start 3864002560 length 67108864
flags 258 - system raid6
block group at 0xffff8f3d67037000
start 3931111424 length 2147483648
flags 257 - data raid6
$
So there were only 2 reasons left for having a readahead extent with a
single zone: reada_find_zone(), called when creating a readahead extent,
returned NULL either because we failed to find the corresponding block
group or because a memory allocation failed. With some additional and
custom tracing I figured out that on every further occurrence of the
problem the block group had just been deleted when we were looping to
create the zones for the readahead extent (at reada_find_extent()), so we
ended up with only one zone in the readahead extent, corresponding to a
device that ends up getting replaced.
So after figuring that out it became obvious why the hang happens:
1) Task A starts a scrub on any device of the filesystem, except for
device /dev/sdd;
2) Task B starts a device replace with /dev/sdd as the source device;
3) Task A calls btrfs_reada_add() from scrub_stripe() and it is currently
starting to scrub a stripe from block group X. This call to
btrfs_reada_add() is the one for the extent tree. When btrfs_reada_add()
calls reada_add_block(), it passes the logical address of the extent
tree's root node as its 'logical' argument - a value of 38928384;
4) Task A then enters reada_find_extent(), called from reada_add_block().
It finds there isn't any existing readahead extent for the logical
address 38928384, so it proceeds to the path of creating a new one.
It calls btrfs_map_block() to find out which stripes exist for the block
group X. On the first iteration of the for loop that iterates over the
stripes, it finds the stripe for device /dev/sdd, so it creates one
zone for that device and adds it to the readahead extent. Before getting
into the second iteration of the loop, the cleanup kthread deletes block
group X because it was empty. So in the iterations for the remaining
stripes it does not add more zones to the readahead extent, because the
calls to reada_find_zone() returned NULL because they couldn't find
block group X anymore.
As a result the new readahead extent has a single zone, corresponding to
the device /dev/sdd;
5) Before task A returns to btrfs_reada_add() and queues the readahead job
for the readahead work queue, task B finishes the device replace and at
btrfs_dev_replace_finishing() swaps the device /dev/sdd with the new
device /dev/sdg;
6) Task A returns to reada_add_block(), which increments the counter
"->elems" of the reada_control structure allocated at btrfs_reada_add().
Then it returns back to btrfs_reada_add() and calls
reada_start_machine(). This queues a job in the readahead work queue to
run the function reada_start_machine_worker(), which calls
__reada_start_machine().
At __reada_start_machine() we take the device list mutex and for each
device found in the current device list, we call
reada_start_machine_dev() to start the readahead work. However at this
point the device /dev/sdd was already freed and is not in the device
list anymore.
This means the corresponding readahead for the extent at 38928384 is
never started, and therefore the "->elems" counter of the reada_control
structure allocated at btrfs_reada_add() never goes down to 0, causing
the call to btrfs_reada_wait(), done by the scrub task, to wait forever.
Note that the readahead request can be made either after the device replace
started or before it started, however in practice it is very unlikely that a
device replace is able to start after a readahead request is made and is
able to complete before the readahead request completes - maybe only on a
very small and nearly empty filesystem.
This hang however is not the only problem we can have with readahead and
device removals. When the readahead extent has zones other than the
one corresponding to the device that is being removed (either by a device
replace or a device remove operation), we risk having a use-after-free on
the device when dropping the last reference of the readahead extent.
For example if we create a readahead extent with two zones, one for the
device /dev/sdd and one for the device /dev/sde:
1) Before the readahead worker starts, the device /dev/sdd is removed,
and the corresponding btrfs_device structure is freed. However the
readahead extent still has the zone pointing to the device structure;
2) When the readahead worker starts, it only finds device /dev/sde in the
current device list of the filesystem;
3) It starts the readahead work, at reada_start_machine_dev(), using the
device /dev/sde;
4) Then when it finishes reading the extent from device /dev/sde, it calls
__readahead_hook() which ends up dropping the last reference on the
readahead extent through the last call to reada_extent_put();
5) At reada_extent_put() it iterates over each zone of the readahead extent
and attempts to delete an element from the device's 'reada_extents'
radix tree, resulting in a use-after-free, as the device pointer of the
zone for /dev/sdd is now stale. We can also access the device after
dropping the last reference of a zone, through reada_zone_release(),
also called by reada_extent_put().
And a device remove suffers the same problem, however since it shrinks the
device size down to zero before removing the device, it is very unlikely to
still have readahead requests not completed by the time we free the device,
the only possibility is if the device has very little space allocated.
While the hang problem is exclusive to scrub, since it is currently the
only user of btrfs_reada_add() and btrfs_reada_wait(), the use-after-free
problem affects any path that triggers readahead, which includes
btree_readahead_hook() and __readahead_hook() (a readahead worker can
trigger readahead for the children of a node) for example - any path that
ends up calling reada_add_block() can trigger the use-after-free after a
device is removed.
So fix this by waiting for any readahead requests for a device to complete
before removing a device, ensuring that while waiting for existing ones no
new ones can be made.
This problem has been around for a very long time - the readahead code was
added in 2011, device remove exists since 2008 and device replace was
introduced in 2013, hard to pick a specific commit for a git Fixes tag.
CC: stable@vger.kernel.org # 4.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-10-12 18:55:24 +08:00
|
|
|
btrfs_reada_undo_remove_dev(src_device);
|
btrfs: Wait for in-flight bios before freeing target device for raid56
When raid56 dev-replace is cancelled by running scrub, we will free
target device without waiting for in-flight bios, causing the following
NULL pointer dereference or general protection failure.
BUG: unable to handle kernel NULL pointer dereference at 00000000000005e0
IP: generic_make_request_checks+0x4d/0x610
CPU: 1 PID: 11676 Comm: kworker/u4:14 Tainted: G O 4.11.0-rc2 #72
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-20170228_101828-anatol 04/01/2014
Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper [btrfs]
task: ffff88002875b4c0 task.stack: ffffc90001334000
RIP: 0010:generic_make_request_checks+0x4d/0x610
Call Trace:
? generic_make_request+0xc7/0x360
generic_make_request+0x24/0x360
? generic_make_request+0xc7/0x360
submit_bio+0x64/0x120
? page_in_rbio+0x4d/0x80 [btrfs]
? rbio_orig_end_io+0x80/0x80 [btrfs]
finish_rmw+0x3f4/0x540 [btrfs]
validate_rbio_for_rmw+0x36/0x40 [btrfs]
raid_rmw_end_io+0x7a/0x90 [btrfs]
bio_endio+0x56/0x60
end_workqueue_fn+0x3c/0x40 [btrfs]
btrfs_scrubparity_helper+0xef/0x620 [btrfs]
btrfs_endio_raid56_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
RIP: generic_make_request_checks+0x4d/0x610 RSP: ffffc90001337bb8
In btrfs_dev_replace_finishing(), we will call
btrfs_rm_dev_replace_blocked() to wait for bios before destroying the target
device when scrub is finished normally.
However when dev-replace is aborted, either due to error or cancelled by
scrub, we didn't wait for bios, this can lead to use-after-free if there
are bios holding the target device.
Furthermore, for raid56 scrub, at least 2 places are calling
btrfs_map_sblock() without protection of bio_counter, leading to the
problem.
This patch fixes the problem:
1) Wait for bio_counter before freeing target device when canceling
replace
2) When calling btrfs_map_sblock() for raid56, use bio_counter to
protect the call.
Cc: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-29 09:33:21 +08:00
|
|
|
btrfs_rm_dev_replace_blocked(fs_info);
|
2012-11-06 00:33:06 +08:00
|
|
|
if (tgt_device)
|
2018-07-21 00:37:51 +08:00
|
|
|
btrfs_destroy_dev_replace_tgtdev(tgt_device);
|
btrfs: Wait for in-flight bios before freeing target device for raid56
When raid56 dev-replace is cancelled by running scrub, we will free
target device without waiting for in-flight bios, causing the following
NULL pointer dereference or general protection failure.
BUG: unable to handle kernel NULL pointer dereference at 00000000000005e0
IP: generic_make_request_checks+0x4d/0x610
CPU: 1 PID: 11676 Comm: kworker/u4:14 Tainted: G O 4.11.0-rc2 #72
Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-20170228_101828-anatol 04/01/2014
Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper [btrfs]
task: ffff88002875b4c0 task.stack: ffffc90001334000
RIP: 0010:generic_make_request_checks+0x4d/0x610
Call Trace:
? generic_make_request+0xc7/0x360
generic_make_request+0x24/0x360
? generic_make_request+0xc7/0x360
submit_bio+0x64/0x120
? page_in_rbio+0x4d/0x80 [btrfs]
? rbio_orig_end_io+0x80/0x80 [btrfs]
finish_rmw+0x3f4/0x540 [btrfs]
validate_rbio_for_rmw+0x36/0x40 [btrfs]
raid_rmw_end_io+0x7a/0x90 [btrfs]
bio_endio+0x56/0x60
end_workqueue_fn+0x3c/0x40 [btrfs]
btrfs_scrubparity_helper+0xef/0x620 [btrfs]
btrfs_endio_raid56_helper+0xe/0x10 [btrfs]
process_one_work+0x2af/0x720
? process_one_work+0x22b/0x720
worker_thread+0x4b/0x4f0
kthread+0x10f/0x150
? process_one_work+0x720/0x720
? kthread_create_on_node+0x40/0x40
ret_from_fork+0x2e/0x40
RIP: generic_make_request_checks+0x4d/0x610 RSP: ffffc90001337bb8
In btrfs_dev_replace_finishing(), we will call
btrfs_rm_dev_replace_blocked() to wait for bios before destroying the target
device when scrub is finished normally.
However when dev-replace is aborted, either due to error or cancelled by
scrub, we didn't wait for bios, this can lead to use-after-free if there
are bios holding the target device.
Furthermore, for raid56 scrub, at least 2 places are calling
btrfs_map_sblock() without protection of bio_counter, leading to the
problem.
This patch fixes the problem:
1) Wait for bio_counter before freeing target device when canceling
replace
2) When calling btrfs_map_sblock() for raid56, use bio_counter to
protect the call.
Cc: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-29 09:33:21 +08:00
|
|
|
btrfs_rm_dev_replace_unblocked(fs_info);
|
2012-11-06 00:33:06 +08:00
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
|
2014-10-13 12:42:12 +08:00
|
|
|
return scrub_ret;
|
2012-11-06 00:33:06 +08:00
|
|
|
}
|
|
|
|
|
2016-06-23 06:54:23 +08:00
|
|
|
btrfs_info_in_rcu(fs_info,
|
|
|
|
"dev_replace from %s (devid %llu) to %s finished",
|
2017-11-28 10:43:10 +08:00
|
|
|
btrfs_dev_name(src_device),
|
2016-06-23 06:54:23 +08:00
|
|
|
src_device->devid,
|
|
|
|
rcu_str_deref(tgt_device->name));
|
2017-12-04 12:54:55 +08:00
|
|
|
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
|
2012-11-06 00:33:06 +08:00
|
|
|
tgt_device->devid = src_device->devid;
|
|
|
|
src_device->devid = BTRFS_DEV_REPLACE_DEVID;
|
|
|
|
memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
|
|
|
|
memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
|
|
|
|
memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
|
2014-09-03 21:35:38 +08:00
|
|
|
btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
|
|
|
|
btrfs_device_set_disk_total_bytes(tgt_device,
|
|
|
|
src_device->disk_total_bytes);
|
|
|
|
btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
|
2014-09-03 21:35:34 +08:00
|
|
|
tgt_device->commit_bytes_used = src_device->bytes_used;
|
2016-05-03 17:44:43 +08:00
|
|
|
|
2018-07-21 00:37:50 +08:00
|
|
|
btrfs_assign_next_active_device(src_device, tgt_device);
|
2016-05-03 17:44:43 +08:00
|
|
|
|
2012-11-06 00:33:06 +08:00
|
|
|
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
|
2014-09-03 21:35:44 +08:00
|
|
|
fs_info->fs_devices->rw_devices++;
|
2012-11-06 00:33:06 +08:00
|
|
|
|
2018-09-07 22:11:23 +08:00
|
|
|
up_write(&dev_replace->rwsem);
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer dereference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which includes the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need to make sure all in-flight bios are finished and
no new bios are produced while we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outside of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace doesn't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
btrfs_rm_dev_replace_blocked(fs_info);
|
|
|
|
|
2018-07-21 00:37:48 +08:00
|
|
|
btrfs_rm_dev_replace_remove_srcdev(src_device);
|
2013-10-03 01:41:01 +08:00
|
|
|
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer dereference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which includes the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need to make sure all in-flight bios are finished and
no new bios are produced while we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outside of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace doesn't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
btrfs_rm_dev_replace_unblocked(fs_info);
|
|
|
|
|
2018-07-31 15:20:21 +08:00
|
|
|
/*
|
|
|
|
* Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
|
|
|
|
* update on-disk dev stats value during commit transaction
|
|
|
|
*/
|
|
|
|
atomic_inc(&tgt_device->dev_stats_ccnt);
|
|
|
|
|
2012-11-06 00:33:06 +08:00
|
|
|
/*
|
|
|
|
* this is again a consistent state where no dev_replace procedure
|
|
|
|
* is running, the target device is part of the filesystem, the
|
|
|
|
* source device is not part of the filesystem anymore and its 1st
|
|
|
|
* superblock is scratched out so that it is no longer marked to
|
|
|
|
* belong to this filesystem.
|
|
|
|
*/
|
2016-06-23 06:54:23 +08:00
|
|
|
mutex_unlock(&fs_info->chunk_mutex);
|
|
|
|
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
|
2012-11-06 00:33:06 +08:00
|
|
|
|
2014-10-30 16:52:31 +08:00
|
|
|
/* replace the sysfs entry */
|
2020-09-05 01:34:27 +08:00
|
|
|
btrfs_sysfs_remove_device(src_device);
|
2020-01-06 19:38:31 +08:00
|
|
|
btrfs_sysfs_update_devid(tgt_device);
|
2020-08-20 23:18:26 +08:00
|
|
|
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
|
|
|
|
btrfs_scratch_superblocks(fs_info, src_device->bdev,
|
|
|
|
src_device->name->str);
|
2014-10-30 16:52:31 +08:00
|
|
|
|
2012-11-06 00:33:06 +08:00
|
|
|
/* write back the superblocks */
|
|
|
|
trans = btrfs_start_transaction(root, 0);
|
|
|
|
if (!IS_ERR(trans))
|
2016-09-10 09:39:03 +08:00
|
|
|
btrfs_commit_transaction(trans);
|
2012-11-06 00:33:06 +08:00
|
|
|
|
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
|
btrfs: move btrfs_rm_dev_replace_free_srcdev outside of all locks
When closing and freeing the source device we could end up doing our
final blkdev_put() on the bdev, which will grab the bd_mutex. As such
we want to be holding as few locks as possible, so move this call
outside of the dev_replace->lock_finishing_cancel_unmount lock. Since
we're modifying the fs_devices we need to make sure we're holding the
uuid_mutex here, so take that as well.
There's a report from syzbot probably hitting one of the cases where
the bd_mutex and device_list_mutex are taken in the wrong order, however
it's not with device replace, like this patch fixes. As there's no
reproducer available so far, we can't verify the fix.
https://lore.kernel.org/lkml/000000000000fc04d105afcf86d7@google.com/
dashboard link: https://syzkaller.appspot.com/bug?extid=84a0634dc5d21d488419
WARNING: possible circular locking dependency detected
5.9.0-rc5-syzkaller #0 Not tainted
------------------------------------------------------
syz-executor.0/6878 is trying to acquire lock:
ffff88804c17d780 (&bdev->bd_mutex){+.+.}-{3:3}, at: blkdev_put+0x30/0x520 fs/block_dev.c:1804
but task is already holding lock:
ffff8880908cfce0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: close_fs_devices.part.0+0x2e/0x800 fs/btrfs/volumes.c:1159
which lock already depends on the new lock.
the existing dependency chain (in reverse order) is:
-> #4 (&fs_devs->device_list_mutex){+.+.}-{3:3}:
__mutex_lock_common kernel/locking/mutex.c:956 [inline]
__mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103
btrfs_finish_chunk_alloc+0x281/0xf90 fs/btrfs/volumes.c:5255
btrfs_create_pending_block_groups+0x2f3/0x700 fs/btrfs/block-group.c:2109
__btrfs_end_transaction+0xf5/0x690 fs/btrfs/transaction.c:916
find_free_extent_update_loop fs/btrfs/extent-tree.c:3807 [inline]
find_free_extent+0x23b7/0x2e60 fs/btrfs/extent-tree.c:4127
btrfs_reserve_extent+0x166/0x460 fs/btrfs/extent-tree.c:4206
cow_file_range+0x3de/0x9b0 fs/btrfs/inode.c:1063
btrfs_run_delalloc_range+0x2cf/0x1410 fs/btrfs/inode.c:1838
writepage_delalloc+0x150/0x460 fs/btrfs/extent_io.c:3439
__extent_writepage+0x441/0xd00 fs/btrfs/extent_io.c:3653
extent_write_cache_pages.constprop.0+0x69d/0x1040 fs/btrfs/extent_io.c:4249
extent_writepages+0xcd/0x2b0 fs/btrfs/extent_io.c:4370
do_writepages+0xec/0x290 mm/page-writeback.c:2352
__writeback_single_inode+0x125/0x1400 fs/fs-writeback.c:1461
writeback_sb_inodes+0x53d/0xf40 fs/fs-writeback.c:1721
wb_writeback+0x2ad/0xd40 fs/fs-writeback.c:1894
wb_do_writeback fs/fs-writeback.c:2039 [inline]
wb_workfn+0x2dc/0x13e0 fs/fs-writeback.c:2080
process_one_work+0x94c/0x1670 kernel/workqueue.c:2269
worker_thread+0x64c/0x1120 kernel/workqueue.c:2415
kthread+0x3b5/0x4a0 kernel/kthread.c:292
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294
-> #3 (sb_internal#2){.+.+}-{0:0}:
percpu_down_read include/linux/percpu-rwsem.h:51 [inline]
__sb_start_write+0x234/0x470 fs/super.c:1672
sb_start_intwrite include/linux/fs.h:1690 [inline]
start_transaction+0xbe7/0x1170 fs/btrfs/transaction.c:624
find_free_extent_update_loop fs/btrfs/extent-tree.c:3789 [inline]
find_free_extent+0x25e1/0x2e60 fs/btrfs/extent-tree.c:4127
btrfs_reserve_extent+0x166/0x460 fs/btrfs/extent-tree.c:4206
cow_file_range+0x3de/0x9b0 fs/btrfs/inode.c:1063
btrfs_run_delalloc_range+0x2cf/0x1410 fs/btrfs/inode.c:1838
writepage_delalloc+0x150/0x460 fs/btrfs/extent_io.c:3439
__extent_writepage+0x441/0xd00 fs/btrfs/extent_io.c:3653
extent_write_cache_pages.constprop.0+0x69d/0x1040 fs/btrfs/extent_io.c:4249
extent_writepages+0xcd/0x2b0 fs/btrfs/extent_io.c:4370
do_writepages+0xec/0x290 mm/page-writeback.c:2352
__writeback_single_inode+0x125/0x1400 fs/fs-writeback.c:1461
writeback_sb_inodes+0x53d/0xf40 fs/fs-writeback.c:1721
wb_writeback+0x2ad/0xd40 fs/fs-writeback.c:1894
wb_do_writeback fs/fs-writeback.c:2039 [inline]
wb_workfn+0x2dc/0x13e0 fs/fs-writeback.c:2080
process_one_work+0x94c/0x1670 kernel/workqueue.c:2269
worker_thread+0x64c/0x1120 kernel/workqueue.c:2415
kthread+0x3b5/0x4a0 kernel/kthread.c:292
ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:294
-> #2 ((work_completion)(&(&wb->dwork)->work)){+.+.}-{0:0}:
__flush_work+0x60e/0xac0 kernel/workqueue.c:3041
wb_shutdown+0x180/0x220 mm/backing-dev.c:355
bdi_unregister+0x174/0x590 mm/backing-dev.c:872
del_gendisk+0x820/0xa10 block/genhd.c:933
loop_remove drivers/block/loop.c:2192 [inline]
loop_control_ioctl drivers/block/loop.c:2291 [inline]
loop_control_ioctl+0x3b1/0x480 drivers/block/loop.c:2257
vfs_ioctl fs/ioctl.c:48 [inline]
__do_sys_ioctl fs/ioctl.c:753 [inline]
__se_sys_ioctl fs/ioctl.c:739 [inline]
__x64_sys_ioctl+0x193/0x200 fs/ioctl.c:739
do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
entry_SYSCALL_64_after_hwframe+0x44/0xa9
-> #1 (loop_ctl_mutex){+.+.}-{3:3}:
__mutex_lock_common kernel/locking/mutex.c:956 [inline]
__mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103
lo_open+0x19/0xd0 drivers/block/loop.c:1893
__blkdev_get+0x759/0x1aa0 fs/block_dev.c:1507
blkdev_get fs/block_dev.c:1639 [inline]
blkdev_open+0x227/0x300 fs/block_dev.c:1753
do_dentry_open+0x4b9/0x11b0 fs/open.c:817
do_open fs/namei.c:3251 [inline]
path_openat+0x1b9a/0x2730 fs/namei.c:3368
do_filp_open+0x17e/0x3c0 fs/namei.c:3395
do_sys_openat2+0x16d/0x420 fs/open.c:1168
do_sys_open fs/open.c:1184 [inline]
__do_sys_open fs/open.c:1192 [inline]
__se_sys_open fs/open.c:1188 [inline]
__x64_sys_open+0x119/0x1c0 fs/open.c:1188
do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46
entry_SYSCALL_64_after_hwframe+0x44/0xa9
-> #0 (&bdev->bd_mutex){+.+.}-{3:3}:
check_prev_add kernel/locking/lockdep.c:2496 [inline]
check_prevs_add kernel/locking/lockdep.c:2601 [inline]
validate_chain kernel/locking/lockdep.c:3218 [inline]
__lock_acquire+0x2a96/0x5780 kernel/locking/lockdep.c:4426
lock_acquire+0x1f3/0xae0 kernel/locking/lockdep.c:5006
__mutex_lock_common kernel/locking/mutex.c:956 [inline]
__mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103
blkdev_put+0x30/0x520 fs/block_dev.c:1804
btrfs_close_bdev fs/btrfs/volumes.c:1117 [inline]
btrfs_close_bdev fs/btrfs/volumes.c:1107 [inline]
btrfs_close_one_device fs/btrfs/volumes.c:1133 [inline]
close_fs_devices.part.0+0x1a4/0x800 fs/btrfs/volumes.c:1161
close_fs_devices fs/btrfs/volumes.c:1193 [inline]
btrfs_close_devices+0x95/0x1f0 fs/btrfs/volumes.c:1179
close_ctree+0x688/0x6cb fs/btrfs/disk-io.c:4149
generic_shutdown_super+0x144/0x370 fs/super.c:464
kill_anon_super+0x36/0x60 fs/super.c:1108
btrfs_kill_super+0x38/0x50 fs/btrfs/super.c:2265
deactivate_locked_super+0x94/0x160 fs/super.c:335
deactivate_super+0xad/0xd0 fs/super.c:366
cleanup_mnt+0x3a3/0x530 fs/namespace.c:1118
task_work_run+0xdd/0x190 kernel/task_work.c:141
tracehook_notify_resume include/linux/tracehook.h:188 [inline]
exit_to_user_mode_loop kernel/entry/common.c:163 [inline]
exit_to_user_mode_prepare+0x1e1/0x200 kernel/entry/common.c:190
syscall_exit_to_user_mode+0x7e/0x2e0 kernel/entry/common.c:265
entry_SYSCALL_64_after_hwframe+0x44/0xa9
other info that might help us debug this:
Chain exists of:
&bdev->bd_mutex --> sb_internal#2 --> &fs_devs->device_list_mutex
Possible unsafe locking scenario:
CPU0 CPU1
---- ----
lock(&fs_devs->device_list_mutex);
lock(sb_internal#2);
lock(&fs_devs->device_list_mutex);
lock(&bdev->bd_mutex);
*** DEADLOCK ***
3 locks held by syz-executor.0/6878:
#0: ffff88809070c0e0 (&type->s_umount_key#70){++++}-{3:3}, at: deactivate_super+0xa5/0xd0 fs/super.c:365
#1: ffffffff8a5b37a8 (uuid_mutex){+.+.}-{3:3}, at: btrfs_close_devices+0x23/0x1f0 fs/btrfs/volumes.c:1178
#2: ffff8880908cfce0 (&fs_devs->device_list_mutex){+.+.}-{3:3}, at: close_fs_devices.part.0+0x2e/0x800 fs/btrfs/volumes.c:1159
stack backtrace:
CPU: 0 PID: 6878 Comm: syz-executor.0 Not tainted 5.9.0-rc5-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
Call Trace:
__dump_stack lib/dump_stack.c:77 [inline]
dump_stack+0x198/0x1fd lib/dump_stack.c:118
check_noncircular+0x324/0x3e0 kernel/locking/lockdep.c:1827
check_prev_add kernel/locking/lockdep.c:2496 [inline]
check_prevs_add kernel/locking/lockdep.c:2601 [inline]
validate_chain kernel/locking/lockdep.c:3218 [inline]
__lock_acquire+0x2a96/0x5780 kernel/locking/lockdep.c:4426
lock_acquire+0x1f3/0xae0 kernel/locking/lockdep.c:5006
__mutex_lock_common kernel/locking/mutex.c:956 [inline]
__mutex_lock+0x134/0x10e0 kernel/locking/mutex.c:1103
blkdev_put+0x30/0x520 fs/block_dev.c:1804
btrfs_close_bdev fs/btrfs/volumes.c:1117 [inline]
btrfs_close_bdev fs/btrfs/volumes.c:1107 [inline]
btrfs_close_one_device fs/btrfs/volumes.c:1133 [inline]
close_fs_devices.part.0+0x1a4/0x800 fs/btrfs/volumes.c:1161
close_fs_devices fs/btrfs/volumes.c:1193 [inline]
btrfs_close_devices+0x95/0x1f0 fs/btrfs/volumes.c:1179
close_ctree+0x688/0x6cb fs/btrfs/disk-io.c:4149
generic_shutdown_super+0x144/0x370 fs/super.c:464
kill_anon_super+0x36/0x60 fs/super.c:1108
btrfs_kill_super+0x38/0x50 fs/btrfs/super.c:2265
deactivate_locked_super+0x94/0x160 fs/super.c:335
deactivate_super+0xad/0xd0 fs/super.c:366
cleanup_mnt+0x3a3/0x530 fs/namespace.c:1118
task_work_run+0xdd/0x190 kernel/task_work.c:141
tracehook_notify_resume include/linux/tracehook.h:188 [inline]
exit_to_user_mode_loop kernel/entry/common.c:163 [inline]
exit_to_user_mode_prepare+0x1e1/0x200 kernel/entry/common.c:190
syscall_exit_to_user_mode+0x7e/0x2e0 kernel/entry/common.c:265
entry_SYSCALL_64_after_hwframe+0x44/0xa9
RIP: 0033:0x460027
RSP: 002b:00007fff59216328 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 0000000000076035 RCX: 0000000000460027
RDX: 0000000000403188 RSI: 0000000000000002 RDI: 00007fff592163d0
RBP: 0000000000000333 R08: 0000000000000000 R09: 000000000000000b
R10: 0000000000000005 R11: 0000000000000246 R12: 00007fff59217460
R13: 0000000002df2a60 R14: 0000000000000000 R15: 00007fff59217460
Signed-off-by: Josef Bacik <josef@toxicpanda.com>
[ add syzbot reference ]
Signed-off-by: David Sterba <dsterba@suse.com>
2020-08-20 23:18:27 +08:00
|
|
|
btrfs_rm_dev_replace_free_srcdev(src_device);
|
|
|
|
|
2012-11-06 00:33:06 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-06-14 22:24:56 +08:00
|
|
|
/*
|
|
|
|
* Read progress of device replace status according to the state and last
|
|
|
|
* stored position. The value format is the same as for
|
|
|
|
* btrfs_dev_replace::progress_1000
|
|
|
|
*/
|
|
|
|
static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
u64 ret = 0;
|
|
|
|
|
|
|
|
switch (dev_replace->replace_state) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
|
|
|
|
ret = 0;
|
|
|
|
break;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
|
|
|
|
ret = 1000;
|
|
|
|
break;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
|
|
|
|
ret = div64_u64(dev_replace->cursor_left,
|
|
|
|
div_u64(btrfs_device_get_total_bytes(
|
|
|
|
dev_replace->srcdev), 1000));
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2012-11-06 00:33:06 +08:00
|
|
|
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_ioctl_dev_replace_args *args)
|
|
|
|
{
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
|
2018-09-07 22:11:23 +08:00
|
|
|
down_read(&dev_replace->rwsem);
|
2012-11-06 00:33:06 +08:00
|
|
|
/* even if !dev_replace_is_valid, the values are good enough for
|
|
|
|
* the replace_status ioctl */
|
|
|
|
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
|
|
|
|
args->status.replace_state = dev_replace->replace_state;
|
|
|
|
args->status.time_started = dev_replace->time_started;
|
|
|
|
args->status.time_stopped = dev_replace->time_stopped;
|
|
|
|
args->status.num_write_errors =
|
|
|
|
atomic64_read(&dev_replace->num_write_errors);
|
|
|
|
args->status.num_uncorrectable_read_errors =
|
|
|
|
atomic64_read(&dev_replace->num_uncorrectable_read_errors);
|
2017-06-14 22:24:56 +08:00
|
|
|
args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
|
2018-09-07 22:11:23 +08:00
|
|
|
up_read(&dev_replace->rwsem);
|
2012-11-06 00:33:06 +08:00
|
|
|
}
|
|
|
|
|
2018-02-12 23:33:31 +08:00
|
|
|
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
|
2012-11-06 00:33:06 +08:00
|
|
|
{
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
struct btrfs_device *tgt_device = NULL;
|
2018-02-13 11:53:43 +08:00
|
|
|
struct btrfs_device *src_device = NULL;
|
2012-11-06 00:33:06 +08:00
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
struct btrfs_root *root = fs_info->tree_root;
|
2018-02-12 23:33:31 +08:00
|
|
|
int result;
|
2012-11-06 00:33:06 +08:00
|
|
|
int ret;
|
|
|
|
|
2017-07-17 15:45:34 +08:00
|
|
|
if (sb_rdonly(fs_info->sb))
|
2013-10-11 01:40:21 +08:00
|
|
|
return -EROFS;
|
|
|
|
|
2012-11-06 00:33:06 +08:00
|
|
|
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
|
2018-09-07 22:11:23 +08:00
|
|
|
down_write(&dev_replace->rwsem);
|
2012-11-06 00:33:06 +08:00
|
|
|
switch (dev_replace->replace_state) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
|
|
|
|
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
|
2018-09-07 22:11:23 +08:00
|
|
|
up_write(&dev_replace->rwsem);
|
2018-11-14 13:50:26 +08:00
|
|
|
break;
|
2012-11-06 00:33:06 +08:00
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
|
2018-11-14 13:50:26 +08:00
|
|
|
tgt_device = dev_replace->tgtdev;
|
|
|
|
src_device = dev_replace->srcdev;
|
2018-09-07 22:11:23 +08:00
|
|
|
up_write(&dev_replace->rwsem);
|
2018-11-11 22:22:20 +08:00
|
|
|
ret = btrfs_scrub_cancel(fs_info);
|
|
|
|
if (ret < 0) {
|
|
|
|
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
|
|
|
|
} else {
|
|
|
|
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
|
|
|
|
/*
|
|
|
|
* btrfs_dev_replace_finishing() will handle the
|
|
|
|
* cleanup part
|
|
|
|
*/
|
|
|
|
btrfs_info_in_rcu(fs_info,
|
|
|
|
"dev_replace from %s (devid %llu) to %s canceled",
|
|
|
|
btrfs_dev_name(src_device), src_device->devid,
|
|
|
|
btrfs_dev_name(tgt_device));
|
|
|
|
}
|
2018-11-14 13:50:26 +08:00
|
|
|
break;
|
2012-11-06 00:33:06 +08:00
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
|
2018-11-14 13:50:26 +08:00
|
|
|
/*
|
|
|
|
* Scrub doing the replace isn't running so we need to do the
|
|
|
|
* cleanup step of btrfs_dev_replace_finishing() here
|
|
|
|
*/
|
2012-11-06 00:33:06 +08:00
|
|
|
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
|
|
|
|
tgt_device = dev_replace->tgtdev;
|
2018-02-13 11:53:43 +08:00
|
|
|
src_device = dev_replace->srcdev;
|
2012-11-06 00:33:06 +08:00
|
|
|
dev_replace->tgtdev = NULL;
|
|
|
|
dev_replace->srcdev = NULL;
|
2018-11-14 13:50:26 +08:00
|
|
|
dev_replace->replace_state =
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
|
|
|
|
dev_replace->time_stopped = ktime_get_real_seconds();
|
|
|
|
dev_replace->item_needs_writeback = 1;
|
2012-11-06 00:33:06 +08:00
|
|
|
|
2018-09-07 22:11:23 +08:00
|
|
|
up_write(&dev_replace->rwsem);
|
2018-02-13 11:53:43 +08:00
|
|
|
|
2018-11-11 22:22:21 +08:00
|
|
|
/* Scrub for replace must not be running in suspended state */
|
|
|
|
ret = btrfs_scrub_cancel(fs_info);
|
|
|
|
ASSERT(ret != -ENOTCONN);
|
2018-11-14 13:50:26 +08:00
|
|
|
|
|
|
|
trans = btrfs_start_transaction(root, 0);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
return PTR_ERR(trans);
|
|
|
|
}
|
|
|
|
ret = btrfs_commit_transaction(trans);
|
|
|
|
WARN_ON(ret);
|
2018-02-13 11:53:43 +08:00
|
|
|
|
2018-11-14 13:50:26 +08:00
|
|
|
btrfs_info_in_rcu(fs_info,
|
|
|
|
"suspended dev_replace from %s (devid %llu) to %s canceled",
|
|
|
|
btrfs_dev_name(src_device), src_device->devid,
|
|
|
|
btrfs_dev_name(tgt_device));
|
|
|
|
|
|
|
|
if (tgt_device)
|
|
|
|
btrfs_destroy_dev_replace_tgtdev(tgt_device);
|
|
|
|
break;
|
|
|
|
default:
|
2019-02-12 02:32:10 +08:00
|
|
|
up_write(&dev_replace->rwsem);
|
2018-11-14 13:50:26 +08:00
|
|
|
result = -EINVAL;
|
|
|
|
}
|
2012-11-06 00:33:06 +08:00
|
|
|
|
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
|
|
|
|
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
|
2018-09-07 22:11:23 +08:00
|
|
|
down_write(&dev_replace->rwsem);
|
|
|
|
|
2012-11-06 00:33:06 +08:00
|
|
|
switch (dev_replace->replace_state) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
|
|
|
|
break;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
|
|
|
|
dev_replace->replace_state =
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
|
2018-06-12 19:48:25 +08:00
|
|
|
dev_replace->time_stopped = ktime_get_real_seconds();
|
2012-11-06 00:33:06 +08:00
|
|
|
dev_replace->item_needs_writeback = 1;
|
2013-12-21 00:37:06 +08:00
|
|
|
btrfs_info(fs_info, "suspending dev_replace for unmount");
|
2012-11-06 00:33:06 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
2018-09-07 22:11:23 +08:00
|
|
|
up_write(&dev_replace->rwsem);
|
2012-11-06 00:33:06 +08:00
|
|
|
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* resume dev_replace procedure that was interrupted by unmount */
|
|
|
|
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
struct task_struct *task;
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
|
2018-09-07 22:11:23 +08:00
|
|
|
down_write(&dev_replace->rwsem);
|
|
|
|
|
2012-11-06 00:33:06 +08:00
|
|
|
switch (dev_replace->replace_state) {
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
|
2018-09-07 22:11:23 +08:00
|
|
|
up_write(&dev_replace->rwsem);
|
2012-11-06 00:33:06 +08:00
|
|
|
return 0;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
|
|
|
|
break;
|
|
|
|
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
|
|
|
|
dev_replace->replace_state =
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
|
2013-12-21 00:37:06 +08:00
|
|
|
btrfs_info(fs_info,
|
2016-09-20 22:05:00 +08:00
|
|
|
"cannot continue dev_replace, tgtdev is missing");
|
|
|
|
btrfs_info(fs_info,
|
|
|
|
"you may cancel the operation after 'mount -o degraded'");
|
2018-11-11 22:22:17 +08:00
|
|
|
dev_replace->replace_state =
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
|
2018-09-07 22:11:23 +08:00
|
|
|
up_write(&dev_replace->rwsem);
|
2012-11-06 00:33:06 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2018-09-07 22:11:23 +08:00
|
|
|
up_write(&dev_replace->rwsem);
|
2012-11-06 00:33:06 +08:00
|
|
|
|
2018-03-21 02:51:04 +08:00
|
|
|
/*
|
|
|
|
* This could collide with a paused balance, but the exclusive op logic
|
|
|
|
* should never allow both to start and pause. We don't want to allow
|
|
|
|
* dev-replace to start anyway.
|
|
|
|
*/
|
2020-08-25 23:02:32 +08:00
|
|
|
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
|
2018-09-07 22:11:23 +08:00
|
|
|
down_write(&dev_replace->rwsem);
|
2018-11-11 22:22:18 +08:00
|
|
|
dev_replace->replace_state =
|
|
|
|
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
|
2018-09-07 22:11:23 +08:00
|
|
|
up_write(&dev_replace->rwsem);
|
2018-03-21 02:51:04 +08:00
|
|
|
btrfs_info(fs_info,
|
|
|
|
"cannot resume dev-replace, other exclusive operation running");
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2012-11-06 00:33:06 +08:00
|
|
|
task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
|
2013-07-15 09:50:32 +08:00
|
|
|
return PTR_ERR_OR_ZERO(task);
|
2012-11-06 00:33:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int btrfs_dev_replace_kthread(void *data)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = data;
|
|
|
|
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
|
|
|
|
u64 progress;
|
2018-03-20 22:35:50 +08:00
|
|
|
int ret;
|
2012-11-06 00:33:06 +08:00
|
|
|
|
2017-06-14 22:28:42 +08:00
|
|
|
progress = btrfs_dev_replace_progress(fs_info);
|
|
|
|
progress = div_u64(progress, 10);
|
|
|
|
btrfs_info_in_rcu(fs_info,
|
2017-11-28 10:43:10 +08:00
|
|
|
"continuing dev_replace from %s (devid %llu) to target %s @%u%%",
|
|
|
|
btrfs_dev_name(dev_replace->srcdev),
|
2017-06-14 22:28:42 +08:00
|
|
|
dev_replace->srcdev->devid,
|
2017-11-28 10:43:10 +08:00
|
|
|
btrfs_dev_name(dev_replace->tgtdev),
|
2017-06-14 22:28:42 +08:00
|
|
|
(unsigned int)progress);
|
|
|
|
|
2012-11-06 00:33:06 +08:00
|
|
|
ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
|
|
|
|
dev_replace->committed_cursor_left,
|
2014-09-03 21:35:38 +08:00
|
|
|
btrfs_device_get_total_bytes(dev_replace->srcdev),
|
2012-11-06 00:33:06 +08:00
|
|
|
&dev_replace->scrub_progress, 0, 1);
|
|
|
|
ret = btrfs_dev_replace_finishing(fs_info, ret);
|
2018-11-20 19:56:15 +08:00
|
|
|
WARN_ON(ret && ret != -ECANCELED);
|
2018-03-20 22:35:50 +08:00
|
|
|
|
2020-08-25 23:02:32 +08:00
|
|
|
btrfs_exclop_finish(fs_info);
|
2012-11-06 00:33:06 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-10-02 01:57:39 +08:00
|
|
|
/*
 * Tell whether a device replace is currently ongoing.
 *
 * Returns 0 when @dev_replace is not valid or its state is NEVER_STARTED,
 * FINISHED or CANCELED; returns 1 otherwise.
 */
int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
{
	if (!dev_replace->is_valid)
		return 0;

	switch (dev_replace->replace_state) {
	case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
	case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
		return 0;
	default:
		/*
		 * STARTED and SUSPENDED end up here.  True is returned even
		 * if tgtdev is missing (this is something that can happen if
		 * the dev_replace procedure is suspended by an umount and
		 * then the tgtdev is missing (or "btrfs dev scan") was not
		 * called and the filesystem is remounted in degraded state.
		 * This does not stop the dev_replace procedure. It needs to
		 * be canceled manually if the cancellation is wanted.
		 */
		break;
	}

	return 1;
}
|
|
|
|
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer dereference (it was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which includes the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need to make sure all in-flight bios are finished and
no new bios are produced while we are removing the source device. To fix
this problem, we introduced a global @bio_counter; we not only inc/dec
@bio_counter outside of map_blocks, but also inc it before submitting a bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace doesn't support raid56
yet, it is not addressed in the patch and I added comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
2018-04-05 07:04:49 +08:00
|
|
|
percpu_counter_inc(&fs_info->dev_replace.bio_counter);
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
}
|
|
|
|
|
2014-11-25 16:39:28 +08:00
|
|
|
/*
 * Subtract @amount from the dev_replace bio counter, then wake up waiters
 * on replace_wait, if any.
 */
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

	percpu_counter_sub(&dev_replace->bio_counter, amount);
	cond_wake_up_nomb(&dev_replace->replace_wait);
}
|
|
|
|
|
|
|
|
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
2015-01-20 15:11:37 +08:00
|
|
|
while (1) {
|
2018-04-05 07:04:49 +08:00
|
|
|
percpu_counter_inc(&fs_info->dev_replace.bio_counter);
|
2015-01-20 15:11:37 +08:00
|
|
|
if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
|
|
|
|
&fs_info->fs_state)))
|
|
|
|
break;
|
|
|
|
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
btrfs_bio_counter_dec(fs_info);
|
2018-04-05 07:04:49 +08:00
|
|
|
wait_event(fs_info->dev_replace.replace_wait,
|
Btrfs: fix use-after-free in the finishing procedure of the device replace
During device replace test, we hit a null pointer deference (It was very easy
to reproduce it by running xfstests' btrfs/011 on the devices with the virtio
scsi driver). There were two bugs that caused this problem:
- We might allocate new chunks on the replaced device after we updated
the mapping tree. And we forgot to replace the source device in those
mapping of the new chunks.
- We might get the mapping information which including the source device
before the mapping information update. And then submit the bio which was
based on that mapping information after we freed the source device.
For the first bug, we can fix it by doing mapping tree update and source
device remove in the same context of the chunk mutex. The chunk mutex is
used to protect the allocable device list, the above method can avoid
the new chunk allocation, and after we remove the source device, all
the new chunks will be allocated on the new device. So it can fix
the first bug.
For the second bug, we need make sure all flighting bios are finished and
no new bios are produced during we are removing the source device. To fix
this problem, we introduced a global @bio_counter, we not only inc/dec
@bio_counter outsize of map_blocks, but also inc it before submitting bio
and dec @bio_counter when ending bios.
Since Raid56 is a little different and device replace dosen't support raid56
yet, it is not addressed in the patch and I add comments to make sure we will
fix it in the future.
Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com>
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
|
|
|
!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
|
|
|
|
&fs_info->fs_state));
|
|
|
|
}
|
|
|
|
}
|