OpenCloudOS-Kernel/fs/btrfs/dev-replace.c

1079 lines
32 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) STRATO AG 2012. All rights reserved.
*/
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/kthread.h>
#include <linux/math64.h>
#include "misc.h"
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
/*
* Device replace overview
*
* [Objective]
* To copy all extents (both new and on-disk) from source device to target
* device, while still keeping the filesystem read-write.
*
* [Method]
* There are two main methods involved:
*
* - Write duplication
*
* All new writes will be written to both target and source devices, so even
* if replace gets canceled, sources device still contans up-to-date data.
*
* Location: handle_ops_on_dev_replace() from __btrfs_map_block()
* Start: btrfs_dev_replace_start()
* End: btrfs_dev_replace_finishing()
* Content: Latest data/metadata
*
* - Copy existing extents
*
* This happens by re-using scrub facility, as scrub also iterates through
* existing extents from commit root.
*
* Location: scrub_write_block_to_dev_replace() from
* scrub_block_complete()
* Content: Data/meta from commit root.
*
* Due to the content difference, we need to avoid nocow write when dev-replace
* is happening. This is done by marking the block group read-only and waiting
* for NOCOW writes.
*
* After replace is done, the finishing part is done by swapping the target and
* source devices.
*
* Location: btrfs_dev_replace_update_device_in_mapping_tree() from
* btrfs_dev_replace_finishing()
*/
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret);
static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev);
static int btrfs_dev_replace_kthread(void *data);
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
{
struct btrfs_key key;
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct extent_buffer *eb;
int slot;
int ret = 0;
struct btrfs_path *path = NULL;
int item_size;
struct btrfs_dev_replace_item *ptr;
u64 src_devid;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
key.objectid = 0;
key.type = BTRFS_DEV_REPLACE_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
if (ret) {
no_valid_dev_replace_entry_found:
ret = 0;
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
dev_replace->cont_reading_from_srcdev_mode =
BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
dev_replace->time_started = 0;
dev_replace->time_stopped = 0;
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
dev_replace->cursor_left = 0;
dev_replace->committed_cursor_left = 0;
dev_replace->cursor_left_last_write_of_item = 0;
dev_replace->cursor_right = 0;
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
dev_replace->is_valid = 0;
dev_replace->item_needs_writeback = 0;
goto out;
}
slot = path->slots[0];
eb = path->nodes[0];
item_size = btrfs_item_size_nr(eb, slot);
ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
if (item_size != sizeof(struct btrfs_dev_replace_item)) {
btrfs_warn(fs_info,
"dev_replace entry found has unexpected size, ignore entry");
goto no_valid_dev_replace_entry_found;
}
src_devid = btrfs_dev_replace_src_devid(eb, ptr);
dev_replace->cont_reading_from_srcdev_mode =
btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
dev_replace->time_stopped =
btrfs_dev_replace_time_stopped(eb, ptr);
atomic64_set(&dev_replace->num_write_errors,
btrfs_dev_replace_num_write_errors(eb, ptr));
atomic64_set(&dev_replace->num_uncorrectable_read_errors,
btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
dev_replace->committed_cursor_left = dev_replace->cursor_left;
dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
dev_replace->is_valid = 1;
dev_replace->item_needs_writeback = 0;
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
src_devid, NULL, NULL, true);
dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID,
NULL, NULL, true);
/*
* allow 'btrfs dev replace_cancel' if src/tgt device is
* missing
*/
if (!dev_replace->srcdev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
btrfs_warn(fs_info,
"srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
src_devid);
}
if (!dev_replace->tgtdev &&
!btrfs_test_opt(fs_info, DEGRADED)) {
ret = -EIO;
btrfs_warn(fs_info,
"cannot mount because device replace operation is ongoing and");
btrfs_warn(fs_info,
"tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
BTRFS_DEV_REPLACE_DEVID);
}
if (dev_replace->tgtdev) {
if (dev_replace->srcdev) {
dev_replace->tgtdev->total_bytes =
dev_replace->srcdev->total_bytes;
dev_replace->tgtdev->disk_total_bytes =
dev_replace->srcdev->disk_total_bytes;
dev_replace->tgtdev->commit_total_bytes =
dev_replace->srcdev->commit_total_bytes;
dev_replace->tgtdev->bytes_used =
dev_replace->srcdev->bytes_used;
dev_replace->tgtdev->commit_bytes_used =
dev_replace->srcdev->commit_bytes_used;
}
set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
&dev_replace->tgtdev->dev_state);
WARN_ON(fs_info->fs_devices->rw_devices == 0);
dev_replace->tgtdev->io_width = fs_info->sectorsize;
dev_replace->tgtdev->io_align = fs_info->sectorsize;
dev_replace->tgtdev->sector_size = fs_info->sectorsize;
dev_replace->tgtdev->fs_info = fs_info;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&dev_replace->tgtdev->dev_state);
}
break;
}
out:
btrfs_free_path(path);
return ret;
}
/*
* Initialize a new device for device replace target from a given source dev
* and path.
*
* Return 0 and new device in @device_out, otherwise return < 0
*/
static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
const char *device_path,
struct btrfs_device *srcdev,
struct btrfs_device **device_out)
{
struct btrfs_device *device;
struct block_device *bdev;
struct list_head *devices;
struct rcu_string *name;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
*device_out = NULL;
if (fs_info->fs_devices->seeding) {
btrfs_err(fs_info, "the filesystem is a seed filesystem!");
return -EINVAL;
}
bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
fs_info->bdev_holder);
if (IS_ERR(bdev)) {
btrfs_err(fs_info, "target device %s is invalid!", device_path);
return PTR_ERR(bdev);
}
sync_blockdev(bdev);
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
if (device->bdev == bdev) {
btrfs_err(fs_info,
"target device is in the filesystem!");
ret = -EEXIST;
goto error;
}
}
if (i_size_read(bdev->bd_inode) <
btrfs_device_get_total_bytes(srcdev)) {
btrfs_err(fs_info,
"target device is smaller than source device!");
ret = -EINVAL;
goto error;
}
device = btrfs_alloc_device(NULL, &devid, NULL);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
goto error;
}
name = rcu_string_strdup(device_path, GFP_KERNEL);
if (!name) {
btrfs_free_device(device);
ret = -ENOMEM;
goto error;
}
rcu_assign_pointer(device->name, name);
set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
device->generation = 0;
device->io_width = fs_info->sectorsize;
device->io_align = fs_info->sectorsize;
device->sector_size = fs_info->sectorsize;
device->total_bytes = btrfs_device_get_total_bytes(srcdev);
device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
device->bytes_used = btrfs_device_get_bytes_used(srcdev);
device->commit_total_bytes = srcdev->commit_total_bytes;
device->commit_bytes_used = device->bytes_used;
device->fs_info = fs_info;
device->bdev = bdev;
set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
device->mode = FMODE_EXCL;
device->dev_stats_valid = 1;
set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
device->fs_devices = fs_info->fs_devices;
mutex_lock(&fs_info->fs_devices->device_list_mutex);
list_add(&device->dev_list, &fs_info->fs_devices->devices);
fs_info->fs_devices->num_devices++;
fs_info->fs_devices->open_devices++;
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
*device_out = device;
return 0;
error:
blkdev_put(bdev, FMODE_EXCL);
return ret;
}
/*
* called from commit_transaction. Writes changed device replace state to
* disk.
*/
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
int ret;
struct btrfs_root *dev_root = fs_info->dev_root;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *eb;
struct btrfs_dev_replace_item *ptr;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
down_read(&dev_replace->rwsem);
if (!dev_replace->is_valid ||
!dev_replace->item_needs_writeback) {
up_read(&dev_replace->rwsem);
return 0;
}
up_read(&dev_replace->rwsem);
key.objectid = 0;
key.type = BTRFS_DEV_REPLACE_KEY;
key.offset = 0;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
if (ret < 0) {
btrfs_warn(fs_info,
"error %d while searching for dev_replace item!",
ret);
goto out;
}
if (ret == 0 &&
btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
/*
* need to delete old one and insert a new one.
* Since no attempt is made to recover any old state, if the
* dev_replace state is 'running', the data on the target
* drive is lost.
* It would be possible to recover the state: just make sure
* that the beginning of the item is never changed and always
* contains all the essential information. Then read this
* minimal set of information and use it as a base for the
* new state.
*/
ret = btrfs_del_item(trans, dev_root, path);
if (ret != 0) {
btrfs_warn(fs_info,
"delete too small dev_replace item failed %d!",
ret);
goto out;
}
ret = 1;
}
if (ret == 1) {
/* need to insert a new item */
btrfs_release_path(path);
ret = btrfs_insert_empty_item(trans, dev_root, path,
&key, sizeof(*ptr));
if (ret < 0) {
btrfs_warn(fs_info,
"insert dev_replace item failed %d!", ret);
goto out;
}
}
eb = path->nodes[0];
ptr = btrfs_item_ptr(eb, path->slots[0],
struct btrfs_dev_replace_item);
down_write(&dev_replace->rwsem);
if (dev_replace->srcdev)
btrfs_set_dev_replace_src_devid(eb, ptr,
dev_replace->srcdev->devid);
else
btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
dev_replace->cont_reading_from_srcdev_mode);
btrfs_set_dev_replace_replace_state(eb, ptr,
dev_replace->replace_state);
btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
btrfs_set_dev_replace_num_write_errors(eb, ptr,
atomic64_read(&dev_replace->num_write_errors));
btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
atomic64_read(&dev_replace->num_uncorrectable_read_errors));
dev_replace->cursor_left_last_write_of_item =
dev_replace->cursor_left;
btrfs_set_dev_replace_cursor_left(eb, ptr,
dev_replace->cursor_left_last_write_of_item);
btrfs_set_dev_replace_cursor_right(eb, ptr,
dev_replace->cursor_right);
dev_replace->item_needs_writeback = 0;
up_write(&dev_replace->rwsem);
btrfs_mark_buffer_dirty(eb);
out:
btrfs_free_path(path);
return ret;
}
static char* btrfs_dev_name(struct btrfs_device *device)
{
if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
return "<missing disk>";
else
return rcu_str_deref(device->name);
}
static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
int read_src)
{
struct btrfs_root *root = fs_info->dev_root;
struct btrfs_trans_handle *trans;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int ret;
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
srcdev_name);
if (IS_ERR(src_device))
return PTR_ERR(src_device);
if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
btrfs_warn_in_rcu(fs_info,
"cannot replace device %s (devid %llu) due to active swapfile",
btrfs_dev_name(src_device), src_device->devid);
return -ETXTBSY;
}
/*
* Here we commit the transaction to make sure commit_total_bytes
* of all the devices are updated.
*/
trans = btrfs_attach_transaction(root);
if (!IS_ERR(trans)) {
ret = btrfs_commit_transaction(trans);
if (ret)
return ret;
} else if (PTR_ERR(trans) != -ENOENT) {
return PTR_ERR(trans);
}
ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
src_device, &tgt_device);
if (ret)
return ret;
down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
ASSERT(0);
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
up_write(&dev_replace->rwsem);
goto leave;
}
dev_replace->cont_reading_from_srcdev_mode = read_src;
dev_replace->srcdev = src_device;
dev_replace->tgtdev = tgt_device;
btrfs_info_in_rcu(fs_info,
"dev_replace from %s (devid %llu) to %s started",
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name));
/*
* from now on, the writes to the srcdev are all duplicated to
* go to the tgtdev as well (refer to btrfs_map_block()).
*/
dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
dev_replace->time_started = ktime_get_real_seconds();
dev_replace->cursor_left = 0;
dev_replace->committed_cursor_left = 0;
dev_replace->cursor_left_last_write_of_item = 0;
dev_replace->cursor_right = 0;
dev_replace->is_valid = 1;
dev_replace->item_needs_writeback = 1;
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
up_write(&dev_replace->rwsem);
ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
if (ret)
btrfs_err(fs_info, "kobj add dev failed %d", ret);
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
/* Commit dev_replace state and reserve 1 item for it. */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
dev_replace->srcdev = NULL;
dev_replace->tgtdev = NULL;
up_write(&dev_replace->rwsem);
goto leave;
}
ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
/* the disk copy procedure reuses the scrub code */
ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
btrfs_device_get_total_bytes(src_device),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
btrfs: dev-replace: remove warning for unknown return codes when finished The fstests btrfs/011 triggered a warning at the end of device replace, [ 1891.998975] BTRFS warning (device vdd): failed setting block group ro: -28 [ 1892.038338] BTRFS error (device vdd): btrfs_scrub_dev(/dev/vdd, 1, /dev/vdb) failed -28 [ 1892.059993] ------------[ cut here ]------------ [ 1892.063032] WARNING: CPU: 2 PID: 2244 at fs/btrfs/dev-replace.c:506 btrfs_dev_replace_start.cold+0xf9/0x140 [btrfs] [ 1892.074346] CPU: 2 PID: 2244 Comm: btrfs Not tainted 5.5.0-rc7-default+ #942 [ 1892.079956] RIP: 0010:btrfs_dev_replace_start.cold+0xf9/0x140 [btrfs] [ 1892.096576] RSP: 0018:ffffbb58c7b3fd10 EFLAGS: 00010286 [ 1892.098311] RAX: 00000000ffffffe4 RBX: 0000000000000001 RCX: 8888888888888889 [ 1892.100342] RDX: 0000000000000001 RSI: ffff9e889645f5d8 RDI: ffffffff92821080 [ 1892.102291] RBP: ffff9e889645c000 R08: 000001b8878fe1f6 R09: 0000000000000000 [ 1892.104239] R10: ffffbb58c7b3fd08 R11: 0000000000000000 R12: ffff9e88a0017000 [ 1892.106434] R13: ffff9e889645f608 R14: ffff9e88794e1000 R15: ffff9e88a07b5200 [ 1892.108642] FS: 00007fcaed3f18c0(0000) GS:ffff9e88bda00000(0000) knlGS:0000000000000000 [ 1892.111558] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1892.113492] CR2: 00007f52509ff420 CR3: 00000000603dd002 CR4: 0000000000160ee0 [ 1892.115814] Call Trace: [ 1892.116896] btrfs_dev_replace_by_ioctl+0x35/0x60 [btrfs] [ 1892.118962] btrfs_ioctl+0x1d62/0x2550 [btrfs] caused by the previous patch ("btrfs: scrub: Require mandatory block group RO for dev-replace"). Hitting ENOSPC is possible and could happen when the block group is set read-only, preventing NOCOW writes to the area that's being accessed by dev-replace. This has happend with scratch devices of size 12G but not with 5G and 20G, so this is depends on timing and other activity on the filesystem. The whole replace operation is restartable, the space state should be examined by the user in any case. The error code is propagated back to the ioctl caller so the kernel warning is causing false alerts. Signed-off-by: David Sterba <dsterba@suse.com>
2020-01-25 19:35:38 +08:00
if (ret == -EINPROGRESS)
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
return ret;
leave:
btrfs_destroy_dev_replace_tgtdev(tgt_device);
return ret;
}
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
int ret;
switch (args->start.cont_reading_from_srcdev_mode) {
case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
break;
default:
return -EINVAL;
}
if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
args->start.tgtdev_name[0] == '\0')
return -EINVAL;
ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
args->start.srcdevid,
args->start.srcdev_name,
args->start.cont_reading_from_srcdev_mode);
args->result = ret;
/* don't warn if EINPROGRESS, someone else might be running scrub */
if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
return 0;
return ret;
}
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
/*
* blocked until all in-flight bios operations are finished.
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
*/
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
{
set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
&fs_info->dev_replace.bio_counter));
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
}
/*
* we have removed target device, it is safe to allow new bios request.
*/
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
{
clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
wake_up(&fs_info->dev_replace.replace_wait);
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
}
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *tgt_device;
struct btrfs_device *src_device;
struct btrfs_root *root = fs_info->tree_root;
u8 uuid_tmp[BTRFS_UUID_SIZE];
struct btrfs_trans_handle *trans;
int ret = 0;
/* don't allow cancel or unmount to disturb the finishing procedure */
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
down_read(&dev_replace->rwsem);
/* was the operation canceled, or is it finished? */
if (dev_replace->replace_state !=
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
up_read(&dev_replace->rwsem);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return 0;
}
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
up_read(&dev_replace->rwsem);
/*
* flush all outstanding I/O and inode extent mappings before the
* copy operation is declared as being finished
*/
ret = btrfs_start_delalloc_roots(fs_info, -1);
if (ret) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return ret;
}
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
btrfs: Ensure replaced device doesn't have pending chunk allocation Recent FITRIM work, namely bbbf7243d62d ("btrfs: combine device update operations during transaction commit") combined the way certain operations are recoded in a transaction. As a result an ASSERT was added in dev_replace_finish to ensure the new code works correctly. Unfortunately I got reports that it's possible to trigger the assert, meaning that during a device replace it's possible to have an unfinished chunk allocation on the source device. This is supposed to be prevented by the fact that a transaction is committed before finishing the replace oepration and alter acquiring the chunk mutex. This is not sufficient since by the time the transaction is committed and the chunk mutex acquired it's possible to allocate a chunk depending on the workload being executed on the replaced device. This bug has been present ever since device replace was introduced but there was never code which checks for it. The correct way to fix is to ensure that there is no pending device modification operation when the chunk mutex is acquire and if there is repeat transaction commit. Unfortunately it's not possible to just exclude the source device from btrfs_fs_devices::dev_alloc_list since this causes ENOSPC to be hit in transaction commit. Fixing that in another way would need to add special cases to handle the last writes and forbid new ones. The looped transaction fix is more obvious, and can be easily backported. The runtime of dev-replace is long so there's no noticeable delay caused by that. Reported-by: David Sterba <dsterba@suse.com> Fixes: 391cd9df81ac ("Btrfs: fix unprotected alloc list insertion during the finishing procedure of replace") CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Nikolay Borisov <nborisov@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-05-17 15:44:25 +08:00
/*
* We have to use this loop approach because at this point src_device
* has to be available for transaction commit to complete, yet new
* chunks shouldn't be allocated on the device.
*/
while (1) {
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return PTR_ERR(trans);
}
ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
/* Prevent write_all_supers() during the finishing procedure */
mutex_lock(&fs_info->fs_devices->device_list_mutex);
/* Prevent new chunks being allocated on the source device */
mutex_lock(&fs_info->chunk_mutex);
if (!list_empty(&src_device->post_commit_list)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&fs_info->chunk_mutex);
} else {
break;
}
}
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
: BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
dev_replace->tgtdev = NULL;
dev_replace->srcdev = NULL;
dev_replace->time_stopped = ktime_get_real_seconds();
dev_replace->item_needs_writeback = 1;
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
/* replace old device with new one in mapping tree */
if (!scrub_ret) {
btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
src_device,
tgt_device);
} else {
if (scrub_ret != -ECANCELED)
btrfs_err_in_rcu(fs_info,
"btrfs_scrub_dev(%s, %llu, %s) failed %d",
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs: Wait for in-flight bios before freeing target device for raid56 When raid56 dev-replace is cancelled by running scrub, we will free target device without waiting for in-flight bios, causing the following NULL pointer deference or general protection failure. BUG: unable to handle kernel NULL pointer dereference at 00000000000005e0 IP: generic_make_request_checks+0x4d/0x610 CPU: 1 PID: 11676 Comm: kworker/u4:14 Tainted: G O 4.11.0-rc2 #72 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper [btrfs] task: ffff88002875b4c0 task.stack: ffffc90001334000 RIP: 0010:generic_make_request_checks+0x4d/0x610 Call Trace: ? generic_make_request+0xc7/0x360 generic_make_request+0x24/0x360 ? generic_make_request+0xc7/0x360 submit_bio+0x64/0x120 ? page_in_rbio+0x4d/0x80 [btrfs] ? rbio_orig_end_io+0x80/0x80 [btrfs] finish_rmw+0x3f4/0x540 [btrfs] validate_rbio_for_rmw+0x36/0x40 [btrfs] raid_rmw_end_io+0x7a/0x90 [btrfs] bio_endio+0x56/0x60 end_workqueue_fn+0x3c/0x40 [btrfs] btrfs_scrubparity_helper+0xef/0x620 [btrfs] btrfs_endio_raid56_helper+0xe/0x10 [btrfs] process_one_work+0x2af/0x720 ? process_one_work+0x22b/0x720 worker_thread+0x4b/0x4f0 kthread+0x10f/0x150 ? process_one_work+0x720/0x720 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x2e/0x40 RIP: generic_make_request_checks+0x4d/0x610 RSP: ffffc90001337bb8 In btrfs_dev_replace_finishing(), we will call btrfs_rm_dev_replace_blocked() to wait bios before destroying the target device when scrub is finished normally. However when dev-replace is aborted, either due to error or cancelled by scrub, we didn't wait for bios, this can lead to use-after-free if there are bios holding the target device. Furthermore, for raid56 scrub, at least 2 places are calling btrfs_map_sblock() without protection of bio_counter, leading to the problem. This patch fixes the problem: 1) Wait for bio_counter before freeing target device when canceling replace 2) When calling btrfs_map_sblock() for raid56, use bio_counter to protect the call. Cc: Liu Bo <bo.li.liu@oracle.com> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Reviewed-by: Liu Bo <bo.li.liu@oracle.com> Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-29 09:33:21 +08:00
btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
btrfs: Wait for in-flight bios before freeing target device for raid56 When raid56 dev-replace is cancelled by running scrub, we will free target device without waiting for in-flight bios, causing the following NULL pointer deference or general protection failure. BUG: unable to handle kernel NULL pointer dereference at 00000000000005e0 IP: generic_make_request_checks+0x4d/0x610 CPU: 1 PID: 11676 Comm: kworker/u4:14 Tainted: G O 4.11.0-rc2 #72 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 Workqueue: btrfs-endio-raid56 btrfs_endio_raid56_helper [btrfs] task: ffff88002875b4c0 task.stack: ffffc90001334000 RIP: 0010:generic_make_request_checks+0x4d/0x610 Call Trace: ? generic_make_request+0xc7/0x360 generic_make_request+0x24/0x360 ? generic_make_request+0xc7/0x360 submit_bio+0x64/0x120 ? page_in_rbio+0x4d/0x80 [btrfs] ? rbio_orig_end_io+0x80/0x80 [btrfs] finish_rmw+0x3f4/0x540 [btrfs] validate_rbio_for_rmw+0x36/0x40 [btrfs] raid_rmw_end_io+0x7a/0x90 [btrfs] bio_endio+0x56/0x60 end_workqueue_fn+0x3c/0x40 [btrfs] btrfs_scrubparity_helper+0xef/0x620 [btrfs] btrfs_endio_raid56_helper+0xe/0x10 [btrfs] process_one_work+0x2af/0x720 ? process_one_work+0x22b/0x720 worker_thread+0x4b/0x4f0 kthread+0x10f/0x150 ? process_one_work+0x720/0x720 ? kthread_create_on_node+0x40/0x40 ret_from_fork+0x2e/0x40 RIP: generic_make_request_checks+0x4d/0x610 RSP: ffffc90001337bb8 In btrfs_dev_replace_finishing(), we will call btrfs_rm_dev_replace_blocked() to wait bios before destroying the target device when scrub is finished normally. However when dev-replace is aborted, either due to error or cancelled by scrub, we didn't wait for bios, this can lead to use-after-free if there are bios holding the target device. Furthermore, for raid56 scrub, at least 2 places are calling btrfs_map_sblock() without protection of bio_counter, leading to the problem. This patch fixes the problem: 1) Wait for bio_counter before freeing target device when canceling replace 2) When calling btrfs_map_sblock() for raid56, use bio_counter to protect the call. Cc: Liu Bo <bo.li.liu@oracle.com> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Reviewed-by: Liu Bo <bo.li.liu@oracle.com> Signed-off-by: David Sterba <dsterba@suse.com>
2017-03-29 09:33:21 +08:00
btrfs_rm_dev_replace_unblocked(fs_info);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return scrub_ret;
}
btrfs_info_in_rcu(fs_info,
"dev_replace from %s (devid %llu) to %s finished",
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name));
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
tgt_device->devid = src_device->devid;
src_device->devid = BTRFS_DEV_REPLACE_DEVID;
memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
btrfs_device_set_disk_total_bytes(tgt_device,
src_device->disk_total_bytes);
btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
tgt_device->commit_bytes_used = src_device->bytes_used;
btrfs_assign_next_active_device(src_device, tgt_device);
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
fs_info->fs_devices->rw_devices++;
up_write(&dev_replace->rwsem);
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
btrfs_rm_dev_replace_blocked(fs_info);
btrfs_rm_dev_replace_remove_srcdev(src_device);
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
btrfs_rm_dev_replace_unblocked(fs_info);
/*
* Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
* update on-disk dev stats value during commit transaction
*/
atomic_inc(&tgt_device->dev_stats_ccnt);
/*
* this is again a consistent state where no dev_replace procedure
* is running, the target device is part of the filesystem, the
* source device is not part of the filesystem anymore and its 1st
* superblock is scratched out so that it is no longer marked to
* belong to this filesystem.
*/
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs: Fix a lockdep warning when running xfstest. The following lockdep warning is triggered during xfstests: [ 1702.980872] ========================================================= [ 1702.981181] [ INFO: possible irq lock inversion dependency detected ] [ 1702.981482] 3.18.0-rc1 #27 Not tainted [ 1702.981781] --------------------------------------------------------- [ 1702.982095] kswapd0/77 just changed the state of lock: [ 1702.982415] (&delayed_node->mutex){+.+.-.}, at: [<ffffffffa03b0b51>] __btrfs_release_delayed_node+0x41/0x1f0 [btrfs] [ 1702.982794] but this lock took another, RECLAIM_FS-unsafe lock in the past: [ 1702.983160] (&fs_info->dev_replace.lock){+.+.+.} and interrupts could create inverse lock ordering between them. [ 1702.984675] other info that might help us debug this: [ 1702.985524] Chain exists of: &delayed_node->mutex --> &found->groups_sem --> &fs_info->dev_replace.lock [ 1702.986799] Possible interrupt unsafe locking scenario: [ 1702.987681] CPU0 CPU1 [ 1702.988137] ---- ---- [ 1702.988598] lock(&fs_info->dev_replace.lock); [ 1702.989069] local_irq_disable(); [ 1702.989534] lock(&delayed_node->mutex); [ 1702.990038] lock(&found->groups_sem); [ 1702.990494] <Interrupt> [ 1702.990938] lock(&delayed_node->mutex); [ 1702.991407] *** DEADLOCK *** It is because the btrfs_kobj_{add/rm}_device() will call memory allocation with GFP_KERNEL, which may flush fs page cache to free space, waiting for it self to do the commit, causing the deadlock. To solve the problem, move btrfs_kobj_{add/rm}_device() out of the dev_replace lock range, also involing split the btrfs_rm_dev_replace_srcdev() function into remove and free parts. Now only btrfs_rm_dev_replace_remove_srcdev() is called in dev_replace lock range, and kobj_{add/rm} and btrfs_rm_dev_replace_free_srcdev() are called out of the lock range. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-10-30 16:52:31 +08:00
/* replace the sysfs entry */
btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
btrfs: sysfs, add devid/dev_state kobject and device attributes New sysfs attributes that track the filesystem status of devices, stored in the per-filesystem directory in /sys/fs/btrfs/FSID/devinfo . There's a directory for each device, with name corresponding to the numerical device id. in_fs_metadata - device is in the list of fs metadata missing - device is missing (no device node or block device) replace_target - device is target of replace writeable - writes from fs are allowed These attributes reflect the state of the device::dev_state and created at mount time. Sample output: $ pwd /sys/fs/btrfs/6e1961f1-5918-4ecc-a22f-948897b409f7/devinfo/1/ $ ls in_fs_metadata missing replace_target writeable $ cat missing 0 The output from these attributes are 0 or 1. 0 indicates unset and 1 indicates set. These attributes are readonly. It is observed that the device delete thread and sysfs read thread will not race because the delete thread calls sysfs kobject_put() which in turn waits for existing sysfs read to complete. Note for device replace devid swap: During the replace the target device temporarily assumes devid 0 before assigning the devid of the soruce device. In btrfs_dev_replace_finishing() we remove source sysfs devid using the function btrfs_sysfs_remove_devices_attr(), so after that call kobject_rename() to update the devid in the sysfs. This adds and calls btrfs_sysfs_update_devid() helper function to update the device id. Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> [ update changelog ] Signed-off-by: David Sterba <dsterba@suse.com>
2020-01-06 19:38:31 +08:00
btrfs_sysfs_update_devid(tgt_device);
btrfs_rm_dev_replace_free_srcdev(src_device);
btrfs: Fix a lockdep warning when running xfstest. The following lockdep warning is triggered during xfstests: [ 1702.980872] ========================================================= [ 1702.981181] [ INFO: possible irq lock inversion dependency detected ] [ 1702.981482] 3.18.0-rc1 #27 Not tainted [ 1702.981781] --------------------------------------------------------- [ 1702.982095] kswapd0/77 just changed the state of lock: [ 1702.982415] (&delayed_node->mutex){+.+.-.}, at: [<ffffffffa03b0b51>] __btrfs_release_delayed_node+0x41/0x1f0 [btrfs] [ 1702.982794] but this lock took another, RECLAIM_FS-unsafe lock in the past: [ 1702.983160] (&fs_info->dev_replace.lock){+.+.+.} and interrupts could create inverse lock ordering between them. [ 1702.984675] other info that might help us debug this: [ 1702.985524] Chain exists of: &delayed_node->mutex --> &found->groups_sem --> &fs_info->dev_replace.lock [ 1702.986799] Possible interrupt unsafe locking scenario: [ 1702.987681] CPU0 CPU1 [ 1702.988137] ---- ---- [ 1702.988598] lock(&fs_info->dev_replace.lock); [ 1702.989069] local_irq_disable(); [ 1702.989534] lock(&delayed_node->mutex); [ 1702.990038] lock(&found->groups_sem); [ 1702.990494] <Interrupt> [ 1702.990938] lock(&delayed_node->mutex); [ 1702.991407] *** DEADLOCK *** It is because the btrfs_kobj_{add/rm}_device() will call memory allocation with GFP_KERNEL, which may flush fs page cache to free space, waiting for it self to do the commit, causing the deadlock. To solve the problem, move btrfs_kobj_{add/rm}_device() out of the dev_replace lock range, also involing split the btrfs_rm_dev_replace_srcdev() function into remove and free parts. Now only btrfs_rm_dev_replace_remove_srcdev() is called in dev_replace lock range, and kobj_{add/rm} and btrfs_rm_dev_replace_free_srcdev() are called out of the lock range. Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-10-30 16:52:31 +08:00
/* write back the superblocks */
trans = btrfs_start_transaction(root, 0);
if (!IS_ERR(trans))
btrfs_commit_transaction(trans);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return 0;
}
static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_fs_info *fs_info,
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev)
{
struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct extent_map *em;
struct map_lookup *map;
u64 start = 0;
int i;
write_lock(&em_tree->lock);
do {
em = lookup_extent_mapping(em_tree, start, (u64)-1);
if (!em)
break;
map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++)
if (srcdev == map->stripes[i].dev)
map->stripes[i].dev = tgtdev;
start = em->start + em->len;
free_extent_map(em);
} while (start);
write_unlock(&em_tree->lock);
}
/*
* Read progress of device replace status according to the state and last
* stored position. The value format is the same as for
* btrfs_dev_replace::progress_1000
*/
static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
u64 ret = 0;
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
ret = 0;
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
ret = 1000;
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
ret = div64_u64(dev_replace->cursor_left,
div_u64(btrfs_device_get_total_bytes(
dev_replace->srcdev), 1000));
break;
}
return ret;
}
void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
down_read(&dev_replace->rwsem);
/* even if !dev_replace_is_valid, the values are good enough for
* the replace_status ioctl */
args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
args->status.replace_state = dev_replace->replace_state;
args->status.time_started = dev_replace->time_started;
args->status.time_stopped = dev_replace->time_stopped;
args->status.num_write_errors =
atomic64_read(&dev_replace->num_write_errors);
args->status.num_uncorrectable_read_errors =
atomic64_read(&dev_replace->num_uncorrectable_read_errors);
args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
up_read(&dev_replace->rwsem);
}
int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
struct btrfs_trans_handle *trans;
struct btrfs_root *root = fs_info->tree_root;
int result;
int ret;
if (sb_rdonly(fs_info->sb))
return -EROFS;
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
up_write(&dev_replace->rwsem);
btrfs: fix use-after-free due to race between replace start and cancel The device replace cancel thread can race with the replace start thread and if fs_info::scrubs_running is not yet set, btrfs_scrub_cancel() will fail to stop the scrub thread. The scrub thread continues with the scrub for replace which then will try to write to the target device and which is already freed by the cancel thread. scrub_setup_ctx() warns as tgtdev is NULL. struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { ... if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); <=== sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; } [ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc started [ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc canceled [ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.432970] Call Trace: [ 6852.433202] btrfs_scrub_dev+0x19b/0x5c0 [btrfs] [ 6852.433471] btrfs_dev_replace_start+0x48c/0x6a0 [btrfs] [ 6852.433800] btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs] [ 6852.434097] btrfs_ioctl+0x2476/0x2d20 [btrfs] [ 6852.434365] ? do_sigaction+0x7d/0x1e0 [ 6852.434623] do_vfs_ioctl+0xa9/0x6c0 [ 6852.434865] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435124] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435387] ksys_ioctl+0x60/0x90 [ 6852.435663] __x64_sys_ioctl+0x16/0x20 [ 6852.435907] do_syscall_64+0x50/0x180 [ 6852.436150] entry_SYSCALL_64_after_hwframe+0x49/0xbe Further, as the replace thread enters scrub_write_page_to_dev_replace() without the target device it panics: static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { ... bio_set_dev(bio, sbio->dev->bdev); <====== [ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a0 .. [ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs] [ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260 [btrfs] .. [ 6929.721430] Call Trace: [ 6929.721663] scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs] [ 6929.721975] scrub_bio_end_io_worker+0x1af/0x490 [btrfs] [ 6929.722277] normal_work_helper+0xf0/0x4c0 [btrfs] [ 6929.722552] process_one_work+0x1f4/0x520 [ 6929.722805] ? process_one_work+0x16e/0x520 [ 6929.723063] worker_thread+0x46/0x3d0 [ 6929.723313] kthread+0xf8/0x130 [ 6929.723544] ? process_one_work+0x520/0x520 [ 6929.723800] ? kthread_delayed_work_timer_fn+0x80/0x80 [ 6929.724081] ret_from_fork+0x3a/0x50 Fix this by letting the btrfs_dev_replace_finishing() to do the job of cleaning after the cancel, including freeing of the target device. btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns along with the scrub return status. Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-11-14 13:50:26 +08:00
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
btrfs: fix use-after-free due to race between replace start and cancel The device replace cancel thread can race with the replace start thread and if fs_info::scrubs_running is not yet set, btrfs_scrub_cancel() will fail to stop the scrub thread. The scrub thread continues with the scrub for replace which then will try to write to the target device and which is already freed by the cancel thread. scrub_setup_ctx() warns as tgtdev is NULL. struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { ... if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); <=== sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; } [ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc started [ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc canceled [ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.432970] Call Trace: [ 6852.433202] btrfs_scrub_dev+0x19b/0x5c0 [btrfs] [ 6852.433471] btrfs_dev_replace_start+0x48c/0x6a0 [btrfs] [ 6852.433800] btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs] [ 6852.434097] btrfs_ioctl+0x2476/0x2d20 [btrfs] [ 6852.434365] ? do_sigaction+0x7d/0x1e0 [ 6852.434623] do_vfs_ioctl+0xa9/0x6c0 [ 6852.434865] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435124] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435387] ksys_ioctl+0x60/0x90 [ 6852.435663] __x64_sys_ioctl+0x16/0x20 [ 6852.435907] do_syscall_64+0x50/0x180 [ 6852.436150] entry_SYSCALL_64_after_hwframe+0x49/0xbe Further, as the replace thread enters scrub_write_page_to_dev_replace() without the target device it panics: static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { ... bio_set_dev(bio, sbio->dev->bdev); <====== [ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a0 .. [ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs] [ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260 [btrfs] .. [ 6929.721430] Call Trace: [ 6929.721663] scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs] [ 6929.721975] scrub_bio_end_io_worker+0x1af/0x490 [btrfs] [ 6929.722277] normal_work_helper+0xf0/0x4c0 [btrfs] [ 6929.722552] process_one_work+0x1f4/0x520 [ 6929.722805] ? process_one_work+0x16e/0x520 [ 6929.723063] worker_thread+0x46/0x3d0 [ 6929.723313] kthread+0xf8/0x130 [ 6929.723544] ? process_one_work+0x520/0x520 [ 6929.723800] ? kthread_delayed_work_timer_fn+0x80/0x80 [ 6929.724081] ret_from_fork+0x3a/0x50 Fix this by letting the btrfs_dev_replace_finishing() to do the job of cleaning after the cancel, including freeing of the target device. btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns along with the scrub return status. Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-11-14 13:50:26 +08:00
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
up_write(&dev_replace->rwsem);
ret = btrfs_scrub_cancel(fs_info);
if (ret < 0) {
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
} else {
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
/*
* btrfs_dev_replace_finishing() will handle the
* cleanup part
*/
btrfs_info_in_rcu(fs_info,
"dev_replace from %s (devid %llu) to %s canceled",
btrfs_dev_name(src_device), src_device->devid,
btrfs_dev_name(tgt_device));
}
btrfs: fix use-after-free due to race between replace start and cancel The device replace cancel thread can race with the replace start thread and if fs_info::scrubs_running is not yet set, btrfs_scrub_cancel() will fail to stop the scrub thread. The scrub thread continues with the scrub for replace which then will try to write to the target device and which is already freed by the cancel thread. scrub_setup_ctx() warns as tgtdev is NULL. struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { ... if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); <=== sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; } [ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc started [ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc canceled [ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.432970] Call Trace: [ 6852.433202] btrfs_scrub_dev+0x19b/0x5c0 [btrfs] [ 6852.433471] btrfs_dev_replace_start+0x48c/0x6a0 [btrfs] [ 6852.433800] btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs] [ 6852.434097] btrfs_ioctl+0x2476/0x2d20 [btrfs] [ 6852.434365] ? do_sigaction+0x7d/0x1e0 [ 6852.434623] do_vfs_ioctl+0xa9/0x6c0 [ 6852.434865] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435124] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435387] ksys_ioctl+0x60/0x90 [ 6852.435663] __x64_sys_ioctl+0x16/0x20 [ 6852.435907] do_syscall_64+0x50/0x180 [ 6852.436150] entry_SYSCALL_64_after_hwframe+0x49/0xbe Further, as the replace thread enters scrub_write_page_to_dev_replace() without the target device it panics: static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { ... bio_set_dev(bio, sbio->dev->bdev); <====== [ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a0 .. [ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs] [ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260 [btrfs] .. [ 6929.721430] Call Trace: [ 6929.721663] scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs] [ 6929.721975] scrub_bio_end_io_worker+0x1af/0x490 [btrfs] [ 6929.722277] normal_work_helper+0xf0/0x4c0 [btrfs] [ 6929.722552] process_one_work+0x1f4/0x520 [ 6929.722805] ? process_one_work+0x16e/0x520 [ 6929.723063] worker_thread+0x46/0x3d0 [ 6929.723313] kthread+0xf8/0x130 [ 6929.723544] ? process_one_work+0x520/0x520 [ 6929.723800] ? kthread_delayed_work_timer_fn+0x80/0x80 [ 6929.724081] ret_from_fork+0x3a/0x50 Fix this by letting the btrfs_dev_replace_finishing() to do the job of cleaning after the cancel, including freeing of the target device. btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns along with the scrub return status. Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-11-14 13:50:26 +08:00
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
btrfs: fix use-after-free due to race between replace start and cancel The device replace cancel thread can race with the replace start thread and if fs_info::scrubs_running is not yet set, btrfs_scrub_cancel() will fail to stop the scrub thread. The scrub thread continues with the scrub for replace which then will try to write to the target device and which is already freed by the cancel thread. scrub_setup_ctx() warns as tgtdev is NULL. struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { ... if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); <=== sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; } [ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc started [ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc canceled [ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.432970] Call Trace: [ 6852.433202] btrfs_scrub_dev+0x19b/0x5c0 [btrfs] [ 6852.433471] btrfs_dev_replace_start+0x48c/0x6a0 [btrfs] [ 6852.433800] btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs] [ 6852.434097] btrfs_ioctl+0x2476/0x2d20 [btrfs] [ 6852.434365] ? do_sigaction+0x7d/0x1e0 [ 6852.434623] do_vfs_ioctl+0xa9/0x6c0 [ 6852.434865] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435124] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435387] ksys_ioctl+0x60/0x90 [ 6852.435663] __x64_sys_ioctl+0x16/0x20 [ 6852.435907] do_syscall_64+0x50/0x180 [ 6852.436150] entry_SYSCALL_64_after_hwframe+0x49/0xbe Further, as the replace thread enters scrub_write_page_to_dev_replace() without the target device it panics: static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { ... bio_set_dev(bio, sbio->dev->bdev); <====== [ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a0 .. [ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs] [ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260 [btrfs] .. [ 6929.721430] Call Trace: [ 6929.721663] scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs] [ 6929.721975] scrub_bio_end_io_worker+0x1af/0x490 [btrfs] [ 6929.722277] normal_work_helper+0xf0/0x4c0 [btrfs] [ 6929.722552] process_one_work+0x1f4/0x520 [ 6929.722805] ? process_one_work+0x16e/0x520 [ 6929.723063] worker_thread+0x46/0x3d0 [ 6929.723313] kthread+0xf8/0x130 [ 6929.723544] ? process_one_work+0x520/0x520 [ 6929.723800] ? kthread_delayed_work_timer_fn+0x80/0x80 [ 6929.724081] ret_from_fork+0x3a/0x50 Fix this by letting the btrfs_dev_replace_finishing() to do the job of cleaning after the cancel, including freeing of the target device. btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns along with the scrub return status. Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-11-14 13:50:26 +08:00
/*
* Scrub doing the replace isn't running so we need to do the
* cleanup step of btrfs_dev_replace_finishing() here
*/
result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
tgt_device = dev_replace->tgtdev;
src_device = dev_replace->srcdev;
dev_replace->tgtdev = NULL;
dev_replace->srcdev = NULL;
btrfs: fix use-after-free due to race between replace start and cancel The device replace cancel thread can race with the replace start thread and if fs_info::scrubs_running is not yet set, btrfs_scrub_cancel() will fail to stop the scrub thread. The scrub thread continues with the scrub for replace which then will try to write to the target device and which is already freed by the cancel thread. scrub_setup_ctx() warns as tgtdev is NULL. struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { ... if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); <=== sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; } [ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc started [ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc canceled [ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.432970] Call Trace: [ 6852.433202] btrfs_scrub_dev+0x19b/0x5c0 [btrfs] [ 6852.433471] btrfs_dev_replace_start+0x48c/0x6a0 [btrfs] [ 6852.433800] btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs] [ 6852.434097] btrfs_ioctl+0x2476/0x2d20 [btrfs] [ 6852.434365] ? do_sigaction+0x7d/0x1e0 [ 6852.434623] do_vfs_ioctl+0xa9/0x6c0 [ 6852.434865] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435124] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435387] ksys_ioctl+0x60/0x90 [ 6852.435663] __x64_sys_ioctl+0x16/0x20 [ 6852.435907] do_syscall_64+0x50/0x180 [ 6852.436150] entry_SYSCALL_64_after_hwframe+0x49/0xbe Further, as the replace thread enters scrub_write_page_to_dev_replace() without the target device it panics: static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { ... bio_set_dev(bio, sbio->dev->bdev); <====== [ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a0 .. [ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs] [ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260 [btrfs] .. [ 6929.721430] Call Trace: [ 6929.721663] scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs] [ 6929.721975] scrub_bio_end_io_worker+0x1af/0x490 [btrfs] [ 6929.722277] normal_work_helper+0xf0/0x4c0 [btrfs] [ 6929.722552] process_one_work+0x1f4/0x520 [ 6929.722805] ? process_one_work+0x16e/0x520 [ 6929.723063] worker_thread+0x46/0x3d0 [ 6929.723313] kthread+0xf8/0x130 [ 6929.723544] ? process_one_work+0x520/0x520 [ 6929.723800] ? kthread_delayed_work_timer_fn+0x80/0x80 [ 6929.724081] ret_from_fork+0x3a/0x50 Fix this by letting the btrfs_dev_replace_finishing() to do the job of cleaning after the cancel, including freeing of the target device. btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns along with the scrub return status. Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-11-14 13:50:26 +08:00
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
dev_replace->time_stopped = ktime_get_real_seconds();
dev_replace->item_needs_writeback = 1;
up_write(&dev_replace->rwsem);
/* Scrub for replace must not be running in suspended state */
ret = btrfs_scrub_cancel(fs_info);
ASSERT(ret != -ENOTCONN);
btrfs: fix use-after-free due to race between replace start and cancel The device replace cancel thread can race with the replace start thread and if fs_info::scrubs_running is not yet set, btrfs_scrub_cancel() will fail to stop the scrub thread. The scrub thread continues with the scrub for replace which then will try to write to the target device and which is already freed by the cancel thread. scrub_setup_ctx() warns as tgtdev is NULL. struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { ... if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); <=== sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; } [ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc started [ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc canceled [ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.432970] Call Trace: [ 6852.433202] btrfs_scrub_dev+0x19b/0x5c0 [btrfs] [ 6852.433471] btrfs_dev_replace_start+0x48c/0x6a0 [btrfs] [ 6852.433800] btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs] [ 6852.434097] btrfs_ioctl+0x2476/0x2d20 [btrfs] [ 6852.434365] ? do_sigaction+0x7d/0x1e0 [ 6852.434623] do_vfs_ioctl+0xa9/0x6c0 [ 6852.434865] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435124] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435387] ksys_ioctl+0x60/0x90 [ 6852.435663] __x64_sys_ioctl+0x16/0x20 [ 6852.435907] do_syscall_64+0x50/0x180 [ 6852.436150] entry_SYSCALL_64_after_hwframe+0x49/0xbe Further, as the replace thread enters scrub_write_page_to_dev_replace() without the target device it panics: static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { ... bio_set_dev(bio, sbio->dev->bdev); <====== [ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a0 .. [ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs] [ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260 [btrfs] .. [ 6929.721430] Call Trace: [ 6929.721663] scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs] [ 6929.721975] scrub_bio_end_io_worker+0x1af/0x490 [btrfs] [ 6929.722277] normal_work_helper+0xf0/0x4c0 [btrfs] [ 6929.722552] process_one_work+0x1f4/0x520 [ 6929.722805] ? process_one_work+0x16e/0x520 [ 6929.723063] worker_thread+0x46/0x3d0 [ 6929.723313] kthread+0xf8/0x130 [ 6929.723544] ? process_one_work+0x520/0x520 [ 6929.723800] ? kthread_delayed_work_timer_fn+0x80/0x80 [ 6929.724081] ret_from_fork+0x3a/0x50 Fix this by letting the btrfs_dev_replace_finishing() to do the job of cleaning after the cancel, including freeing of the target device. btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns along with the scrub return status. Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-11-14 13:50:26 +08:00
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return PTR_ERR(trans);
}
ret = btrfs_commit_transaction(trans);
WARN_ON(ret);
btrfs: fix use-after-free due to race between replace start and cancel The device replace cancel thread can race with the replace start thread and if fs_info::scrubs_running is not yet set, btrfs_scrub_cancel() will fail to stop the scrub thread. The scrub thread continues with the scrub for replace which then will try to write to the target device and which is already freed by the cancel thread. scrub_setup_ctx() warns as tgtdev is NULL. struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { ... if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); <=== sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; } [ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc started [ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc canceled [ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.432970] Call Trace: [ 6852.433202] btrfs_scrub_dev+0x19b/0x5c0 [btrfs] [ 6852.433471] btrfs_dev_replace_start+0x48c/0x6a0 [btrfs] [ 6852.433800] btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs] [ 6852.434097] btrfs_ioctl+0x2476/0x2d20 [btrfs] [ 6852.434365] ? do_sigaction+0x7d/0x1e0 [ 6852.434623] do_vfs_ioctl+0xa9/0x6c0 [ 6852.434865] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435124] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435387] ksys_ioctl+0x60/0x90 [ 6852.435663] __x64_sys_ioctl+0x16/0x20 [ 6852.435907] do_syscall_64+0x50/0x180 [ 6852.436150] entry_SYSCALL_64_after_hwframe+0x49/0xbe Further, as the replace thread enters scrub_write_page_to_dev_replace() without the target device it panics: static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { ... bio_set_dev(bio, sbio->dev->bdev); <====== [ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a0 .. [ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs] [ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260 [btrfs] .. [ 6929.721430] Call Trace: [ 6929.721663] scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs] [ 6929.721975] scrub_bio_end_io_worker+0x1af/0x490 [btrfs] [ 6929.722277] normal_work_helper+0xf0/0x4c0 [btrfs] [ 6929.722552] process_one_work+0x1f4/0x520 [ 6929.722805] ? process_one_work+0x16e/0x520 [ 6929.723063] worker_thread+0x46/0x3d0 [ 6929.723313] kthread+0xf8/0x130 [ 6929.723544] ? process_one_work+0x520/0x520 [ 6929.723800] ? kthread_delayed_work_timer_fn+0x80/0x80 [ 6929.724081] ret_from_fork+0x3a/0x50 Fix this by letting the btrfs_dev_replace_finishing() to do the job of cleaning after the cancel, including freeing of the target device. btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns along with the scrub return status. Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-11-14 13:50:26 +08:00
btrfs_info_in_rcu(fs_info,
"suspended dev_replace from %s (devid %llu) to %s canceled",
btrfs_dev_name(src_device), src_device->devid,
btrfs_dev_name(tgt_device));
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
break;
default:
up_write(&dev_replace->rwsem);
btrfs: fix use-after-free due to race between replace start and cancel The device replace cancel thread can race with the replace start thread and if fs_info::scrubs_running is not yet set, btrfs_scrub_cancel() will fail to stop the scrub thread. The scrub thread continues with the scrub for replace which then will try to write to the target device and which is already freed by the cancel thread. scrub_setup_ctx() warns as tgtdev is NULL. struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) { ... if (is_dev_replace) { WARN_ON(!fs_info->dev_replace.tgtdev); <=== sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO; sctx->wr_tgtdev = fs_info->dev_replace.tgtdev; sctx->flush_all_writes = false; } [ 6724.497655] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc started [ 6753.945017] BTRFS info (device sdb): dev_replace from /dev/sdb (devid 1) to /dev/sdc canceled [ 6852.426700] WARNING: CPU: 0 PID: 4494 at fs/btrfs/scrub.c:622 scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.428928] RIP: 0010:scrub_setup_ctx.isra.19+0x220/0x230 [btrfs] ... [ 6852.432970] Call Trace: [ 6852.433202] btrfs_scrub_dev+0x19b/0x5c0 [btrfs] [ 6852.433471] btrfs_dev_replace_start+0x48c/0x6a0 [btrfs] [ 6852.433800] btrfs_dev_replace_by_ioctl+0x3a/0x60 [btrfs] [ 6852.434097] btrfs_ioctl+0x2476/0x2d20 [btrfs] [ 6852.434365] ? do_sigaction+0x7d/0x1e0 [ 6852.434623] do_vfs_ioctl+0xa9/0x6c0 [ 6852.434865] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435124] ? syscall_trace_enter+0x1c8/0x310 [ 6852.435387] ksys_ioctl+0x60/0x90 [ 6852.435663] __x64_sys_ioctl+0x16/0x20 [ 6852.435907] do_syscall_64+0x50/0x180 [ 6852.436150] entry_SYSCALL_64_after_hwframe+0x49/0xbe Further, as the replace thread enters scrub_write_page_to_dev_replace() without the target device it panics: static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, struct scrub_page *spage) { ... bio_set_dev(bio, sbio->dev->bdev); <====== [ 6929.715145] BUG: unable to handle kernel NULL pointer dereference at 00000000000000a0 .. [ 6929.717106] Workqueue: btrfs-scrub btrfs_scrub_helper [btrfs] [ 6929.717420] RIP: 0010:scrub_write_page_to_dev_replace+0xb4/0x260 [btrfs] .. [ 6929.721430] Call Trace: [ 6929.721663] scrub_write_block_to_dev_replace+0x3f/0x60 [btrfs] [ 6929.721975] scrub_bio_end_io_worker+0x1af/0x490 [btrfs] [ 6929.722277] normal_work_helper+0xf0/0x4c0 [btrfs] [ 6929.722552] process_one_work+0x1f4/0x520 [ 6929.722805] ? process_one_work+0x16e/0x520 [ 6929.723063] worker_thread+0x46/0x3d0 [ 6929.723313] kthread+0xf8/0x130 [ 6929.723544] ? process_one_work+0x520/0x520 [ 6929.723800] ? kthread_delayed_work_timer_fn+0x80/0x80 [ 6929.724081] ret_from_fork+0x3a/0x50 Fix this by letting the btrfs_dev_replace_finishing() to do the job of cleaning after the cancel, including freeing of the target device. btrfs_dev_replace_finishing() is called when btrfs_scub_dev() returns along with the scrub return status. Signed-off-by: Anand Jain <anand.jain@oracle.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-11-14 13:50:26 +08:00
result = -EINVAL;
}
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
return result;
}
void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
dev_replace->time_stopped = ktime_get_real_seconds();
dev_replace->item_needs_writeback = 1;
btrfs_info(fs_info, "suspending dev_replace for unmount");
break;
}
up_write(&dev_replace->rwsem);
mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
}
/* resume dev_replace procedure that was interrupted by unmount */
int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
{
struct task_struct *task;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
down_write(&dev_replace->rwsem);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
up_write(&dev_replace->rwsem);
return 0;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
break;
}
if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
btrfs_info(fs_info,
"cannot continue dev_replace, tgtdev is missing");
btrfs_info(fs_info,
"you may cancel the operation after 'mount -o degraded'");
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
up_write(&dev_replace->rwsem);
return 0;
}
up_write(&dev_replace->rwsem);
/*
* This could collide with a paused balance, but the exclusive op logic
* should never allow both to start and pause. We don't want to allow
* dev-replace to start anyway.
*/
if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
down_write(&dev_replace->rwsem);
dev_replace->replace_state =
BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
up_write(&dev_replace->rwsem);
btrfs_info(fs_info,
"cannot resume dev-replace, other exclusive operation running");
return 0;
}
task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
return PTR_ERR_OR_ZERO(task);
}
static int btrfs_dev_replace_kthread(void *data)
{
struct btrfs_fs_info *fs_info = data;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
u64 progress;
int ret;
progress = btrfs_dev_replace_progress(fs_info);
progress = div_u64(progress, 10);
btrfs_info_in_rcu(fs_info,
"continuing dev_replace from %s (devid %llu) to target %s @%u%%",
btrfs_dev_name(dev_replace->srcdev),
dev_replace->srcdev->devid,
btrfs_dev_name(dev_replace->tgtdev),
(unsigned int)progress);
ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
dev_replace->committed_cursor_left,
btrfs_device_get_total_bytes(dev_replace->srcdev),
&dev_replace->scrub_progress, 0, 1);
ret = btrfs_dev_replace_finishing(fs_info, ret);
WARN_ON(ret && ret != -ECANCELED);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return 0;
}
int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
{
if (!dev_replace->is_valid)
return 0;
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
return 0;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
/*
* return true even if tgtdev is missing (this is
* something that can happen if the dev_replace
* procedure is suspended by an umount and then
* the tgtdev is missing (or "btrfs dev scan") was
* not called and the filesystem is remounted
* in degraded state. This does not stop the
* dev_replace procedure. It needs to be canceled
* manually if the cancellation is wanted.
*/
break;
}
return 1;
}
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
{
percpu_counter_inc(&fs_info->dev_replace.bio_counter);
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
}
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
{
percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
cond_wake_up_nomb(&fs_info->dev_replace.replace_wait);
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
}
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
{
while (1) {
percpu_counter_inc(&fs_info->dev_replace.bio_counter);
if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
&fs_info->fs_state)))
break;
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
btrfs_bio_counter_dec(fs_info);
wait_event(fs_info->dev_replace.replace_wait,
Btrfs: fix use-after-free in the finishing procedure of the device replace During device replace test, we hit a null pointer deference (It was very easy to reproduce it by running xfstests' btrfs/011 on the devices with the virtio scsi driver). There were two bugs that caused this problem: - We might allocate new chunks on the replaced device after we updated the mapping tree. And we forgot to replace the source device in those mapping of the new chunks. - We might get the mapping information which including the source device before the mapping information update. And then submit the bio which was based on that mapping information after we freed the source device. For the first bug, we can fix it by doing mapping tree update and source device remove in the same context of the chunk mutex. The chunk mutex is used to protect the allocable device list, the above method can avoid the new chunk allocation, and after we remove the source device, all the new chunks will be allocated on the new device. So it can fix the first bug. For the second bug, we need make sure all flighting bios are finished and no new bios are produced during we are removing the source device. To fix this problem, we introduced a global @bio_counter, we not only inc/dec @bio_counter outsize of map_blocks, but also inc it before submitting bio and dec @bio_counter when ending bios. Since Raid56 is a little different and device replace dosen't support raid56 yet, it is not addressed in the patch and I add comments to make sure we will fix it in the future. Reported-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Signed-off-by: Wang Shilong <wangsl.fnst@cn.fujitsu.com> Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fb.com>
2014-01-30 16:46:55 +08:00
!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
&fs_info->fs_state));
}
}