btrfs: scrub: Introduce full stripe lock for RAID56
Unlike mirror-based profiles, RAID5/6 recovery needs to read out the whole full stripe, and without proper protection this can easily cause race conditions.

Introduce two new functions, lock_full_stripe() and unlock_full_stripe(), for RAID5/6. They maintain an rb_tree of mutexes, one per full stripe, so scrub callers can lock a full stripe to avoid races.

Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
Reviewed-by: David Sterba <dsterba@suse.com>
[ minor comment adjustments ]
Signed-off-by: David Sterba <dsterba@suse.com>
commit 0966a7b130 (parent fa7aede2ab)
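For context, here is a minimal sketch of how a scrub path is expected to pair the two helpers introduced below. The surrounding function is hypothetical (the real call sites land in a later patch of this series); only lock_full_stripe() and unlock_full_stripe() come from this commit.

/*
 * Hypothetical caller, for illustration only: shows the intended
 * lock_full_stripe()/unlock_full_stripe() pairing within one context.
 */
static int scrub_recover_one_block(struct btrfs_fs_info *fs_info, u64 logical)
{
        bool full_stripe_locked;
        int ret;

        /*
         * No-op for non-RAID56 block groups; otherwise takes the
         * per-full-stripe mutex covering @logical.
         */
        ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
        if (ret < 0)
                return ret;

        /* ... read and repair the full stripe here ... */

        /* Must run in the same context; no-op if we never locked */
        return unlock_full_stripe(fs_info, logical, full_stripe_locked);
}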
@@ -539,6 +539,14 @@ struct btrfs_io_ctl {
         unsigned check_crcs:1;
 };
 
+/*
+ * Tree to record all locked full stripes of a RAID5/6 block group
+ */
+struct btrfs_full_stripe_locks_tree {
+        struct rb_root root;
+        struct mutex lock;
+};
+
 struct btrfs_block_group_cache {
         struct btrfs_key key;
         struct btrfs_block_group_item item;
@@ -649,6 +657,9 @@ struct btrfs_block_group_cache {
          * Protected by free_space_lock.
          */
         int needs_free_space;
+
+        /* Record locked full stripes for RAID5/6 block group */
+        struct btrfs_full_stripe_locks_tree full_stripe_locks_root;
 };
 
 /* delayed seq elem */
@@ -3653,6 +3664,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
                          struct btrfs_device *dev);
 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
                          struct btrfs_scrub_progress *progress);
+static inline void btrfs_init_full_stripe_locks_tree(
+                                struct btrfs_full_stripe_locks_tree *locks_root)
+{
+        locks_root->root = RB_ROOT;
+        mutex_init(&locks_root->lock);
+}
 
 /* dev-replace.c */
 void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
@@ -131,6 +131,16 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
         if (atomic_dec_and_test(&cache->count)) {
                 WARN_ON(cache->pinned > 0);
                 WARN_ON(cache->reserved > 0);
+
+                /*
+                 * If not empty, someone is still holding mutex of
+                 * full_stripe_lock, which can only be released by caller.
+                 * And it will definitely cause use-after-free when caller
+                 * tries to release full stripe lock.
+                 *
+                 * No better way to resolve, but only to warn.
+                 */
+                WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
                 kfree(cache->free_space_ctl);
                 kfree(cache);
         }
@@ -9917,6 +9927,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info,
         btrfs_init_free_space_ctl(cache);
         atomic_set(&cache->trimming, 0);
         mutex_init(&cache->free_space_lock);
+        btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root);
 
         return cache;
 }
fs/btrfs/scrub.c: 223 lines changed
@@ -240,6 +240,13 @@ struct scrub_warning {
         struct btrfs_device *dev;
 };
 
+struct full_stripe_lock {
+        struct rb_node node;
+        u64 logical;
+        u64 refs;
+        struct mutex mutex;
+};
+
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
@@ -348,6 +355,222 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
         scrub_pause_off(fs_info);
 }
 
+/*
+ * Insert new full stripe lock into full stripe locks tree
+ *
+ * Return pointer to existing or newly inserted full_stripe_lock structure if
+ * everything works well.
+ * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
+ *
+ * NOTE: caller must hold full_stripe_locks_root->lock before calling this
+ * function
+ */
+static struct full_stripe_lock *insert_full_stripe_lock(
+                struct btrfs_full_stripe_locks_tree *locks_root,
+                u64 fstripe_logical)
+{
+        struct rb_node **p;
+        struct rb_node *parent = NULL;
+        struct full_stripe_lock *entry;
+        struct full_stripe_lock *ret;
+
+        WARN_ON(!mutex_is_locked(&locks_root->lock));
+
+        p = &locks_root->root.rb_node;
+        while (*p) {
+                parent = *p;
+                entry = rb_entry(parent, struct full_stripe_lock, node);
+                if (fstripe_logical < entry->logical) {
+                        p = &(*p)->rb_left;
+                } else if (fstripe_logical > entry->logical) {
+                        p = &(*p)->rb_right;
+                } else {
+                        entry->refs++;
+                        return entry;
+                }
+        }
+
+        /* Insert new lock */
+        ret = kmalloc(sizeof(*ret), GFP_KERNEL);
+        if (!ret)
+                return ERR_PTR(-ENOMEM);
+        ret->logical = fstripe_logical;
+        ret->refs = 1;
+        mutex_init(&ret->mutex);
+
+        rb_link_node(&ret->node, parent, p);
+        rb_insert_color(&ret->node, &locks_root->root);
+        return ret;
+}
+
+/*
+ * Search for a full stripe lock of a block group
+ *
+ * Return pointer to existing full stripe lock if found
+ * Return NULL if not found
+ */
+static struct full_stripe_lock *search_full_stripe_lock(
+                struct btrfs_full_stripe_locks_tree *locks_root,
+                u64 fstripe_logical)
+{
+        struct rb_node *node;
+        struct full_stripe_lock *entry;
+
+        WARN_ON(!mutex_is_locked(&locks_root->lock));
+
+        node = locks_root->root.rb_node;
+        while (node) {
+                entry = rb_entry(node, struct full_stripe_lock, node);
+                if (fstripe_logical < entry->logical)
+                        node = node->rb_left;
+                else if (fstripe_logical > entry->logical)
+                        node = node->rb_right;
+                else
+                        return entry;
+        }
+        return NULL;
+}
+
+/*
+ * Helper to get full stripe logical from a normal bytenr.
+ *
+ * Caller must ensure @cache is a RAID56 block group.
+ */
+static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
+                                   u64 bytenr)
+{
+        u64 ret;
+
+        /*
+         * Due to chunk item size limit, full stripe length should not be
+         * larger than U32_MAX. Just a sanity check here.
+         */
+        WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
+
+        /*
+         * round_down() can only handle power of 2, while RAID56 full
+         * stripe length can be 64KiB * n, so we need to manually round down.
+         */
+        ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
+              cache->full_stripe_len + cache->key.objectid;
+        return ret;
+}
+
+/*
+ * Lock a full stripe to avoid concurrency of recovery and read
+ *
+ * It's only used for profiles with parities (RAID5/6), for other profiles it
+ * does nothing.
+ *
+ * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
+ * So caller must call unlock_full_stripe() at the same context.
+ *
+ * Return <0 if encounters error.
+ */
+static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+                            bool *locked_ret)
+{
+        struct btrfs_block_group_cache *bg_cache;
+        struct btrfs_full_stripe_locks_tree *locks_root;
+        struct full_stripe_lock *existing;
+        u64 fstripe_start;
+        int ret = 0;
+
+        *locked_ret = false;
+        bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+        if (!bg_cache) {
+                ASSERT(0);
+                return -ENOENT;
+        }
+
+        /* Profiles not based on parity don't need full stripe lock */
+        if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+                goto out;
+        locks_root = &bg_cache->full_stripe_locks_root;
+
+        fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+        /* Now insert the full stripe lock */
+        mutex_lock(&locks_root->lock);
+        existing = insert_full_stripe_lock(locks_root, fstripe_start);
+        mutex_unlock(&locks_root->lock);
+        if (IS_ERR(existing)) {
+                ret = PTR_ERR(existing);
+                goto out;
+        }
+        mutex_lock(&existing->mutex);
+        *locked_ret = true;
+out:
+        btrfs_put_block_group(bg_cache);
+        return ret;
+}
+
+/*
+ * Unlock a full stripe.
+ *
+ * NOTE: Caller must ensure it's the same context calling corresponding
+ * lock_full_stripe().
+ *
+ * Return 0 if we unlock full stripe without problem.
+ * Return <0 for error
+ */
+static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
+                              bool locked)
+{
+        struct btrfs_block_group_cache *bg_cache;
+        struct btrfs_full_stripe_locks_tree *locks_root;
+        struct full_stripe_lock *fstripe_lock;
+        u64 fstripe_start;
+        bool freeit = false;
+        int ret = 0;
+
+        /* If we didn't acquire full stripe lock, no need to continue */
+        if (!locked)
+                return 0;
+
+        bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
+        if (!bg_cache) {
+                ASSERT(0);
+                return -ENOENT;
+        }
+        if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
+                goto out;
+
+        locks_root = &bg_cache->full_stripe_locks_root;
+        fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
+
+        mutex_lock(&locks_root->lock);
+        fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
+        /* Unpaired unlock_full_stripe() detected */
+        if (!fstripe_lock) {
+                WARN_ON(1);
+                ret = -ENOENT;
+                mutex_unlock(&locks_root->lock);
+                goto out;
+        }
+
+        if (fstripe_lock->refs == 0) {
+                WARN_ON(1);
+                btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
+                        fstripe_lock->logical);
+        } else {
+                fstripe_lock->refs--;
+        }
+
+        if (fstripe_lock->refs == 0) {
+                rb_erase(&fstripe_lock->node, &locks_root->root);
+                freeit = true;
+        }
+        mutex_unlock(&locks_root->lock);
+
+        mutex_unlock(&fstripe_lock->mutex);
+        if (freeit)
+                kfree(fstripe_lock);
+out:
+        btrfs_put_block_group(bg_cache);
+        return ret;
+}
+
 /*
  * used for workers that require transaction commits (i.e., for the
  * NOCOW case)
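A note on the rounding in get_full_stripe_logical(): since a RAID56 full stripe length is 64KiB times the number of data stripes, it is generally not a power of two, so round_down() cannot be used and the kernel code rounds down via div64_u64(). The arithmetic can be checked in userspace with a small standalone sketch; all numbers below are made up for illustration, and plain integer division stands in for div64_u64().

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t bg_start = 1048576;        /* block group start (1MiB), hypothetical */
        uint64_t full_stripe_len = 196608;  /* 3 data stripes * 64KiB, not a power of two */
        uint64_t bytenr = 1500000;          /* some logical address inside the block group */

        /* Same formula as get_full_stripe_logical(): round down to a full stripe boundary */
        uint64_t fstripe_start = (bytenr - bg_start) / full_stripe_len *
                                 full_stripe_len + bg_start;

        /* Prints 1441792 = 1MiB + 2 * 192KiB, the start of the containing full stripe */
        printf("%llu\n", (unsigned long long)fstripe_start);
        return 0;
}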