ext4: mballoc: add blocks to PA list under same spinlock after allocating blocks
ext4_mb_discard_preallocations() only checks grp->bb_prealloc_list of every
group to discard that group's PAs and free up space when an allocation
request fails. Consider the race below:

Process A                               Process B

1. allocate blocks
                                        1. Fails block allocation from
                                           ext4_mb_regular_allocator()
   ext4_lock_group()
       allocated blocks
       more than ac_o_ex.fe_len
   ext4_unlock_group()
                                        2. Scans grp->bb_prealloc_list
                                           (under ext4_lock_group()),
                                           finds nothing and thus
                                           returns -ENOSPC.

2. Add the additional blocks to the PA list

   ext4_lock_group()
       add blocks to grp->bb_prealloc_list
   ext4_unlock_group()

The above race can be avoided by adding those additional blocks to
grp->bb_prealloc_list at the same time as the block allocation, while
ext4_lock_group() is still held. With this, discard-PA will know whether
there are actually any blocks that could be freed from the PA.

Signed-off-by: Ritesh Harjani <riteshh@linux.ibm.com>
Link: https://lore.kernel.org/r/a2217dd782585b42328981832e6d396abaaccb80.1589955723.git.riteshh@linux.ibm.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
commit 53f86b170d (parent de8ff14cab)
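The race above is about ordering: before this patch, the surplus blocks were published on grp->bb_prealloc_list only after ext4_lock_group() had been dropped and re-taken, leaving a window in which a concurrent discard sees an empty PA list and returns -ENOSPC. The following is a minimal, hedged userspace sketch of that ordering, not ext4 code: a pthread mutex stands in for ext4_lock_group(), prealloc_blocks stands in for grp->bb_prealloc_list, and the names alloc_thread, discard_thread and fixed are illustrative assumptions.

/* Hedged userspace model of the race described above; NOT kernel code. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER; /* ~ ext4_lock_group() */
static int prealloc_blocks;     /* ~ blocks visible on grp->bb_prealloc_list */
static int free_blocks = 8;     /* blocks left in the block group */

/* Process A: allocates more blocks than requested, like mballoc does. */
static void *alloc_thread(void *arg)
{
        bool fixed = *(bool *)arg;
        int surplus;

        pthread_mutex_lock(&group_lock);
        surplus = free_blocks - 2;              /* asked for 2, took everything */
        free_blocks = 0;
        if (fixed)
                prealloc_blocks += surplus;     /* patched ordering: publish PA under the same lock */
        pthread_mutex_unlock(&group_lock);

        if (!fixed) {
                /* pre-patch ordering: PA published later, after the lock was dropped */
                pthread_mutex_lock(&group_lock);
                prealloc_blocks += surplus;
                pthread_mutex_unlock(&group_lock);
        }
        return NULL;
}

/* Process B: its allocation failed, so it tries to discard preallocations. */
static void *discard_thread(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&group_lock);
        if (prealloc_blocks == 0)
                printf("discard: PA list empty -> -ENOSPC\n");
        else
                printf("discard: could reclaim %d blocks\n", prealloc_blocks);
        pthread_mutex_unlock(&group_lock);
        return NULL;
}

int main(void)
{
        bool fixed = true;      /* set to false to model the pre-patch window */
        pthread_t a, b;

        pthread_create(&a, NULL, alloc_thread, &fixed);
        pthread_create(&b, NULL, discard_thread, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}

Build with "cc -pthread race.c -o race". Whether the bad interleaving actually fires depends on scheduling; the point is that with fixed = true there is no window in which the blocks are allocated but not yet visible on the PA list.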
fs/ext4/mballoc.c
@@ -349,6 +349,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                                ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
                                                ext4_group_t group);
+static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
 
 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 {
@@ -1701,6 +1702,14 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
                sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
                spin_unlock(&sbi->s_md_lock);
        }
+       /*
+        * As we've just preallocated more space than
+        * user requested originally, we store allocated
+        * space in a special descriptor.
+        */
+       if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
+               ext4_mb_new_preallocation(ac);
+
 }
 
 /*
@@ -1949,7 +1958,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
 
                ext4_mb_use_best_found(ac, e4b);
 
-               BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len);
+               BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
 
                if (EXT4_SB(sb)->s_mb_stats)
                        atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
@@ -3675,7 +3684,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 /*
  * creates new preallocated space for given inode
  */
-static noinline_for_stack int
+static noinline_for_stack void
 ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
 {
        struct super_block *sb = ac->ac_sb;
@@ -3688,10 +3697,9 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+       BUG_ON(ac->ac_pa == NULL);
 
-       pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
-       if (pa == NULL)
-               return -ENOMEM;
+       pa = ac->ac_pa;
 
        if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
                int winl;
@@ -3735,7 +3743,6 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
        pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
        pa->pa_len = ac->ac_b_ex.fe_len;
        pa->pa_free = pa->pa_len;
-       atomic_set(&pa->pa_count, 1);
        spin_lock_init(&pa->pa_lock);
        INIT_LIST_HEAD(&pa->pa_inode_list);
        INIT_LIST_HEAD(&pa->pa_group_list);
@@ -3755,21 +3762,17 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
        pa->pa_obj_lock = &ei->i_prealloc_lock;
        pa->pa_inode = ac->ac_inode;
 
-       ext4_lock_group(sb, ac->ac_b_ex.fe_group);
        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
-       ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
 
        spin_lock(pa->pa_obj_lock);
        list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
        spin_unlock(pa->pa_obj_lock);
-
-       return 0;
 }
 
 /*
  * creates new preallocated space for locality group inodes belongs to
  */
-static noinline_for_stack int
+static noinline_for_stack void
 ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
 {
        struct super_block *sb = ac->ac_sb;
@@ -3781,11 +3784,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
        BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
        BUG_ON(ac->ac_status != AC_STATUS_FOUND);
        BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
+       BUG_ON(ac->ac_pa == NULL);
 
-       BUG_ON(ext4_pspace_cachep == NULL);
-       pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
-       if (pa == NULL)
-               return -ENOMEM;
+       pa = ac->ac_pa;
 
        /* preallocation can change ac_b_ex, thus we store actually
         * allocated blocks for history */
@@ -3795,7 +3796,6 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
        pa->pa_lstart = pa->pa_pstart;
        pa->pa_len = ac->ac_b_ex.fe_len;
        pa->pa_free = pa->pa_len;
-       atomic_set(&pa->pa_count, 1);
        spin_lock_init(&pa->pa_lock);
        INIT_LIST_HEAD(&pa->pa_inode_list);
        INIT_LIST_HEAD(&pa->pa_group_list);
@@ -3816,26 +3816,20 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
        pa->pa_obj_lock = &lg->lg_prealloc_lock;
        pa->pa_inode = NULL;
 
-       ext4_lock_group(sb, ac->ac_b_ex.fe_group);
        list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
-       ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
 
        /*
         * We will later add the new pa to the right bucket
         * after updating the pa_free in ext4_mb_release_context
         */
-       return 0;
 }
 
-static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
+static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
 {
-       int err;
-
        if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
-               err = ext4_mb_new_group_pa(ac);
+               ext4_mb_new_group_pa(ac);
        else
-               err = ext4_mb_new_inode_pa(ac);
-       return err;
+               ext4_mb_new_inode_pa(ac);
 }
 
 /*
@@ -4150,6 +4144,29 @@ repeat:
        }
 }
 
+static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
+{
+       struct ext4_prealloc_space *pa;
+
+       BUG_ON(ext4_pspace_cachep == NULL);
+       pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
+       if (!pa)
+               return -ENOMEM;
+       atomic_set(&pa->pa_count, 1);
+       ac->ac_pa = pa;
+       return 0;
+}
+
+static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
+{
+       struct ext4_prealloc_space *pa = ac->ac_pa;
+
+       BUG_ON(!pa);
+       ac->ac_pa = NULL;
+       WARN_ON(!atomic_dec_and_test(&pa->pa_count));
+       kmem_cache_free(ext4_pspace_cachep, pa);
+}
+
 #ifdef CONFIG_EXT4_DEBUG
 static inline void ext4_mb_show_pa(struct super_block *sb)
 {
@@ -4606,23 +4623,28 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
        if (!ext4_mb_use_preallocated(ac)) {
                ac->ac_op = EXT4_MB_HISTORY_ALLOC;
                ext4_mb_normalize_request(ac, ar);
+
+               *errp = ext4_mb_pa_alloc(ac);
+               if (*errp)
+                       goto errout;
 repeat:
                /* allocate space in core */
                *errp = ext4_mb_regular_allocator(ac);
-               if (*errp)
-                       goto discard_and_exit;
-
-               /* as we've just preallocated more space than
-                * user requested originally, we store allocated
-                * space in a special descriptor */
-               if (ac->ac_status == AC_STATUS_FOUND &&
-                   ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
-                       *errp = ext4_mb_new_preallocation(ac);
+               /*
+                * pa allocated above is added to grp->bb_prealloc_list only
+                * when we were able to allocate some block i.e. when
+                * ac->ac_status == AC_STATUS_FOUND.
+                * And error from above mean ac->ac_status != AC_STATUS_FOUND
+                * So we have to free this pa here itself.
+                */
                if (*errp) {
-               discard_and_exit:
+                       ext4_mb_pa_free(ac);
                        ext4_discard_allocated_blocks(ac);
                        goto errout;
                }
+               if (ac->ac_status == AC_STATUS_FOUND &&
+                       ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
+                       ext4_mb_pa_free(ac);
        }
        if (likely(ac->ac_status == AC_STATUS_FOUND)) {
                *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4637,6 +4659,11 @@ repeat:
                freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
                if (freed)
                        goto repeat;
+               /*
+                * If block allocation fails then the pa allocated above
+                * needs to be freed here itself.
+                */
+               ext4_mb_pa_free(ac);
                *errp = -ENOSPC;
        }
 