ext4: mballoc: add blocks to PA list under same spinlock after allocating blocks

When an allocation request fails, ext4_mb_discard_preallocations() only
checks the grp->bb_prealloc_list of every group to find PAs it can discard
to free up space. Consider the race below:

Process A  				Process B

1. allocate blocks
					1. Fails block allocation from
					     ext4_mb_regular_allocator()
   ext4_lock_group()
	allocated blocks
	more than ac_o_ex.fe_len
   ext4_unlock_group()
					2. Scans the
					   grp->bb_prealloc_list (under
					   ext4_lock_group()) and
					   finds nothing and thus returns
					   -ENOSPC.

2. Add the additional blocks to PA list

   ext4_lock_group()
   	add blocks to grp->bb_prealloc_list
   ext4_unlock_group()

The above race can be avoided if we add those additional blocks to
grp->bb_prealloc_list at the same time as the block allocation, while
ext4_lock_group() is still held.
With this, discard-PA will know whether there are actually any blocks
which could be freed from the PA.

Signed-off-by: Ritesh Harjani <riteshh@linux.ibm.com>
Link: https://lore.kernel.org/r/a2217dd782585b42328981832e6d396abaaccb80.1589955723.git.riteshh@linux.ibm.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
This commit is contained in:
Ritesh Harjani 2020-05-20 12:10:32 +05:30 committed by Theodore Ts'o
parent de8ff14cab
commit 53f86b170d
1 changed files with 62 additions and 35 deletions

View File

@ -349,6 +349,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group); ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group); ext4_group_t group);
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
static inline void *mb_correct_addr_and_bit(int *bit, void *addr) static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
{ {
@ -1701,6 +1702,14 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac,
sbi->s_mb_last_start = ac->ac_f_ex.fe_start; sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
spin_unlock(&sbi->s_md_lock); spin_unlock(&sbi->s_md_lock);
} }
/*
* As we've just preallocated more space than
* user requested originally, we store allocated
* space in a special descriptor.
*/
if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
ext4_mb_new_preallocation(ac);
} }
/* /*
@ -1949,7 +1958,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
ext4_mb_use_best_found(ac, e4b); ext4_mb_use_best_found(ac, e4b);
BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len);
if (EXT4_SB(sb)->s_mb_stats) if (EXT4_SB(sb)->s_mb_stats)
atomic_inc(&EXT4_SB(sb)->s_bal_2orders); atomic_inc(&EXT4_SB(sb)->s_bal_2orders);
@ -3675,7 +3684,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
/* /*
* creates new preallocated space for given inode * creates new preallocated space for given inode
*/ */
static noinline_for_stack int static noinline_for_stack void
ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
{ {
struct super_block *sb = ac->ac_sb; struct super_block *sb = ac->ac_sb;
@ -3688,10 +3697,9 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(ac->ac_status != AC_STATUS_FOUND);
BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
BUG_ON(ac->ac_pa == NULL);
pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); pa = ac->ac_pa;
if (pa == NULL)
return -ENOMEM;
if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
int winl; int winl;
@ -3735,7 +3743,6 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_len = ac->ac_b_ex.fe_len;
pa->pa_free = pa->pa_len; pa->pa_free = pa->pa_len;
atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock); spin_lock_init(&pa->pa_lock);
INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_inode_list);
INIT_LIST_HEAD(&pa->pa_group_list); INIT_LIST_HEAD(&pa->pa_group_list);
@ -3755,21 +3762,17 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa->pa_obj_lock = &ei->i_prealloc_lock; pa->pa_obj_lock = &ei->i_prealloc_lock;
pa->pa_inode = ac->ac_inode; pa->pa_inode = ac->ac_inode;
ext4_lock_group(sb, ac->ac_b_ex.fe_group);
list_add(&pa->pa_group_list, &grp->bb_prealloc_list); list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
spin_lock(pa->pa_obj_lock); spin_lock(pa->pa_obj_lock);
list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list);
spin_unlock(pa->pa_obj_lock); spin_unlock(pa->pa_obj_lock);
return 0;
} }
/* /*
* creates new preallocated space for locality group inodes belongs to * creates new preallocated space for locality group inodes belongs to
*/ */
static noinline_for_stack int static noinline_for_stack void
ext4_mb_new_group_pa(struct ext4_allocation_context *ac) ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
{ {
struct super_block *sb = ac->ac_sb; struct super_block *sb = ac->ac_sb;
@ -3781,11 +3784,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len);
BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(ac->ac_status != AC_STATUS_FOUND);
BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); BUG_ON(!S_ISREG(ac->ac_inode->i_mode));
BUG_ON(ac->ac_pa == NULL);
BUG_ON(ext4_pspace_cachep == NULL); pa = ac->ac_pa;
pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS);
if (pa == NULL)
return -ENOMEM;
/* preallocation can change ac_b_ex, thus we store actually /* preallocation can change ac_b_ex, thus we store actually
* allocated blocks for history */ * allocated blocks for history */
@ -3795,7 +3796,6 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_lstart = pa->pa_pstart; pa->pa_lstart = pa->pa_pstart;
pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_len = ac->ac_b_ex.fe_len;
pa->pa_free = pa->pa_len; pa->pa_free = pa->pa_len;
atomic_set(&pa->pa_count, 1);
spin_lock_init(&pa->pa_lock); spin_lock_init(&pa->pa_lock);
INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_inode_list);
INIT_LIST_HEAD(&pa->pa_group_list); INIT_LIST_HEAD(&pa->pa_group_list);
@ -3816,26 +3816,20 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
pa->pa_obj_lock = &lg->lg_prealloc_lock; pa->pa_obj_lock = &lg->lg_prealloc_lock;
pa->pa_inode = NULL; pa->pa_inode = NULL;
ext4_lock_group(sb, ac->ac_b_ex.fe_group);
list_add(&pa->pa_group_list, &grp->bb_prealloc_list); list_add(&pa->pa_group_list, &grp->bb_prealloc_list);
ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
/* /*
* We will later add the new pa to the right bucket * We will later add the new pa to the right bucket
* after updating the pa_free in ext4_mb_release_context * after updating the pa_free in ext4_mb_release_context
*/ */
return 0;
} }
static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
{ {
int err;
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
err = ext4_mb_new_group_pa(ac); ext4_mb_new_group_pa(ac);
else else
err = ext4_mb_new_inode_pa(ac); ext4_mb_new_inode_pa(ac);
return err;
} }
/* /*
@ -4150,6 +4144,29 @@ repeat:
} }
} }
/*
 * Allocate a zero-filled preallocation descriptor from ext4_pspace_cachep
 * and attach it to the allocation context.
 *
 * The descriptor's pa_count is set to 1 before it is stored in ac->ac_pa.
 *
 * Returns 0 on success, or -ENOMEM if the slab allocation fails (in which
 * case ac->ac_pa is left untouched).
 */
static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac)
{
	struct ext4_prealloc_space *new_pa;

	/* The slab cache must exist before any descriptor can be allocated. */
	BUG_ON(!ext4_pspace_cachep);

	new_pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS);
	if (new_pa == NULL)
		return -ENOMEM;

	atomic_set(&new_pa->pa_count, 1);
	ac->ac_pa = new_pa;

	return 0;
}
/*
 * Detach the preallocation descriptor from the allocation context and
 * return it to ext4_pspace_cachep.
 *
 * ac->ac_pa must be non-NULL on entry (BUG otherwise) and is NULL on return.
 * A warning is raised if dropping one reference does not bring pa_count to
 * zero; the descriptor is freed regardless.
 */
static void ext4_mb_pa_free(struct ext4_allocation_context *ac)
{
	struct ext4_prealloc_space *victim = ac->ac_pa;

	BUG_ON(victim == NULL);

	/* Clear the context's pointer before the descriptor is released. */
	ac->ac_pa = NULL;

	/* Drop one reference; warn if that was not the last one. */
	WARN_ON(!atomic_dec_and_test(&victim->pa_count));

	kmem_cache_free(ext4_pspace_cachep, victim);
}
#ifdef CONFIG_EXT4_DEBUG #ifdef CONFIG_EXT4_DEBUG
static inline void ext4_mb_show_pa(struct super_block *sb) static inline void ext4_mb_show_pa(struct super_block *sb)
{ {
@ -4606,23 +4623,28 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
if (!ext4_mb_use_preallocated(ac)) { if (!ext4_mb_use_preallocated(ac)) {
ac->ac_op = EXT4_MB_HISTORY_ALLOC; ac->ac_op = EXT4_MB_HISTORY_ALLOC;
ext4_mb_normalize_request(ac, ar); ext4_mb_normalize_request(ac, ar);
*errp = ext4_mb_pa_alloc(ac);
if (*errp)
goto errout;
repeat: repeat:
/* allocate space in core */ /* allocate space in core */
*errp = ext4_mb_regular_allocator(ac); *errp = ext4_mb_regular_allocator(ac);
if (*errp) /*
goto discard_and_exit; * pa allocated above is added to grp->bb_prealloc_list only
* when we were able to allocate some block i.e. when
/* as we've just preallocated more space than * ac->ac_status == AC_STATUS_FOUND.
* user requested originally, we store allocated * And error from above mean ac->ac_status != AC_STATUS_FOUND
* space in a special descriptor */ * So we have to free this pa here itself.
if (ac->ac_status == AC_STATUS_FOUND && */
ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
*errp = ext4_mb_new_preallocation(ac);
if (*errp) { if (*errp) {
discard_and_exit: ext4_mb_pa_free(ac);
ext4_discard_allocated_blocks(ac); ext4_discard_allocated_blocks(ac);
goto errout; goto errout;
} }
if (ac->ac_status == AC_STATUS_FOUND &&
ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
ext4_mb_pa_free(ac);
} }
if (likely(ac->ac_status == AC_STATUS_FOUND)) { if (likely(ac->ac_status == AC_STATUS_FOUND)) {
*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@ -4637,6 +4659,11 @@ repeat:
freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len);
if (freed) if (freed)
goto repeat; goto repeat;
/*
* If block allocation fails then the pa allocated above
* needs to be freed here itself.
*/
ext4_mb_pa_free(ac);
*errp = -ENOSPC; *errp = -ENOSPC;
} }