Miscellaneous ext4 bug fixes for v5.12.

-----BEGIN PGP SIGNATURE-----
 
 iQEzBAABCAAdFiEEK2m5VNv+CHkogTfJ8vlZVpUNgaMFAmBXj1oACgkQ8vlZVpUN
 gaNnAwgAqZJ0S/Hctexs+v+DNvuyMxsA84pB/9KYlK2zgbBOyK5Iftxjqxb9Sb6j
 6XKQOIaP2EXYJ0MDWW/fDMUHatlJvXUp+A9kLTiOLMDaRXbobQzb5jlGg9ZB/pBj
 TzISrR4widiqJbVT2RFpO9O7B75BQqlpqFNfkF/yJ9CU/ozAw9x+voPcZK7q8/Sh
 +DeQCARvgfx1ZipHGTYKjJdujA0qGcDfboYJpgId/gA5Zi76tx4NlbeXAM2QmRfh
 zAd1NzFhqf7JmKDAWDdUeRnrDHcje9FLcAxo7Quq7YWxRKFsOCz9LTxazL2UIoa2
 HvGpMD23qmISCLUyyrfnrpGPj/mD2w==
 =xcuH
 -----END PGP SIGNATURE-----

Merge tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 fixes from Ted Ts'o:
 "Miscellaneous ext4 bug fixes for v5.12"

* tag 'ext4_for_linus_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: initialize ret to suppress smatch warning
  ext4: stop inode update before return
  ext4: fix rename whiteout with fast commit
  ext4: fix timer use-after-free on failed mount
  ext4: fix potential error in ext4_do_update_inode
  ext4: do not try to set xattr into ea_inode if value is empty
  ext4: do not iput inode under running transaction in ext4_rename()
  ext4: find old entry again if failed to rename whiteout
  ext4: fix error handling in ext4_end_enable_verity()
  ext4: fix bh ref count on error paths
  fs/ext4: fix integer overflow in s_log_groups_per_flex
  ext4: add reclaim checks to xattr code
  ext4: shrink race window in ext4_should_retry_alloc()
This commit is contained in:
Linus Torvalds 2021-03-21 14:06:10 -07:00
commit d7f5f1bd3c
11 changed files with 170 additions and 74 deletions

View File

@ -626,27 +626,41 @@ int ext4_claim_free_clusters(struct ext4_sb_info *sbi,
/**
* ext4_should_retry_alloc() - check if a block allocation should be retried
* @sb: super block
* @retries: number of attemps has been made
* @sb: superblock
* @retries: number of retry attempts made so far
*
* ext4_should_retry_alloc() is called when ENOSPC is returned, and if
* it is profitable to retry the operation, this function will wait
* for the current or committing transaction to complete, and then
* return TRUE. We will only retry once.
* ext4_should_retry_alloc() is called when ENOSPC is returned while
* attempting to allocate blocks. If there's an indication that a pending
* journal transaction might free some space and allow another attempt to
* succeed, this function will wait for the current or committing transaction
* to complete and then return TRUE.
*/
int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
if (!ext4_has_free_clusters(EXT4_SB(sb), 1, 0) ||
(*retries)++ > 1 ||
!EXT4_SB(sb)->s_journal)
struct ext4_sb_info *sbi = EXT4_SB(sb);
if (!sbi->s_journal)
return 0;
if (++(*retries) > 3) {
percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit);
return 0;
}
/*
* if there's no indication that blocks are about to be freed it's
* possible we just missed a transaction commit that did so
*/
smp_mb();
if (EXT4_SB(sb)->s_mb_free_pending == 0)
return 0;
if (sbi->s_mb_free_pending == 0)
return ext4_has_free_clusters(sbi, 1, 0);
/*
* it's possible we've just missed a transaction commit here,
* so ignore the returned status
*/
jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
(void) jbd2_journal_force_commit_nested(sbi->s_journal);
return 1;
}

View File

@ -1484,6 +1484,7 @@ struct ext4_sb_info {
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
struct percpu_counter s_dirtyclusters_counter;
struct percpu_counter s_sra_exceeded_retry_limit;
struct blockgroup_lock *s_blockgroup_lock;
struct proc_dir_entry *s_proc;
struct kobject s_kobj;
@ -2793,6 +2794,8 @@ void __ext4_fc_track_link(handle_t *handle, struct inode *inode,
struct dentry *dentry);
void ext4_fc_track_unlink(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_link(handle_t *handle, struct dentry *dentry);
void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
struct dentry *dentry);
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
void ext4_fc_mark_ineligible(struct super_block *sb, int reason);

View File

@ -4382,7 +4382,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
{
struct inode *inode = file_inode(file);
handle_t *handle;
int ret, ret2 = 0, ret3 = 0;
int ret = 0, ret2 = 0, ret3 = 0;
int retries = 0;
int depth = 0;
struct ext4_map_blocks map;

View File

@ -513,10 +513,10 @@ void ext4_fc_track_link(handle_t *handle, struct dentry *dentry)
__ext4_fc_track_link(handle, d_inode(dentry), dentry);
}
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
struct dentry *dentry)
{
struct __track_dentry_update_args args;
struct inode *inode = d_inode(dentry);
int ret;
args.dentry = dentry;
@ -527,6 +527,11 @@ void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
trace_ext4_fc_track_create(inode, dentry, ret);
}
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry)
{
__ext4_fc_track_create(handle, d_inode(dentry), dentry);
}
/* __track_fn for inode tracking */
static int __track_inode(struct inode *inode, void *arg, bool update)
{

View File

@ -1938,13 +1938,13 @@ static int __ext4_journalled_writepage(struct page *page,
if (!ret)
ret = err;
if (!ext4_has_inline_data(inode))
ext4_walk_page_buffers(NULL, page_bufs, 0, len,
NULL, bput_one);
ext4_set_inode_state(inode, EXT4_STATE_JDATA);
out:
unlock_page(page);
out_no_pagelock:
if (!inline_data && page_bufs)
ext4_walk_page_buffers(NULL, page_bufs, 0, len,
NULL, bput_one);
brelse(inode_bh);
return ret;
}
@ -5026,7 +5026,7 @@ static int ext4_do_update_inode(handle_t *handle,
struct ext4_inode_info *ei = EXT4_I(inode);
struct buffer_head *bh = iloc->bh;
struct super_block *sb = inode->i_sb;
int err = 0, rc, block;
int err = 0, block;
int need_datasync = 0, set_large_file = 0;
uid_t i_uid;
gid_t i_gid;
@ -5138,9 +5138,9 @@ static int ext4_do_update_inode(handle_t *handle,
bh->b_data);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
rc = ext4_handle_dirty_metadata(handle, NULL, bh);
if (!err)
err = rc;
err = ext4_handle_dirty_metadata(handle, NULL, bh);
if (err)
goto out_brelse;
ext4_clear_inode_state(inode, EXT4_STATE_NEW);
if (set_large_file) {
BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get write access");
@ -5387,8 +5387,10 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
inode->i_gid = attr->ia_gid;
error = ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
if (unlikely(error))
if (unlikely(error)) {
ext4_fc_stop_update(inode);
return error;
}
}
if (attr->ia_valid & ATTR_SIZE) {

View File

@ -2709,8 +2709,15 @@ static int ext4_mb_init_backend(struct super_block *sb)
}
if (ext4_has_feature_flex_bg(sb)) {
/* a single flex group is supposed to be read by a single IO */
sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex,
/* a single flex group is supposed to be read by a single IO.
* 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is
* unsigned integer, so the maximum shift is 32.
*/
if (sbi->s_es->s_log_groups_per_flex >= 32) {
ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group");
goto err_freesgi;
}
sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex,
BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9));
sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */
} else {

View File

@ -3613,6 +3613,31 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent,
return retval;
}
static void ext4_resetent(handle_t *handle, struct ext4_renament *ent,
unsigned ino, unsigned file_type)
{
struct ext4_renament old = *ent;
int retval = 0;
/*
* old->de could have moved from under us during make indexed dir,
* so the old->de may no longer valid and need to find it again
* before reset old inode info.
*/
old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
if (IS_ERR(old.bh))
retval = PTR_ERR(old.bh);
if (!old.bh)
retval = -ENOENT;
if (retval) {
ext4_std_error(old.dir->i_sb, retval);
return;
}
ext4_setent(handle, &old, ino, file_type);
brelse(old.bh);
}
static int ext4_find_delete_entry(handle_t *handle, struct inode *dir,
const struct qstr *d_name)
{
@ -3774,14 +3799,14 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
*/
retval = -ENOENT;
if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
goto end_rename;
goto release_bh;
new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
&new.de, &new.inlined);
if (IS_ERR(new.bh)) {
retval = PTR_ERR(new.bh);
new.bh = NULL;
goto end_rename;
goto release_bh;
}
if (new.bh) {
if (!new.inode) {
@ -3798,15 +3823,13 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits);
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
handle = NULL;
goto end_rename;
goto release_bh;
}
} else {
whiteout = ext4_whiteout_for_rename(mnt_userns, &old, credits, &handle);
if (IS_ERR(whiteout)) {
retval = PTR_ERR(whiteout);
whiteout = NULL;
goto end_rename;
goto release_bh;
}
}
@ -3850,6 +3873,7 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
retval = ext4_mark_inode_dirty(handle, whiteout);
if (unlikely(retval))
goto end_rename;
}
if (!new.bh) {
retval = ext4_add_entry(handle, new.dentry, old.inode);
@ -3923,6 +3947,8 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
ext4_fc_track_unlink(handle, new.dentry);
__ext4_fc_track_link(handle, old.inode, new.dentry);
__ext4_fc_track_unlink(handle, old.inode, old.dentry);
if (whiteout)
__ext4_fc_track_create(handle, whiteout, old.dentry);
}
if (new.inode) {
@ -3937,19 +3963,21 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
end_rename:
if (whiteout) {
if (retval) {
ext4_setent(handle, &old,
old.inode->i_ino, old_file_type);
ext4_resetent(handle, &old,
old.inode->i_ino, old_file_type);
drop_nlink(whiteout);
ext4_orphan_add(handle, whiteout);
}
unlock_new_inode(whiteout);
ext4_journal_stop(handle);
iput(whiteout);
} else {
ext4_journal_stop(handle);
}
release_bh:
brelse(old.dir_bh);
brelse(old.bh);
brelse(new.bh);
if (handle)
ext4_journal_stop(handle);
return retval;
}

View File

@ -1210,6 +1210,7 @@ static void ext4_put_super(struct super_block *sb)
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
percpu_free_rwsem(&sbi->s_writepages_rwsem);
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
@ -5011,6 +5012,9 @@ no_journal:
if (!err)
err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
GFP_KERNEL);
if (!err)
err = percpu_counter_init(&sbi->s_sra_exceeded_retry_limit, 0,
GFP_KERNEL);
if (!err)
err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
@ -5124,6 +5128,7 @@ failed_mount6:
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
percpu_counter_destroy(&sbi->s_sra_exceeded_retry_limit);
percpu_free_rwsem(&sbi->s_writepages_rwsem);
failed_mount5:
ext4_ext_release(sb);
@ -5149,8 +5154,8 @@ failed_mount_wq:
failed_mount3a:
ext4_es_unregister_shrinker(sbi);
failed_mount3:
del_timer_sync(&sbi->s_err_report);
flush_work(&sbi->s_error_work);
del_timer_sync(&sbi->s_err_report);
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
failed_mount2:

View File

@ -24,6 +24,7 @@ typedef enum {
attr_session_write_kbytes,
attr_lifetime_write_kbytes,
attr_reserved_clusters,
attr_sra_exceeded_retry_limit,
attr_inode_readahead,
attr_trigger_test_error,
attr_first_error_time,
@ -202,6 +203,7 @@ EXT4_ATTR_FUNC(delayed_allocation_blocks, 0444);
EXT4_ATTR_FUNC(session_write_kbytes, 0444);
EXT4_ATTR_FUNC(lifetime_write_kbytes, 0444);
EXT4_ATTR_FUNC(reserved_clusters, 0644);
EXT4_ATTR_FUNC(sra_exceeded_retry_limit, 0444);
EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, inode_readahead,
ext4_sb_info, s_inode_readahead_blks);
@ -251,6 +253,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(session_write_kbytes),
ATTR_LIST(lifetime_write_kbytes),
ATTR_LIST(reserved_clusters),
ATTR_LIST(sra_exceeded_retry_limit),
ATTR_LIST(inode_readahead_blks),
ATTR_LIST(inode_goal),
ATTR_LIST(mb_stats),
@ -374,6 +377,10 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
return snprintf(buf, PAGE_SIZE, "%llu\n",
(unsigned long long)
atomic64_read(&sbi->s_resv_clusters));
case attr_sra_exceeded_retry_limit:
return snprintf(buf, PAGE_SIZE, "%llu\n",
(unsigned long long)
percpu_counter_sum(&sbi->s_sra_exceeded_retry_limit));
case attr_inode_readahead:
case attr_pointer_ui:
if (!ptr)

View File

@ -201,55 +201,76 @@ static int ext4_end_enable_verity(struct file *filp, const void *desc,
struct inode *inode = file_inode(filp);
const int credits = 2; /* superblock and inode for ext4_orphan_del() */
handle_t *handle;
struct ext4_iloc iloc;
int err = 0;
int err2;
if (desc != NULL) {
/* Succeeded; write the verity descriptor. */
err = ext4_write_verity_descriptor(inode, desc, desc_size,
merkle_tree_size);
/* Write all pages before clearing VERITY_IN_PROGRESS. */
if (!err)
err = filemap_write_and_wait(inode->i_mapping);
}
/* If we failed, truncate anything we wrote past i_size. */
if (desc == NULL || err)
ext4_truncate(inode);
/*
* We must always clean up by clearing EXT4_STATE_VERITY_IN_PROGRESS and
* deleting the inode from the orphan list, even if something failed.
* If everything succeeded, we'll also set the verity bit in the same
* transaction.
* If an error already occurred (which fs/verity/ signals by passing
* desc == NULL), then only clean-up is needed.
*/
if (desc == NULL)
goto cleanup;
ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
/* Append the verity descriptor. */
err = ext4_write_verity_descriptor(inode, desc, desc_size,
merkle_tree_size);
if (err)
goto cleanup;
/*
* Write all pages (both data and verity metadata). Note that this must
* happen before clearing EXT4_STATE_VERITY_IN_PROGRESS; otherwise pages
* beyond i_size won't be written properly. For crash consistency, this
* also must happen before the verity inode flag gets persisted.
*/
err = filemap_write_and_wait(inode->i_mapping);
if (err)
goto cleanup;
/*
* Finally, set the verity inode flag and remove the inode from the
* orphan list (in a single transaction).
*/
handle = ext4_journal_start(inode, EXT4_HT_INODE, credits);
if (IS_ERR(handle)) {
ext4_orphan_del(NULL, inode);
return PTR_ERR(handle);
err = PTR_ERR(handle);
goto cleanup;
}
err2 = ext4_orphan_del(handle, inode);
if (err2)
goto out_stop;
err = ext4_orphan_del(handle, inode);
if (err)
goto stop_and_cleanup;
if (desc != NULL && !err) {
struct ext4_iloc iloc;
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
goto stop_and_cleanup;
ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
ext4_set_inode_flags(inode, false);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (err)
goto stop_and_cleanup;
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
goto out_stop;
ext4_set_inode_flag(inode, EXT4_INODE_VERITY);
ext4_set_inode_flags(inode, false);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
}
out_stop:
ext4_journal_stop(handle);
return err ?: err2;
ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
return 0;
stop_and_cleanup:
ext4_journal_stop(handle);
cleanup:
/*
* Verity failed to be enabled, so clean up by truncating any verity
* metadata that was written beyond i_size (both from cache and from
* disk), removing the inode from the orphan list (if it wasn't done
* already), and clearing EXT4_STATE_VERITY_IN_PROGRESS.
*/
truncate_inode_pages(inode->i_mapping, inode->i_size);
ext4_truncate(inode);
ext4_orphan_del(NULL, inode);
ext4_clear_inode_state(inode, EXT4_STATE_VERITY_IN_PROGRESS);
return err;
}
static int ext4_get_verity_descriptor_location(struct inode *inode,

View File

@ -1462,6 +1462,9 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
if (!ce)
return NULL;
WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) &&
!(current->flags & PF_MEMALLOC_NOFS));
ea_data = kvmalloc(value_len, GFP_KERNEL);
if (!ea_data) {
mb_cache_entry_put(ea_inode_cache, ce);
@ -2327,6 +2330,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
error = -ENOSPC;
goto cleanup;
}
WARN_ON_ONCE(!(current->flags & PF_MEMALLOC_NOFS));
}
error = ext4_reserve_inode_write(handle, inode, &is.iloc);
@ -2400,7 +2404,7 @@ retry_inode:
* external inode if possible.
*/
if (ext4_has_feature_ea_inode(inode->i_sb) &&
!i.in_inode) {
i.value_len && !i.in_inode) {
i.in_inode = 1;
goto retry_inode;
}