From 666525dfbdca09bbd4848ac711e4a4dbd6921325 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Mon, 7 Apr 2014 10:18:56 -0400 Subject: [PATCH 01/31] ext4: fix 64-bit number truncation warning '0x7FDEADBEEF' will be truncated to 32-bit number under unicore32. Need append 'ULL' for it. The related warning (with allmodconfig under unicore32): CC [M] fs/ext4/extents_status.o fs/ext4/extents_status.c: In function "__es_remove_extent": fs/ext4/extents_status.c:813: warning: integer constant is too large for "long" type Signed-off-by: Chen Gang Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents_status.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 0a014a7194b2..0ebc21204b51 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -810,7 +810,7 @@ retry: newes.es_lblk = end + 1; newes.es_len = len2; - block = 0x7FDEADBEEF; + block = 0x7FDEADBEEFULL; if (ext4_es_is_written(&orig_es) || ext4_es_is_unwritten(&orig_es)) block = ext4_es_pblock(&orig_es) + From 4adb6ab3e0fa71363a5ef229544b2d17de6600d7 Mon Sep 17 00:00:00 2001 From: Kazuya Mio Date: Mon, 7 Apr 2014 10:53:28 -0400 Subject: [PATCH 02/31] ext4: FIBMAP ioctl causes BUG_ON due to handle EXT_MAX_BLOCKS When we try to get 2^32-1 block of the file which has the extent (ee_block=2^32-2, ee_len=1) with FIBMAP ioctl, it causes BUG_ON in ext4_ext_put_gap_in_cache(). To avoid the problem, ext4_map_blocks() needs to check the file logical block number. ext4_ext_put_gap_in_cache() called via ext4_map_blocks() cannot handle 2^32-1 because the maximum file logical block number is 2^32-2. Note that ext4_ind_map_blocks() returns -EIO when the block number is invalid. So ext4_map_blocks() should also return the same errno. Signed-off-by: Kazuya Mio Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/inode.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5b0d2c7d5408..93f16c5e8a8e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -522,6 +522,10 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (unlikely(map->m_len > INT_MAX)) map->m_len = INT_MAX; + /* We can handle the block number less than EXT_MAX_BLOCKS */ + if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) + return -EIO; + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { ext4_es_lru_add(inode); From 007649375f6af242d5b1df2c15996949714303ba Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 7 Apr 2014 10:54:20 -0400 Subject: [PATCH 03/31] ext4: initialize multi-block allocator before checking block descriptors With EXT4FS_DEBUG ext4_count_free_clusters() will call ext4_read_block_bitmap() without s_group_info initialized, so we need to initialize multi-block allocator before. And dependencies that must be solved, to allow this: - multi-block allocator needs in group descriptors - need to install s_op before initializing multi-block allocator, because in ext4_mb_init_backend() new inode is created. - initialize number of group desc blocks (s_gdb_count) otherwise number of clusters returned by ext4_free_clusters_after_init() is not correct. (see ext4_bg_num_gdb_nometa()) Here is the stack backtrace: (gdb) bt #0 ext4_get_group_info (group=0, sb=0xffff880079a10000) at ext4.h:2430 #1 ext4_validate_block_bitmap (sb=sb@entry=0xffff880079a10000, desc=desc@entry=0xffff880056510000, block_group=block_group@entry=0, bh=bh@entry=0xffff88007bf2b2d8) at balloc.c:358 #2 0xffffffff81232202 in ext4_wait_block_bitmap (sb=sb@entry=0xffff880079a10000, block_group=block_group@entry=0, bh=bh@entry=0xffff88007bf2b2d8) at balloc.c:476 #3 0xffffffff81232eaf in ext4_read_block_bitmap (sb=sb@entry=0xffff880079a10000, block_group=block_group@entry=0) at balloc.c:489 #4 0xffffffff81232fc0 in ext4_count_free_clusters (sb=sb@entry=0xffff880079a10000) at balloc.c:665 #5 0xffffffff81259ffa in ext4_check_descriptors (first_not_zeroed=, sb=0xffff880079a10000) at super.c:2143 #6 ext4_fill_super (sb=sb@entry=0xffff880079a10000, data=, data@entry=0x0 , silent=silent@entry=0) at super.c:3851 ... Signed-off-by: Azat Khuzhin Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 51 ++++++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f3c667091618..6f9e6fadac04 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3869,19 +3869,38 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount2; } } + + /* + * set up enough so that it can read an inode, + * and create new inode for buddy allocator + */ + sbi->s_gdb_count = db_count; + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; + + ext4_ext_init(sb); + err = ext4_mb_init(sb); + if (err) { + ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", + err); + goto failed_mount2; + } + if (!ext4_check_descriptors(sb, &first_not_zeroed)) { ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); - goto failed_mount2; + goto failed_mount2a; } if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) if (!ext4_fill_flex_info(sb)) { ext4_msg(sb, KERN_ERR, "unable to initialize " "flex_bg meta info!"); - goto failed_mount2; + goto failed_mount2a; } - sbi->s_gdb_count = db_count; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); @@ -3916,14 +3935,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_extent_max_zeroout_kb = 32; - /* - * set up enough so that it can read an inode - */ - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - sb->s_op = &ext4_sops; - else - sb->s_op = &ext4_nojournal_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA @@ -4113,21 +4124,13 @@ no_journal: if (err) { ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for " "reserved pool", ext4_calculate_resv_clusters(sb)); - goto failed_mount4a; + goto failed_mount5; } err = ext4_setup_system_zone(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize system " "zone (%d)", err); - goto failed_mount4a; - } - - ext4_ext_init(sb); - err = ext4_mb_init(sb); - if (err) { - ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", - err); goto failed_mount5; } @@ -4204,11 +4207,8 @@ failed_mount8: failed_mount7: ext4_unregister_li_request(sb); failed_mount6: - ext4_mb_release(sb); -failed_mount5: - ext4_ext_release(sb); ext4_release_system_zone(sb); -failed_mount4a: +failed_mount5: dput(sb->s_root); sb->s_root = NULL; failed_mount4: @@ -4232,11 +4232,14 @@ failed_mount3: percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); +failed_mount2a: + ext4_mb_release(sb); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); ext4_kvfree(sbi->s_group_desc); failed_mount: + ext4_ext_release(sb); if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); if (sbi->s_proc) { From 9503c67c93ed0b95ba62d12d1fd09da6245dbdd6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 7 Apr 2014 10:54:20 -0400 Subject: [PATCH 04/31] ext4: note the error in ext4_end_bio() ext4_end_bio() currently throws away the error that it receives. Chances are this is part of a spate of errors, one of which will end up getting the error returned to userspace somehow, but we shouldn't take that risk. Also print out the errno to aid in debug. Signed-off-by: Matthew Wilcox Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara Cc: stable@vger.kernel.org --- fs/ext4/page-io.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index ab95508e3d40..c18d95b50540 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -308,13 +308,14 @@ static void ext4_end_bio(struct bio *bio, int error) if (error) { struct inode *inode = io_end->inode; - ext4_warning(inode->i_sb, "I/O error writing to inode %lu " + ext4_warning(inode->i_sb, "I/O error %d writing to inode %lu " "(offset %llu size %ld starting block %llu)", - inode->i_ino, + error, inode->i_ino, (unsigned long long) io_end->offset, (long) io_end->size, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + mapping_set_error(inode->i_mapping, error); } if (io_end->flag & EXT4_IO_END_UNWRITTEN) { From ec4cb1aa2b7bae18dd8164f2e9c7c51abcf61280 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 7 Apr 2014 10:54:21 -0400 Subject: [PATCH 05/31] ext4: fix jbd2 warning under heavy xattr load When heavily exercising xattr code the assertion that jbd2_journal_dirty_metadata() shouldn't return error was triggered: WARNING: at /srv/autobuild-ceph/gitbuilder.git/build/fs/jbd2/transaction.c:1237 jbd2_journal_dirty_metadata+0x1ba/0x260() CPU: 0 PID: 8877 Comm: ceph-osd Tainted: G W 3.10.0-ceph-00049-g68d04c9 #1 Hardware name: Dell Inc. PowerEdge R410/01V648, BIOS 1.6.3 02/07/2011 ffffffff81a1d3c8 ffff880214469928 ffffffff816311b0 ffff880214469968 ffffffff8103fae0 ffff880214469958 ffff880170a9dc30 ffff8802240fbe80 0000000000000000 ffff88020b366000 ffff8802256e7510 ffff880214469978 Call Trace: [] dump_stack+0x19/0x1b [] warn_slowpath_common+0x70/0xa0 [] warn_slowpath_null+0x1a/0x20 [] jbd2_journal_dirty_metadata+0x1ba/0x260 [] __ext4_handle_dirty_metadata+0xa3/0x140 [] ext4_xattr_release_block+0x103/0x1f0 [] ext4_xattr_block_set+0x1e0/0x910 [] ext4_xattr_set_handle+0x38b/0x4a0 [] ? trace_hardirqs_on+0xd/0x10 [] ext4_xattr_set+0xc2/0x140 [] ext4_xattr_user_set+0x47/0x50 [] generic_setxattr+0x6e/0x90 [] __vfs_setxattr_noperm+0x7b/0x1c0 [] vfs_setxattr+0xc4/0xd0 [] setxattr+0x13e/0x1e0 [] ? __sb_start_write+0xe7/0x1b0 [] ? mnt_want_write_file+0x28/0x60 [] ? fget_light+0x3c/0x130 [] ? mnt_want_write_file+0x28/0x60 [] ? __mnt_want_write+0x58/0x70 [] SyS_fsetxattr+0xbe/0x100 [] system_call_fastpath+0x16/0x1b The reason for the warning is that buffer_head passed into jbd2_journal_dirty_metadata() didn't have journal_head attached. This is caused by the following race of two ext4_xattr_release_block() calls: CPU1 CPU2 ext4_xattr_release_block() ext4_xattr_release_block() lock_buffer(bh); /* False */ if (BHDR(bh)->h_refcount == cpu_to_le32(1)) } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); unlock_buffer(bh); lock_buffer(bh); /* True */ if (BHDR(bh)->h_refcount == cpu_to_le32(1)) get_bh(bh); ext4_free_blocks() ... jbd2_journal_forget() jbd2_journal_unfile_buffer() -> JH is gone error = ext4_handle_dirty_xattr_block(handle, inode, bh); -> triggers the warning We fix the problem by moving ext4_handle_dirty_xattr_block() under the buffer lock. Sadly this cannot be done in nojournal mode as that function can call sync_dirty_buffer() which would deadlock. Luckily in nojournal mode the race is harmless (we only dirty already freed buffer) and thus for nojournal mode we leave the dirtying outside of the buffer lock. Reported-by: Sage Weil Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/xattr.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1f5cf5880718..4eec399ec807 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -520,8 +520,8 @@ static void ext4_xattr_update_super_block(handle_t *handle, } /* - * Release the xattr block BH: If the reference count is > 1, decrement - * it; otherwise free the block. + * Release the xattr block BH: If the reference count is > 1, decrement it; + * otherwise free the block. */ static void ext4_xattr_release_block(handle_t *handle, struct inode *inode, @@ -542,16 +542,31 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, if (ce) mb_cache_entry_free(ce); get_bh(bh); + unlock_buffer(bh); ext4_free_blocks(handle, inode, bh, 0, 1, EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET); - unlock_buffer(bh); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); if (ce) mb_cache_entry_release(ce); + /* + * Beware of this ugliness: Releasing of xattr block references + * from different inodes can race and so we have to protect + * from a race where someone else frees the block (and releases + * its journal_head) before we are done dirtying the buffer. In + * nojournal mode this race is harmless and we actually cannot + * call ext4_handle_dirty_xattr_block() with locked buffer as + * that function can call sync_dirty_buffer() so for that case + * we handle the dirtying after unlocking the buffer. + */ + if (ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); unlock_buffer(bh); - error = ext4_handle_dirty_xattr_block(handle, inode, bh); + if (!ext4_handle_valid(handle)) + error = ext4_handle_dirty_xattr_block(handle, inode, + bh); if (IS_SYNC(inode)) ext4_handle_sync(handle); dquot_free_block(inode, EXT4_C2B(EXT4_SB(inode->i_sb), 1)); From 87f7e41636ff201148443551d06bc74497160aac Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 8 Apr 2014 11:38:28 -0400 Subject: [PATCH 06/31] ext4: update PF_MEMALLOC handling in ext4_write_inode() The special handling of PF_MEMALLOC callers in ext4_write_inode() shouldn't be necessary as there shouldn't be any. Warn about it. Also update comment before the function as it seems somewhat outdated. (Changes modeled on an ext3 patch posted by Jan Kara to the linux-ext4 mailing list on Februaryt 28, 2014, which apparently never went into the ext3 tree.) Signed-off-by: "Theodore Ts'o" Cc: Jan Kara --- fs/ext4/inode.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 93f16c5e8a8e..7b93df9aa182 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4427,21 +4427,20 @@ out_brelse: * * We are called from a few places: * - * - Within generic_file_write() for O_SYNC files. + * - Within generic_file_aio_write() -> generic_write_sync() for O_SYNC files. * Here, there will be no transaction running. We wait for any running * transaction to commit. * - * - Within sys_sync(), kupdate and such. - * We wait on commit, if tol to. + * - Within flush work (sys_sync(), kupdate and such). + * We wait on commit, if told to. * - * - Within prune_icache() (PF_MEMALLOC == true) - * Here we simply return. We can't afford to block kswapd on the - * journal commit. + * - Within iput_final() -> write_inode_now() + * We wait on commit, if told to. * * In all cases it is actually safe for us to return without doing anything, * because the inode has been copied into a raw inode buffer in - * ext4_mark_inode_dirty(). This is a correctness thing for O_SYNC and for - * knfsd. + * ext4_mark_inode_dirty(). This is a correctness thing for WB_SYNC_ALL + * writeback. * * Note that we are absolutely dependent upon all inode dirtiers doing the * right thing: they *must* call mark_inode_dirty() after dirtying info in @@ -4453,15 +4452,15 @@ out_brelse: * stuff(); * inode->i_size = expr; * - * is in error because a kswapd-driven write_inode() could occur while - * `stuff()' is running, and the new i_size will be lost. Plus the inode - * will no longer be on the superblock's dirty inode list. + * is in error because write_inode() could occur while `stuff()' is running, + * and the new i_size will be lost. Plus the inode will no longer be on the + * superblock's dirty inode list. */ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (current->flags & PF_MEMALLOC) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (EXT4_SB(inode->i_sb)->s_journal) { From 1ce01c4a199c50b023802be25261c0c02b2f0214 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 10 Apr 2014 22:58:20 -0400 Subject: [PATCH 07/31] ext4: fix COLLAPSE_RANGE test failure in data journalling mode When mounting ext4 with data=journal option, xfstest shared/002 and shared/004 are currently failing as checksum computed for testfile does not match with the checksum computed in other journal modes. In case of data=journal mode, a call to filemap_write_and_wait_range will not flush anything to disk as buffers are not marked dirty in write_end. In collapse range this call is followed by a call to truncate_pagecache_range. Due to this, when checksum is computed, a portion of file is re-read from disk which replace valid data with NULL bytes and hence the reason for the difference in checksum. Calling ext4_force_commit before filemap_write_and_wait_range solves the issue as it will mark the buffers dirty during commit transaction which can be later synced by a call to filemap_write_and_wait_range. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 82df3ce9874a..be1e56cbbf32 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5383,6 +5383,13 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); + /* Call ext4_force_commit to flush all data in case of data=journal. */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + /* Write out all dirty pages */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1); if (ret) From c57ab39b9658315a742b6e61fdc86bb4d20cf566 Mon Sep 17 00:00:00 2001 From: Younger Liu Date: Thu, 10 Apr 2014 23:03:43 -0400 Subject: [PATCH 08/31] ext4: return ENOMEM rather than EIO when find_###_page() fails Return ENOMEM rather than EIO when find_get_page() fails in ext4_mb_get_buddy_page_lock() and find_or_create_page() fails in ext4_mb_load_buddy(). Signed-off-by: Younger Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a888cac76e9c..73ccbb3b973b 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -989,7 +989,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, poff = block % blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); @@ -1003,7 +1003,7 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, pnum = block / blocks_per_page; page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (!page) - return -EIO; + return -ENOMEM; BUG_ON(page->mapping != inode->i_mapping); e4b->bd_buddy_page = page; return 0; @@ -1168,7 +1168,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } @@ -1197,7 +1201,11 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, unlock_page(page); } } - if (page == NULL || !PageUptodate(page)) { + if (page == NULL) { + ret = -ENOMEM; + goto err; + } + if (!PageUptodate(page)) { ret = -EIO; goto err; } From 622cad1325e404598fe3b148c3fa640dbaabc235 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 11 Apr 2014 10:35:17 -0400 Subject: [PATCH 09/31] ext4: move ext4_update_i_disksize() into mpage_map_and_submit_extent() The function ext4_update_i_disksize() is used in only one place, in the function mpage_map_and_submit_extent(). Move its code to simplify the code paths, and also move the call to ext4_mark_inode_dirty() into the i_data_sem's critical region, to be consistent with all of the other places where we update i_disksize. That way, we also keep the raw_inode's i_disksize protected, to avoid the following race: CPU #1 CPU #2 down_write(&i_data_sem) Modify i_disk_size up_write(&i_data_sem) down_write(&i_data_sem) Modify i_disk_size Copy i_disk_size to on-disk inode up_write(&i_data_sem) Copy i_disk_size to on-disk inode Signed-off-by: "Theodore Ts'o" Reviewed-by: Jan Kara Cc: stable@vger.kernel.org --- fs/ext4/ext4.h | 17 ----------------- fs/ext4/inode.c | 14 ++++++++++++-- 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f1c65dc7cc0a..66946aa62127 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2466,23 +2466,6 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) up_write(&EXT4_I(inode)->i_data_sem); } -/* - * Update i_disksize after writeback has been started. Races with truncate - * are avoided by checking i_size under i_data_sem. - */ -static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize) -{ - loff_t i_size; - - down_write(&EXT4_I(inode)->i_data_sem); - i_size = i_size_read(inode); - if (newsize > i_size) - newsize = i_size; - if (newsize > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = newsize; - up_write(&EXT4_I(inode)->i_data_sem); -} - struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7b93df9aa182..f023f0cb46fc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2247,13 +2247,23 @@ static int mpage_map_and_submit_extent(handle_t *handle, return err; } while (map->m_len); - /* Update on-disk size after IO is submitted */ + /* + * Update on-disk size after IO is submitted. Races with + * truncate are avoided by checking i_size under i_data_sem. + */ disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT; if (disksize > EXT4_I(inode)->i_disksize) { int err2; + loff_t i_size; - ext4_wb_update_i_disksize(inode, disksize); + down_write(&EXT4_I(inode)->i_data_sem); + i_size = i_size_read(inode); + if (disksize > i_size) + disksize = i_size; + if (disksize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = disksize; err2 = ext4_mark_inode_dirty(handle, inode); + up_write(&EXT4_I(inode)->i_data_sem); if (err2) ext4_error(inode->i_sb, "Failed to mark inode %lu dirty", From 9ef06cec7c96f6bf59f1dd8b64b9645820099051 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 12 Apr 2014 09:47:00 -0400 Subject: [PATCH 10/31] ext4: remove unnecessary check for APPEND and IMMUTABLE All the checks IS_APPEND and IS_IMMUTABLE for the fallocate operation on the inode are done in vfs. No need to do this again in ext4. Remove it. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 6 ------ fs/ext4/inode.c | 6 +----- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index be1e56cbbf32..ed4ec48239b6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5398,12 +5398,6 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) /* Take mutex lock */ mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - ret = -EPERM; - goto out_mutex; - } - if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; goto out_mutex; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f023f0cb46fc..e2bba76f0d7b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3541,11 +3541,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } mutex_lock(&inode->i_mutex); - /* It's not possible punch hole on append only file */ - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) { - ret = -EPERM; - goto out_mutex; - } + if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; goto out_mutex; From 8fc61d92630d1c96057a94c61e1643475045b25b Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 12 Apr 2014 09:51:34 -0400 Subject: [PATCH 11/31] fs: prevent doing FALLOC_FL_ZERO_RANGE on append only file Currently punch hole and collapse range fallocate operation are not allowed on append only file. This should be case for zero range as well. Fix it by allowing only pure fallocate (possibly with keep size set). Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/open.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/open.c b/fs/open.c index 631aea815def..3a83253d3373 100644 --- a/fs/open.c +++ b/fs/open.c @@ -254,11 +254,9 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) return -EBADF; /* - * It's not possible to punch hole or perform collapse range - * on append only file + * We can only allow pure fallocate on append only files */ - if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE) - && IS_APPEND(inode)) + if ((mode & ~FALLOC_FL_KEEP_SIZE) && IS_APPEND(inode)) return -EPERM; if (IS_IMMUTABLE(inode)) From 23fffa925ea2c9a2bcb1a4453e2c542635aa3545 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 12 Apr 2014 09:56:41 -0400 Subject: [PATCH 12/31] fs: move falloc collapse range check into the filesystem methods Currently in do_fallocate in collapse range case we're checking whether offset + len is not bigger than i_size. However there is nothing which would prevent i_size from changing so the check is pointless. It should be done in the file system itself and the file system needs to make sure that i_size is not going to change. The i_size check for the other fallocate modes are also done in the filesystems. As it is now we can easily crash the kernel by having two processes doing truncate and fallocate collapse range at the same time. This can be reproduced on ext4 and it is theoretically possible on xfs even though I was not able to trigger it with this simple test. This commit removes the check from do_fallocate and adds it to the file system. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Acked-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/ext4/extents.c | 11 +++++++++-- fs/open.c | 8 -------- fs/xfs/xfs_file.c | 10 +++++++++- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ed4ec48239b6..ac5460d0d133 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5368,8 +5368,6 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) loff_t new_size; int ret; - BUG_ON(offset + len > i_size_read(inode)); - /* Collapse range works only on fs block size aligned offsets. */ if (offset & (EXT4_BLOCK_SIZE(sb) - 1) || len & (EXT4_BLOCK_SIZE(sb) - 1)) @@ -5398,6 +5396,15 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) /* Take mutex lock */ mutex_lock(&inode->i_mutex); + /* + * There is no need to overlap collapse range with EOF, in which case + * it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + ret = -EINVAL; + goto out_mutex; + } + if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; goto out_mutex; diff --git a/fs/open.c b/fs/open.c index 3a83253d3373..adf34202213a 100644 --- a/fs/open.c +++ b/fs/open.c @@ -284,14 +284,6 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) return -EFBIG; - /* - * There is no need to overlap collapse range with EOF, in which case - * it is effectively a truncate operation - */ - if ((mode & FALLOC_FL_COLLAPSE_RANGE) && - (offset + len >= i_size_read(inode))) - return -EINVAL; - if (!file->f_op->fallocate) return -EOPNOTSUPP; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f7abff8c16ca..3cb528c4f27c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -840,7 +840,15 @@ xfs_file_fallocate( goto out_unlock; } - ASSERT(offset + len < i_size_read(inode)); + /* + * There is no need to overlap collapse range with EOF, + * in which case it is effectively a truncate operation + */ + if (offset + len >= i_size_read(inode)) { + error = -EINVAL; + goto out_unlock; + } + new_size = i_size_read(inode) - len; error = xfs_collapse_file_space(ip, offset, len); From 0790b31b69374ddadefebb156251b319e5b43345 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Sat, 12 Apr 2014 10:05:37 -0400 Subject: [PATCH 13/31] fs: disallow all fallocate operation on active swapfile Currently some file system have IS_SWAPFILE check in their fallocate implementations and some do not. However we should really prevent any fallocate operation on swapfile so move the check to vfs and remove the redundant checks from the file systems fallocate implementations. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ceph/file.c | 3 --- fs/ext4/extents.c | 5 ----- fs/ext4/inode.c | 5 ----- fs/open.c | 7 +++++++ 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 09c7afe32e49..596e6cc9f9c4 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1215,9 +1215,6 @@ static long ceph_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - mutex_lock(&inode->i_mutex); if (ceph_snap(inode) != CEPH_NOSNAP) { diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ac5460d0d133..b2d3869b5762 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5405,11 +5405,6 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - goto out_mutex; - } - /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e2bba76f0d7b..b74cfd2a42ec 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3542,11 +3542,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) mutex_lock(&inode->i_mutex); - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - goto out_mutex; - } - /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) goto out_mutex; diff --git a/fs/open.c b/fs/open.c index adf34202213a..7b823daa6a93 100644 --- a/fs/open.c +++ b/fs/open.c @@ -262,6 +262,13 @@ int do_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (IS_IMMUTABLE(inode)) return -EPERM; + /* + * We can not allow to do any fallocate operation on an active + * swapfile + */ + if (IS_SWAPFILE(inode)) + ret = -ETXTBSY; + /* * Revalidate the write permissions, in case security policy has * changed since the files were opened. From 6e6358fc3c3c862bfe9a5bc029d3f8ce43dc9765 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 12 Apr 2014 12:45:25 -0400 Subject: [PATCH 14/31] ext4: use i_size_read in ext4_unaligned_aio() We haven't taken i_mutex yet, so we need to use i_size_read(). Signed-off-by: "Theodore Ts'o" Cc: stable@vger.kernel.org --- fs/ext4/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6db7f7db7777..bc765591101a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -82,7 +82,7 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov, size_t count = iov_length(iov, nr_segs); loff_t final_size = pos + count; - if (pos >= inode->i_size) + if (pos >= i_size_read(inode)) return 0; if ((pos & blockmask) || (final_size & blockmask)) From 847c6c422aa0ae81a5517a9558ec2737806dca48 Mon Sep 17 00:00:00 2001 From: Zheng Liu Date: Sat, 12 Apr 2014 12:45:55 -0400 Subject: [PATCH 15/31] ext4: fix byte order problems introduced by the COLLAPSE_RANGE patches This commit tries to fix some byte order issues that is found by sparse check. $ make M=fs/ext4 C=2 CF=-D__CHECK_ENDIAN__ ... CHECK fs/ext4/extents.c fs/ext4/extents.c:5232:41: warning: restricted __le32 degrades to integer fs/ext4/extents.c:5236:52: warning: bad assignment (-=) to restricted __le32 fs/ext4/extents.c:5258:45: warning: bad assignment (-=) to restricted __le32 fs/ext4/extents.c:5303:28: warning: restricted __le32 degrades to integer fs/ext4/extents.c:5318:18: warning: incorrect type in assignment (different base types) fs/ext4/extents.c:5318:18: expected unsigned int [unsigned] [usertype] ex_start fs/ext4/extents.c:5318:18: got restricted __le32 [usertype] ee_block fs/ext4/extents.c:5319:24: warning: restricted __le32 degrades to integer fs/ext4/extents.c:5334:31: warning: incorrect type in assignment (different base types) ... Cc: Andreas Dilger Cc: Namjae Jeon Signed-off-by: Zheng Liu Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index b2d3869b5762..f24ef8697609 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5229,11 +5229,11 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) update = 1; - *start = ex_last->ee_block + + *start = le32_to_cpu(ex_last->ee_block) + ext4_ext_get_actual_len(ex_last); while (ex_start <= ex_last) { - ex_start->ee_block -= shift; + le32_add_cpu(&ex_start->ee_block, -shift); if (ex_start > EXT_FIRST_EXTENT(path[depth].p_hdr)) { if (ext4_ext_try_to_merge_right(inode, @@ -5255,7 +5255,7 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (err) goto out; - path[depth].p_idx->ei_block -= shift; + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; @@ -5300,7 +5300,8 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, return ret; } - stop_block = extent->ee_block + ext4_ext_get_actual_len(extent); + stop_block = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); ext4_ext_drop_refs(path); kfree(path); @@ -5315,8 +5316,9 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, path = ext4_ext_find_extent(inode, start - 1, NULL, 0); depth = path->p_depth; extent = path[depth].p_ext; - ex_start = extent->ee_block; - ex_end = extent->ee_block + ext4_ext_get_actual_len(extent); + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); ext4_ext_drop_refs(path); kfree(path); @@ -5331,7 +5333,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; - current_block = extent->ee_block; + current_block = le32_to_cpu(extent->ee_block); if (start > current_block) { /* Hole, move to the next extent */ ret = mext_next_extent(inode, path, &extent); From 40c406c74eb9eed58ae7d4d12a0197f7279c9499 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 12 Apr 2014 22:53:53 -0400 Subject: [PATCH 16/31] ext4: COLLAPSE_RANGE only works on extent-based files Unfortunately, we weren't checking to make sure of this the inode was extent-based before attempt operate on it. Hilarity ensues. Signed-off-by: "Theodore Ts'o" Cc: Namjae Jeon --- fs/ext4/extents.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f24ef8697609..96e0a4bc8faa 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4878,9 +4878,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_PUNCH_HOLE) return ext4_punch_hole(inode, offset, len); - if (mode & FALLOC_FL_COLLAPSE_RANGE) - return ext4_collapse_range(inode, offset, len); - ret = ext4_convert_inline_data(inode); if (ret) return ret; @@ -4892,6 +4889,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) return -EOPNOTSUPP; + if (mode & FALLOC_FL_COLLAPSE_RANGE) + return ext4_collapse_range(inode, offset, len); + if (mode & FALLOC_FL_ZERO_RANGE) return ext4_zero_range(file, offset, len, mode); From e2cbd587418251bb73c4c1e8e2c7c1816d7a98d9 Mon Sep 17 00:00:00 2001 From: jon ernst Date: Sat, 12 Apr 2014 23:01:28 -0400 Subject: [PATCH 17/31] ext4: silence sparse check warning for function ext4_trim_extent This fixes the following sparse warning: CHECK fs/ext4/mballoc.c fs/ext4/mballoc.c:5019:9: warning: context imbalance in 'ext4_trim_extent' - unexpected unlock Signed-off-by: "Jon Ernst" Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 73ccbb3b973b..c8238a26818c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5016,6 +5016,8 @@ error_return: */ static int ext4_trim_extent(struct super_block *sb, int start, int count, ext4_group_t group, struct ext4_buddy *e4b) +__releases(bitlock) +__acquires(bitlock) { struct ext4_free_extent ex; int ret = 0; From 8dc79ec4c0537e1b83c0739af82a7babefb30012 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sun, 13 Apr 2014 15:05:42 -0400 Subject: [PATCH 18/31] ext4: fix error handling in ext4_ext_shift_extents Fix error handling by adding some. :-) Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 96e0a4bc8faa..38be06354b88 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5314,11 +5314,18 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, * enough to accomodate the shift. */ path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + if (IS_ERR(path)) + return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; - ex_start = le32_to_cpu(extent->ee_block); - ex_end = le32_to_cpu(extent->ee_block) + + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } ext4_ext_drop_refs(path); kfree(path); From a18ed359bdddcded4f97ff5e2f07793ff9336913 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Sun, 13 Apr 2014 15:41:13 -0400 Subject: [PATCH 19/31] ext4: always check ext4_ext_find_extent result Where are some places where logic guaranties us that extent we are searching exits, but this may not be true due to on-disk data corruption. If such corruption happens we must prevent possible null pointer dereferences. Signed-off-by: Dmitry Monakhov Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 38be06354b88..64b400356cad 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3313,6 +3313,11 @@ static int ext4_split_extent(handle_t *handle, return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EIO; + } uninitialized = ext4_ext_is_uninitialized(ex); split_flag1 = 0; @@ -3694,6 +3699,12 @@ static int ext4_convert_initialized_extents(handle_t *handle, } depth = ext_depth(inode); ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + err = -EIO; + goto out; + } } err = ext4_ext_get_access(handle, inode, path + depth); @@ -5340,6 +5351,12 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; + if (!extent) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) start); + return -EIO; + } + current_block = le32_to_cpu(extent->ee_block); if (start > current_block) { /* Hole, move to the next extent */ From 036acea2ceabd19cb5734ae7a9d64c0a5ef90484 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 14 Apr 2014 23:36:15 -0400 Subject: [PATCH 20/31] ext4: fix ext4_count_free_clusters() with EXT4FS_DEBUG and bigalloc enabled With bigalloc enabled we must use EXT4_CLUSTERS_PER_GROUP() instead of EXT4_BLOCKS_PER_GROUP() otherwise we will go beyond the allocated buffer. $ mount -t ext4 /dev/vde /vde [ 70.573993] EXT4-fs DEBUG (fs/ext4/mballoc.c, 2346): ext4_mb_alloc_groupinfo: [ 70.575174] allocated s_groupinfo array for 1 meta_bg's [ 70.576172] EXT4-fs DEBUG (fs/ext4/super.c, 2092): ext4_check_descriptors: [ 70.576972] Checking group descriptorsBUG: unable to handle kernel paging request at ffff88006ab56000 [ 72.463686] IP: [] __bitmap_weight+0x2a/0x7f [ 72.464168] PGD 295e067 PUD 2961067 PMD 7fa8e067 PTE 800000006ab56060 [ 72.464738] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC [ 72.465139] Modules linked in: [ 72.465402] CPU: 1 PID: 3560 Comm: mount Tainted: G W 3.14.0-rc2-00069-ge57bce1 #60 [ 72.466079] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 72.466505] task: ffff88007ce6c8a0 ti: ffff88006b7f0000 task.ti: ffff88006b7f0000 [ 72.466505] RIP: 0010:[] [] __bitmap_weight+0x2a/0x7f [ 72.466505] RSP: 0018:ffff88006b7f1c00 EFLAGS: 00010206 [ 72.466505] RAX: 0000000000000000 RBX: 000000000000050a RCX: 0000000000000040 [ 72.466505] RDX: 0000000000000000 RSI: 0000000000080000 RDI: 0000000000000000 [ 72.466505] RBP: ffff88006b7f1c28 R08: 0000000000000002 R09: 0000000000000000 [ 72.466505] R10: 000000000000babe R11: 0000000000000400 R12: 0000000000080000 [ 72.466505] R13: 0000000000000200 R14: 0000000000002000 R15: ffff88006ab55000 [ 72.466505] FS: 00007f43ba1fa840(0000) GS:ffff88007f800000(0000) knlGS:0000000000000000 [ 72.466505] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 72.466505] CR2: ffff88006ab56000 CR3: 000000006b7e6000 CR4: 00000000000006e0 [ 72.466505] Stack: [ 72.466505] ffff88006ab65000 0000000000000000 0000000000000000 0000000000010000 [ 72.466505] ffff88006ab6f400 ffff88006b7f1c58 ffffffff81396bb8 0000000000010000 [ 72.466505] 0000000000000000 ffff88007b869a90 ffff88006a48a000 ffff88006b7f1c70 [ 72.466505] Call Trace: [ 72.466505] [] memweight+0x5f/0x8a [ 72.466505] [] ext4_count_free+0x13/0x21 [ 72.466505] [] ext4_count_free_clusters+0xdb/0x171 [ 72.466505] [] ext4_fill_super+0x117c/0x28ef [ 72.466505] [] ? vsnprintf+0x1c7/0x3f7 [ 72.466505] [] mount_bdev+0x145/0x19c [ 72.466505] [] ? ext4_calculate_overhead+0x2a1/0x2a1 [ 72.466505] [] ext4_mount+0x15/0x17 [ 72.466505] [] mount_fs+0x67/0x150 [ 72.466505] [] vfs_kern_mount+0x64/0xde [ 72.466505] [] do_mount+0x6fe/0x7f5 [ 72.466505] [] ? strndup_user+0x3a/0xd9 [ 72.466505] [] SyS_mount+0x85/0xbe [ 72.466505] [] tracesys+0xdd/0xe2 [ 72.466505] Code: c3 89 f0 b9 40 00 00 00 55 99 48 89 e5 41 57 f7 f9 41 56 49 89 ff 41 55 45 31 ed 41 54 41 89 f4 53 31 db 41 89 c6 45 39 ee 7e 10 <4b> 8b 3c ef 49 ff c5 e8 bf ff ff ff 01 c3 eb eb 31 c0 45 85 f6 [ 72.466505] RIP [] __bitmap_weight+0x2a/0x7f [ 72.466505] RSP [ 72.466505] CR2: ffff88006ab56000 [ 72.466505] ---[ end trace 7d051a08ae138573 ]--- Killed Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 6ea7b1436bbc..5c56785007e0 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -667,7 +667,7 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) continue; x = ext4_count_free(bitmap_bh->b_data, - EXT4_BLOCKS_PER_GROUP(sb) / 8); + EXT4_CLUSTERS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; From 50e02fd84543d82e663000e780e0ec0cfde52283 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 14 Apr 2014 23:37:35 -0400 Subject: [PATCH 21/31] ext4: remove temporary shim used to merge COLLAPSE_RANGE and ZERO_RANGE In retrospect, this was a bad way to handle things, since it limited testing of these patches. We should just get the VFS level changes merged in first. Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 010ea89eeb0e..6a1a0245474f 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -16,15 +16,6 @@ struct mpage_da_data; struct ext4_map_blocks; struct extent_status; -/* shim until we merge in the xfs_collapse_range branch */ -#ifndef FALLOC_FL_COLLAPSE_RANGE -#define FALLOC_FL_COLLAPSE_RANGE 0x08 -#endif - -#ifndef FALLOC_FL_ZERO_RANGE -#define FALLOC_FL_ZERO_RANGE 0x10 -#endif - #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) #define show_mballoc_flags(flags) __print_flags(flags, "|", \ From 694c793fc1ade0946149c5f8d43f71e0728c4e81 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:21:15 -0400 Subject: [PATCH 22/31] ext4: use truncate_pagecache() in collapse range We should be using truncate_pagecache() instead of truncate_pagecache_range() in the collapse range because we're truncating page cache from offset to the end of file. truncate_pagecache() also get rid of the private COWed pages from the range because we're going to shift the end of the file. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 64b400356cad..3de9b2d7028c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5437,7 +5437,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache_range(inode, offset, -1); + truncate_pagecache(inode, offset); /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); From 1a66c7c3bea52ba0f7596b8940d74fce75281d16 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:41:52 -0400 Subject: [PATCH 23/31] ext4: use filemap_write_and_wait_range() correctly in collapse range Currently we're passing -1 as lend argumnet for filemap_write_and_wait_range() which is wrong since lend is signed type so it would cause some confusion and we might not write_and_wait for the entire range we're expecting to write. Fix it by using LLONG_MAX instead. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3de9b2d7028c..f4a676908b0b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5415,7 +5415,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) } /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset, -1); + ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) return ret; From 2c1d23289bc2f7cfa358bc856b87a992dcb11ad5 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:43:21 -0400 Subject: [PATCH 24/31] ext4: fix removing status extents in ext4_collapse_range() Currently in ext4_collapse_range() when calling ext4_es_remove_extent() to remove status extents we're passing (EXT_MAX_BLOCKS - punch_start - 1) in order to remove all extents from start of the collapse range to the end of the file. However this is wrong because we might miss the possible extent covering the last block of the file. Fix it by removing the -1. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Reviewed-by: Namjae Jeon --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index f4a676908b0b..c6f624582d37 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5454,7 +5454,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_discard_preallocations(inode); ret = ext4_es_remove_extent(inode, punch_start, - EXT_MAX_BLOCKS - punch_start - 1); + EXT_MAX_BLOCKS - punch_start); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; From 9337d5d31ab798f0c74150506371551a9195251a Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:48:25 -0400 Subject: [PATCH 25/31] ext4: no need to truncate pagecache twice in collapse range We're already calling truncate_pagecache() before we attempt to do any actual job so there is not need to truncate pagecache once more using truncate_setsize() after we're finished. Remove truncate_setsize() and replace it just with i_size_write() note that we're holding appropriate locks. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c6f624582d37..3ee60e2e2ac7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5474,7 +5474,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) } new_size = i_size_read(inode) - len; - truncate_setsize(inode, new_size); + i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; ext4_discard_preallocations(inode); From ef24f6c234de9a03aed9368163dbaad9a4f6391f Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:50:23 -0400 Subject: [PATCH 26/31] ext4: discard preallocations after removing space Currently in ext4_collapse_range() and ext4_punch_hole() we're discarding preallocation twice. Once before we attempt to do any changes and second time after we're done with the changes. While the second call to ext4_discard_preallocations() in ext4_punch_hole() case is not needed, we need to discard preallocation right after ext4_ext_remove_space() in collapse range case because in the case we had to restart a transaction in the middle of removing space we might have new preallocations created. Remove unneeded ext4_discard_preallocations() ext4_punch_hole() and move it to the better place in ext4_collapse_range() Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- fs/ext4/inode.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3ee60e2e2ac7..eb7be8f08e10 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5465,6 +5465,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; } + ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, punch_stop, punch_stop - punch_start); @@ -5477,7 +5478,6 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) i_size_write(inode, new_size); EXT4_I(inode)->i_disksize = new_size; - ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b74cfd2a42ec..d7b7462a0e13 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3621,7 +3621,6 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ret = ext4_free_hole_blocks(handle, inode, first_block, stop_block); - ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); From 6dd834effc12ba71092d9d1e4944530234b58ab1 Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 18 Apr 2014 10:55:24 -0400 Subject: [PATCH 27/31] ext4: fix extent merging in ext4_ext_shift_path_extents() There is a bug in ext4_ext_shift_path_extents() where if we actually manage to merge a extent we would skip shifting the next extent. This will result in in one extent in the extent tree not being properly shifted. This is causing failure in various xfstests tests using fsx or fsstress with collapse range support. It will also cause file system corruption which looks something like: e2fsck 1.42.9 (4-Feb-2014) Pass 1: Checking inodes, blocks, and sizes Inode 20 has out of order extents (invalid logical block 3, physical block 492938, len 2) Clear? yes ... when running e2fsck. It's also very easily reproducible just by running fsx without any parameters. I can usually hit the problem within a minute. Fix it by increasing ex_start only if we're not merging the extent. Signed-off-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" Reviewed-by: Namjae Jeon --- fs/ext4/extents.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index eb7be8f08e10..d0860f2d36d0 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5245,13 +5245,14 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, while (ex_start <= ex_last) { le32_add_cpu(&ex_start->ee_block, -shift); - if (ex_start > - EXT_FIRST_EXTENT(path[depth].p_hdr)) { - if (ext4_ext_try_to_merge_right(inode, - path, ex_start - 1)) - ex_last--; - } - ex_start++; + /* Try to merge to the left. */ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; } err = ext4_ext_dirty(handle, inode, path + depth); if (err) From 6c5e73d3a26b73bfcac0b4a932cb918177d067f2 Mon Sep 17 00:00:00 2001 From: jon ernst Date: Fri, 18 Apr 2014 11:50:35 -0400 Subject: [PATCH 28/31] ext4: enforce we are operating on a regular file in ext4_zero_range() Signed-off-by: Jon Ernst Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d0860f2d36d0..2f49b12a4c40 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -4741,6 +4741,9 @@ static long ext4_zero_range(struct file *file, loff_t offset, trace_ext4_zero_range(inode, offset, len, mode); + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + /* * Write out all dirty pages to avoid race conditions * Then release them. From 86f1ca3889142d5959362c5694db3f3dc26f377a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 18 Apr 2014 11:52:11 -0400 Subject: [PATCH 29/31] ext4: use EINVAL if not a regular file in ext4_collapse_range() Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2f49b12a4c40..9b9251adb400 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5404,7 +5404,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) return -EINVAL; if (!S_ISREG(inode->i_mode)) - return -EOPNOTSUPP; + return -EINVAL; trace_ext4_collapse_range(inode, offset, len); From a8680e0d5efd46aa54d7085e5b4a268f726922c7 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 19 Apr 2014 16:37:31 -0400 Subject: [PATCH 30/31] ext4: fix COLLAPSE_RANGE failure with 1KB block size When formatting with 1KB or 2KB(not aligned with PAGE SIZE) block size, xfstests generic/075 and 091 are failing. The offset supplied to function truncate_pagecache_range is block size aligned. In this function start offset is re-aligned to PAGE_SIZE by rounding_up to the next page boundary. Due to this rounding up, old data remains in the page cache when blocksize is less than page size and start offset is not aligned with page size. In case of collapse range, we need to align start offset to page size boundary by doing a round down operation instead of round up. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9b9251adb400..d6bca2a1debe 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5395,7 +5395,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_lblk_t punch_start, punch_stop; handle_t *handle; unsigned int credits; - loff_t new_size; + loff_t new_size, ioffset; int ret; /* Collapse range works only on fs block size aligned offsets. */ @@ -5418,8 +5418,15 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; } + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); if (ret) return ret; @@ -5441,7 +5448,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache(inode, offset); + truncate_pagecache(inode, ioffset); /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); From 0a04b248532b358b27a8da050642da6f5f304b03 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 19 Apr 2014 16:38:21 -0400 Subject: [PATCH 31/31] ext4: disable COLLAPSE_RANGE for bigalloc Once COLLAPSE RANGE is be disable for ext4 with bigalloc feature till finding root-cause of problem. It will be enable with fixing that regression of xfstest(generic 075 and 091) again. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Reviewed-by: Lukas Czerner Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d6bca2a1debe..01b0c208f625 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -5406,6 +5406,9 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!S_ISREG(inode->i_mode)) return -EINVAL; + if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) + return -EOPNOTSUPP; + trace_ext4_collapse_range(inode, offset, len); punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb);