ext4: Convert to use mapping->invalidate_lock
Convert ext4 to use mapping->invalidate_lock instead of its private
EXT4_I(inode)->i_mmap_sem. This is mostly search-and-replace. By this
conversion we fix a long standing race between hole punching and read(2)
/ readahead(2) paths that can lead to stale page cache contents.

CC: <linux-ext4@vger.kernel.org>
CC: Ted Tso <tytso@mit.edu>
Acked-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Jan Kara <jack@suse.cz>
This commit is contained in:
parent
7506ae6a70
commit
d4f5258eae
|
@ -1086,15 +1086,6 @@ struct ext4_inode_info {
|
|||
* by other means, so we have i_data_sem.
|
||||
*/
|
||||
struct rw_semaphore i_data_sem;
|
||||
/*
|
||||
* i_mmap_sem is for serializing page faults with truncate / punch hole
|
||||
* operations. We have to make sure that new page cannot be faulted in
|
||||
* a section of the inode that is being punched. We cannot easily use
|
||||
* i_data_sem for this since we need protection for the whole punch
|
||||
* operation and i_data_sem ranks below transaction start so we have
|
||||
* to occasionally drop it.
|
||||
*/
|
||||
struct rw_semaphore i_mmap_sem;
|
||||
struct inode vfs_inode;
|
||||
struct jbd2_inode *jinode;
|
||||
|
||||
|
@ -2972,7 +2963,6 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
|
|||
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
|
||||
loff_t lstart, loff_t lend);
|
||||
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
|
||||
extern vm_fault_t ext4_filemap_fault(struct vm_fault *vmf);
|
||||
extern qsize_t *ext4_get_reserved_space(struct inode *inode);
|
||||
extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
|
||||
extern void ext4_da_release_space(struct inode *inode, int to_free);
|
||||
|
|
|
@ -4474,6 +4474,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
|
|||
loff_t len, int mode)
|
||||
{
|
||||
struct inode *inode = file_inode(file);
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
handle_t *handle = NULL;
|
||||
unsigned int max_blocks;
|
||||
loff_t new_size = 0;
|
||||
|
@ -4560,17 +4561,17 @@ static long ext4_zero_range(struct file *file, loff_t offset,
|
|||
* Prevent page faults from reinstantiating pages we have
|
||||
* released from page cache.
|
||||
*/
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_lock(mapping);
|
||||
|
||||
ret = ext4_break_layouts(inode);
|
||||
if (ret) {
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock(mapping);
|
||||
goto out_mutex;
|
||||
}
|
||||
|
||||
ret = ext4_update_disksize_before_punch(inode, offset, len);
|
||||
if (ret) {
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock(mapping);
|
||||
goto out_mutex;
|
||||
}
|
||||
/* Now release the pages and zero block aligned part of pages */
|
||||
|
@ -4579,7 +4580,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
|
|||
|
||||
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
|
||||
flags);
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock(mapping);
|
||||
if (ret)
|
||||
goto out_mutex;
|
||||
}
|
||||
|
@ -5221,6 +5222,7 @@ out:
|
|||
static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
|
||||
{
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct address_space *mapping = inode->i_mapping;
|
||||
ext4_lblk_t punch_start, punch_stop;
|
||||
handle_t *handle;
|
||||
unsigned int credits;
|
||||
|
@ -5274,7 +5276,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
|
|||
* Prevent page faults from reinstantiating pages we have released from
|
||||
* page cache.
|
||||
*/
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_lock(mapping);
|
||||
|
||||
ret = ext4_break_layouts(inode);
|
||||
if (ret)
|
||||
|
@ -5289,15 +5291,15 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
|
|||
* Write tail of the last page before removed range since it will get
|
||||
* removed from the page cache below.
|
||||
*/
|
||||
ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
|
||||
ret = filemap_write_and_wait_range(mapping, ioffset, offset);
|
||||
if (ret)
|
||||
goto out_mmap;
|
||||
/*
|
||||
* Write data that will be shifted to preserve them when discarding
|
||||
* page cache below. We are also protected from pages becoming dirty
|
||||
* by i_mmap_sem.
|
||||
* by i_rwsem and invalidate_lock.
|
||||
*/
|
||||
ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
|
||||
ret = filemap_write_and_wait_range(mapping, offset + len,
|
||||
LLONG_MAX);
|
||||
if (ret)
|
||||
goto out_mmap;
|
||||
|
@ -5350,7 +5352,7 @@ out_stop:
|
|||
ext4_journal_stop(handle);
|
||||
ext4_fc_stop_ineligible(sb);
|
||||
out_mmap:
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock(mapping);
|
||||
out_mutex:
|
||||
inode_unlock(inode);
|
||||
return ret;
|
||||
|
@ -5367,6 +5369,7 @@ out_mutex:
|
|||
static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
|
||||
{
|
||||
struct super_block *sb = inode->i_sb;
|
||||
struct address_space *mapping = inode->i_mapping;
|
||||
handle_t *handle;
|
||||
struct ext4_ext_path *path;
|
||||
struct ext4_extent *extent;
|
||||
|
@ -5425,7 +5428,7 @@ static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
|
|||
* Prevent page faults from reinstantiating pages we have released from
|
||||
* page cache.
|
||||
*/
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_lock(mapping);
|
||||
|
||||
ret = ext4_break_layouts(inode);
|
||||
if (ret)
|
||||
|
@ -5526,7 +5529,7 @@ out_stop:
|
|||
ext4_journal_stop(handle);
|
||||
ext4_fc_stop_ineligible(sb);
|
||||
out_mmap:
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock(mapping);
|
||||
out_mutex:
|
||||
inode_unlock(inode);
|
||||
return ret;
|
||||
|
|
|
@ -704,22 +704,23 @@ static vm_fault_t ext4_dax_huge_fault(struct vm_fault *vmf,
|
|||
*/
|
||||
bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
|
||||
(vmf->vma->vm_flags & VM_SHARED);
|
||||
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
|
||||
pfn_t pfn;
|
||||
|
||||
if (write) {
|
||||
sb_start_pagefault(sb);
|
||||
file_update_time(vmf->vma->vm_file);
|
||||
down_read(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_lock_shared(mapping);
|
||||
retry:
|
||||
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
|
||||
EXT4_DATA_TRANS_BLOCKS(sb));
|
||||
if (IS_ERR(handle)) {
|
||||
up_read(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock_shared(mapping);
|
||||
sb_end_pagefault(sb);
|
||||
return VM_FAULT_SIGBUS;
|
||||
}
|
||||
} else {
|
||||
down_read(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_lock_shared(mapping);
|
||||
}
|
||||
result = dax_iomap_fault(vmf, pe_size, &pfn, &error, &ext4_iomap_ops);
|
||||
if (write) {
|
||||
|
@ -731,10 +732,10 @@ retry:
|
|||
/* Handling synchronous page fault? */
|
||||
if (result & VM_FAULT_NEEDDSYNC)
|
||||
result = dax_finish_sync_fault(vmf, pe_size, pfn);
|
||||
up_read(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock_shared(mapping);
|
||||
sb_end_pagefault(sb);
|
||||
} else {
|
||||
up_read(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock_shared(mapping);
|
||||
}
|
||||
|
||||
return result;
|
||||
|
@ -756,7 +757,7 @@ static const struct vm_operations_struct ext4_dax_vm_ops = {
|
|||
#endif
|
||||
|
||||
static const struct vm_operations_struct ext4_file_vm_ops = {
|
||||
.fault = ext4_filemap_fault,
|
||||
.fault = filemap_fault,
|
||||
.map_pages = filemap_map_pages,
|
||||
.page_mkwrite = ext4_page_mkwrite,
|
||||
};
|
||||
|
|
|
@ -3950,20 +3950,19 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
|
|||
return ret;
|
||||
}
|
||||
|
||||
static void ext4_wait_dax_page(struct ext4_inode_info *ei)
|
||||
static void ext4_wait_dax_page(struct inode *inode)
|
||||
{
|
||||
up_write(&ei->i_mmap_sem);
|
||||
filemap_invalidate_unlock(inode->i_mapping);
|
||||
schedule();
|
||||
down_write(&ei->i_mmap_sem);
|
||||
filemap_invalidate_lock(inode->i_mapping);
|
||||
}
|
||||
|
||||
int ext4_break_layouts(struct inode *inode)
|
||||
{
|
||||
struct ext4_inode_info *ei = EXT4_I(inode);
|
||||
struct page *page;
|
||||
int error;
|
||||
|
||||
if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
|
||||
if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock)))
|
||||
return -EINVAL;
|
||||
|
||||
do {
|
||||
|
@ -3974,7 +3973,7 @@ int ext4_break_layouts(struct inode *inode)
|
|||
error = ___wait_var_event(&page->_refcount,
|
||||
atomic_read(&page->_refcount) == 1,
|
||||
TASK_INTERRUPTIBLE, 0, 0,
|
||||
ext4_wait_dax_page(ei));
|
||||
ext4_wait_dax_page(inode));
|
||||
} while (error == 0);
|
||||
|
||||
return error;
|
||||
|
@ -4005,9 +4004,9 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
|
|||
|
||||
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
|
||||
if (ext4_has_inline_data(inode)) {
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_lock(mapping);
|
||||
ret = ext4_convert_inline_data(inode);
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock(mapping);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
@ -4058,7 +4057,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
|
|||
* Prevent page faults from reinstantiating pages we have released from
|
||||
* page cache.
|
||||
*/
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_lock(mapping);
|
||||
|
||||
ret = ext4_break_layouts(inode);
|
||||
if (ret)
|
||||
|
@ -4131,7 +4130,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
|
|||
out_stop:
|
||||
ext4_journal_stop(handle);
|
||||
out_dio:
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock(mapping);
|
||||
out_mutex:
|
||||
inode_unlock(inode);
|
||||
return ret;
|
||||
|
@ -5426,11 +5425,11 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
|
|||
inode_dio_wait(inode);
|
||||
}
|
||||
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_lock(inode->i_mapping);
|
||||
|
||||
rc = ext4_break_layouts(inode);
|
||||
if (rc) {
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock(inode->i_mapping);
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
|
@ -5506,7 +5505,7 @@ int ext4_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
|
|||
error = rc;
|
||||
}
|
||||
out_mmap_sem:
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock(inode->i_mapping);
|
||||
}
|
||||
|
||||
if (!error) {
|
||||
|
@ -5983,10 +5982,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
|
|||
* data (and journalled aops don't know how to handle these cases).
|
||||
*/
|
||||
if (val) {
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_lock(inode->i_mapping);
|
||||
err = filemap_write_and_wait(inode->i_mapping);
|
||||
if (err < 0) {
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock(inode->i_mapping);
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
@ -6019,7 +6018,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
|
|||
percpu_up_write(&sbi->s_writepages_rwsem);
|
||||
|
||||
if (val)
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock(inode->i_mapping);
|
||||
|
||||
/* Finally we can mark the inode as dirty. */
|
||||
|
||||
|
@ -6063,7 +6062,7 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
|
|||
sb_start_pagefault(inode->i_sb);
|
||||
file_update_time(vma->vm_file);
|
||||
|
||||
down_read(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_lock_shared(mapping);
|
||||
|
||||
err = ext4_convert_inline_data(inode);
|
||||
if (err)
|
||||
|
@ -6176,7 +6175,7 @@ retry_alloc:
|
|||
out_ret:
|
||||
ret = block_page_mkwrite_return(err);
|
||||
out:
|
||||
up_read(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock_shared(mapping);
|
||||
sb_end_pagefault(inode->i_sb);
|
||||
return ret;
|
||||
out_error:
|
||||
|
@ -6184,15 +6183,3 @@ out_error:
|
|||
ext4_journal_stop(handle);
|
||||
goto out;
|
||||
}
|
||||
|
||||
vm_fault_t ext4_filemap_fault(struct vm_fault *vmf)
|
||||
{
|
||||
struct inode *inode = file_inode(vmf->vma->vm_file);
|
||||
vm_fault_t ret;
|
||||
|
||||
down_read(&EXT4_I(inode)->i_mmap_sem);
|
||||
ret = filemap_fault(vmf);
|
||||
up_read(&EXT4_I(inode)->i_mmap_sem);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
|
|
@ -148,7 +148,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
|
|||
goto journal_err_out;
|
||||
}
|
||||
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_lock(inode->i_mapping);
|
||||
err = filemap_write_and_wait(inode->i_mapping);
|
||||
if (err)
|
||||
goto err_out;
|
||||
|
@ -256,7 +256,7 @@ err_out1:
|
|||
ext4_double_up_write_data_sem(inode, inode_bl);
|
||||
|
||||
err_out:
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock(inode->i_mapping);
|
||||
journal_err_out:
|
||||
unlock_two_nondirectories(inode, inode_bl);
|
||||
iput(inode_bl);
|
||||
|
|
|
@ -90,12 +90,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
|
|||
/*
|
||||
* Lock ordering
|
||||
*
|
||||
* Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
|
||||
* i_mmap_rwsem (inode->i_mmap_rwsem)!
|
||||
*
|
||||
* page fault path:
|
||||
* mmap_lock -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
|
||||
* page lock -> i_data_sem (rw)
|
||||
* mmap_lock -> sb_start_pagefault -> invalidate_lock (r) -> transaction start
|
||||
* -> page lock -> i_data_sem (rw)
|
||||
*
|
||||
* buffered write path:
|
||||
* sb_start_write -> i_mutex -> mmap_lock
|
||||
|
@ -103,8 +100,9 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
|
|||
* i_data_sem (rw)
|
||||
*
|
||||
* truncate:
|
||||
* sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock
|
||||
* sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start ->
|
||||
* sb_start_write -> i_mutex -> invalidate_lock (w) -> i_mmap_rwsem (w) ->
|
||||
* page lock
|
||||
* sb_start_write -> i_mutex -> invalidate_lock (w) -> transaction start ->
|
||||
* i_data_sem (rw)
|
||||
*
|
||||
* direct IO:
|
||||
|
@ -1360,7 +1358,6 @@ static void init_once(void *foo)
|
|||
INIT_LIST_HEAD(&ei->i_orphan);
|
||||
init_rwsem(&ei->xattr_sem);
|
||||
init_rwsem(&ei->i_data_sem);
|
||||
init_rwsem(&ei->i_mmap_sem);
|
||||
inode_init_once(&ei->vfs_inode);
|
||||
ext4_fc_init_inode(&ei->vfs_inode);
|
||||
}
|
||||
|
|
|
@ -11,14 +11,16 @@
|
|||
*/
|
||||
static inline void ext4_truncate_failed_write(struct inode *inode)
|
||||
{
|
||||
struct address_space *mapping = inode->i_mapping;
|
||||
|
||||
/*
|
||||
* We don't need to call ext4_break_layouts() because the blocks we
|
||||
* are truncating were never visible to userspace.
|
||||
*/
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
truncate_inode_pages(inode->i_mapping, inode->i_size);
|
||||
filemap_invalidate_lock(mapping);
|
||||
truncate_inode_pages(mapping, inode->i_size);
|
||||
ext4_truncate(inode);
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
filemap_invalidate_unlock(mapping);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
Loading…
Reference in New Issue