dax: don't abuse get_block mapping for endio callbacks
dax_fault() currently relies on the get_block callback to attach an io completion callback to the mapping buffer head so that it can run unwritten extent conversion after zeroing allocated blocks. Instead of this hack, pass the conversion callback directly into dax_fault() similar to the get_block callback. When the filesystem allocates unwritten extents, it will set the buffer_unwritten() flag, and hence the dax_fault code can call the completion function in the contexts where it is necessary without overloading the mapping buffer head. Note: The changes to ext4 to use this interface are suspect at best. In fact, the way ext4 did this end_io assignment in the first place looks suspect because it only set a completion callback when there wasn't already some other write() call taking place on the same inode. The ext4 end_io code looks rather intricate and fragile with all it's reference counting and passing to different contexts for modification via inode private pointers that aren't protected by locks... Signed-off-by: Dave Chinner <dchinner@redhat.com> Acked-by: Jan Kara <jack@suse.cz> Signed-off-by: Dave Chinner <david@fromorbit.com>
This commit is contained in:
parent
ec56b1f1fd
commit
e842f29039
21
fs/dax.c
21
fs/dax.c
|
@ -309,14 +309,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
|
|||
out:
|
||||
i_mmap_unlock_read(mapping);
|
||||
|
||||
if (bh->b_end_io)
|
||||
bh->b_end_io(bh, 1);
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
||||
get_block_t get_block)
|
||||
get_block_t get_block, dax_iodone_t complete_unwritten)
|
||||
{
|
||||
struct file *file = vma->vm_file;
|
||||
struct address_space *mapping = file->f_mapping;
|
||||
|
@ -417,7 +414,19 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
|||
page_cache_release(page);
|
||||
}
|
||||
|
||||
/*
|
||||
* If we successfully insert the new mapping over an unwritten extent,
|
||||
* we need to ensure we convert the unwritten extent. If there is an
|
||||
* error inserting the mapping, the filesystem needs to leave it as
|
||||
* unwritten to prevent exposure of the stale underlying data to
|
||||
* userspace, but we still need to call the completion function so
|
||||
* the private resources on the mapping buffer can be released. We
|
||||
* indicate what the callback should do via the uptodate variable, same
|
||||
* as for normal BH based IO completions.
|
||||
*/
|
||||
error = dax_insert_mapping(inode, &bh, vma, vmf);
|
||||
if (buffer_unwritten(&bh))
|
||||
complete_unwritten(&bh, !error);
|
||||
|
||||
out:
|
||||
if (error == -ENOMEM)
|
||||
|
@ -445,7 +454,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
|||
* fault handler for DAX files.
|
||||
*/
|
||||
int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
||||
get_block_t get_block)
|
||||
get_block_t get_block, dax_iodone_t complete_unwritten)
|
||||
{
|
||||
int result;
|
||||
struct super_block *sb = file_inode(vma->vm_file)->i_sb;
|
||||
|
@ -454,7 +463,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
|
|||
sb_start_pagefault(sb);
|
||||
file_update_time(vma->vm_file);
|
||||
}
|
||||
result = do_dax_fault(vma, vmf, get_block);
|
||||
result = do_dax_fault(vma, vmf, get_block, complete_unwritten);
|
||||
if (vmf->flags & FAULT_FLAG_WRITE)
|
||||
sb_end_pagefault(sb);
|
||||
|
||||
|
|
|
@ -28,12 +28,12 @@
|
|||
#ifdef CONFIG_FS_DAX
|
||||
static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
{
|
||||
return dax_fault(vma, vmf, ext2_get_block);
|
||||
return dax_fault(vma, vmf, ext2_get_block, NULL);
|
||||
}
|
||||
|
||||
static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
{
|
||||
return dax_mkwrite(vma, vmf, ext2_get_block);
|
||||
return dax_mkwrite(vma, vmf, ext2_get_block, NULL);
|
||||
}
|
||||
|
||||
static const struct vm_operations_struct ext2_dax_vm_ops = {
|
||||
|
|
|
@ -192,15 +192,27 @@ out:
|
|||
}
|
||||
|
||||
#ifdef CONFIG_FS_DAX
|
||||
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
|
||||
{
|
||||
struct inode *inode = bh->b_assoc_map->host;
|
||||
/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
|
||||
loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
|
||||
int err;
|
||||
if (!uptodate)
|
||||
return;
|
||||
WARN_ON(!buffer_unwritten(bh));
|
||||
err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
|
||||
}
|
||||
|
||||
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
{
|
||||
return dax_fault(vma, vmf, ext4_get_block);
|
||||
return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
|
||||
/* Is this the right get_block? */
|
||||
}
|
||||
|
||||
static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
|
||||
{
|
||||
return dax_mkwrite(vma, vmf, ext4_get_block);
|
||||
return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten);
|
||||
}
|
||||
|
||||
static const struct vm_operations_struct ext4_dax_vm_ops = {
|
||||
|
|
|
@ -656,18 +656,6 @@ has_zeroout:
|
|||
return retval;
|
||||
}
|
||||
|
||||
static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
|
||||
{
|
||||
struct inode *inode = bh->b_assoc_map->host;
|
||||
/* XXX: breaks on 32-bit > 16GB. Is that even supported? */
|
||||
loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
|
||||
int err;
|
||||
if (!uptodate)
|
||||
return;
|
||||
WARN_ON(!buffer_unwritten(bh));
|
||||
err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
|
||||
}
|
||||
|
||||
/* Maximum number of blocks we map for direct IO at once. */
|
||||
#define DIO_MAX_BLOCKS 4096
|
||||
|
||||
|
@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
|
|||
|
||||
map_bh(bh, inode->i_sb, map.m_pblk);
|
||||
bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
|
||||
if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
|
||||
if (IS_DAX(inode) && buffer_unwritten(bh)) {
|
||||
/*
|
||||
* dgc: I suspect unwritten conversion on ext4+DAX is
|
||||
* fundamentally broken here when there are concurrent
|
||||
* read/write in progress on this inode.
|
||||
*/
|
||||
WARN_ON_ONCE(io_end);
|
||||
bh->b_assoc_map = inode->i_mapping;
|
||||
bh->b_private = (void *)(unsigned long)iblock;
|
||||
bh->b_end_io = ext4_end_io_unwritten;
|
||||
}
|
||||
if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
|
||||
set_buffer_defer_completion(bh);
|
||||
|
|
|
@ -70,6 +70,7 @@ typedef int (get_block_t)(struct inode *inode, sector_t iblock,
|
|||
struct buffer_head *bh_result, int create);
|
||||
typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
|
||||
ssize_t bytes, void *private);
|
||||
typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
|
||||
|
||||
#define MAY_EXEC 0x00000001
|
||||
#define MAY_WRITE 0x00000002
|
||||
|
@ -2627,9 +2628,10 @@ ssize_t dax_do_io(struct kiocb *, struct inode *, struct iov_iter *, loff_t,
|
|||
int dax_clear_blocks(struct inode *, sector_t block, long size);
|
||||
int dax_zero_page_range(struct inode *, loff_t from, unsigned len, get_block_t);
|
||||
int dax_truncate_page(struct inode *, loff_t from, get_block_t);
|
||||
int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t);
|
||||
int dax_fault(struct vm_area_struct *, struct vm_fault *, get_block_t,
|
||||
dax_iodone_t);
|
||||
int dax_pfn_mkwrite(struct vm_area_struct *, struct vm_fault *);
|
||||
#define dax_mkwrite(vma, vmf, gb) dax_fault(vma, vmf, gb)
|
||||
#define dax_mkwrite(vma, vmf, gb, iod) dax_fault(vma, vmf, gb, iod)
|
||||
|
||||
#ifdef CONFIG_BLOCK
|
||||
typedef void (dio_submit_t)(int rw, struct bio *bio, struct inode *inode,
|
||||
|
|
Loading…
Reference in New Issue