From 3ca57bd620639eef8bae4adde2d58fc72109e7c8 Mon Sep 17 00:00:00 2001 From: Misono Tomohiro Date: Mon, 4 Jun 2018 16:41:07 +0900 Subject: [PATCH 1/4] btrfs: Check error of btrfs_iget in btrfs_search_path_in_tree_user The patch introducing the ioctl was not the latest version at the time of merging to the mainline and needs a fixup from this patch. Fixes: ba637a252d30 ("btrfs: Check error of btrfs_iget() in btrfs_search_path_in_tree_user") Signed-off-by: Misono Tomohiro Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d29992f7dc63..5556e9ea2a4b 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2438,6 +2438,10 @@ static int btrfs_search_path_in_tree_user(struct inode *inode, } temp_inode = btrfs_iget(sb, &key2, root, NULL); + if (IS_ERR(temp_inode)) { + ret = PTR_ERR(temp_inode); + goto out; + } ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC); iput(temp_inode); if (ret) { From 9d311e11fc1f5581d5ec2df0f87ea5a0193c41ad Mon Sep 17 00:00:00 2001 From: Robbie Ko Date: Mon, 7 May 2018 16:42:04 +0800 Subject: [PATCH 2/4] Btrfs: fiemap: pass correct bytenr when fm_extent_count is zero [BUG] fm_mapped_extents is not correct when fm_extent_count is 0 Like: # mount /dev/vdb5 /mnt/btrfs # dd if=/dev/zero bs=16K count=4 oflag=dsync of=/mnt/btrfs/file # xfs_io -c "fiemap -v" /mnt/btrfs/file /mnt/btrfs/file: EXT: FILE-OFFSET BLOCK-RANGE TOTAL FLAGS 0: [0..127]: 25088..25215 128 0x1 When user space wants to get the number of file extents, set fm_extent_count to 0 to run fiemap and then read fm_mapped_extents. In the above example, fiemap will return with fm_mapped_extents set to 4, but it should be 1 since there's only one entry in the output. [REASON] The problem seems to be that disko is only set if fieinfo->fi_extents_max is set. And this member is initialized, in the generic ioctl_fiemap function, to the value of used-passed fm_extent_count. So when the user passes 0 then fi_extent_max is also set to zero and this causes btrfs to not initialize disko at all. Eventually this leads emit_fiemap_extent being called with a bogus 'phys' argument preventing proper fiemap entries merging. [FIX] Move the disko initialization earlier in extent_fiemap making it independent of user-passed arguments, allowing emit_fiemap_extent to properly handle consecutive extent entries. Signed-off-by: Robbie Ko Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index af2f0408c6e4..8e4a7cdbc9f5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4545,7 +4545,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, offset_in_extent = em_start - em->start; em_end = extent_map_end(em); em_len = em_end - em_start; - disko = 0; + disko = em->block_start + offset_in_extent; flags = 0; /* @@ -4568,8 +4568,6 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 bytenr = em->block_start - (em->start - em->orig_start); - disko = em->block_start + offset_in_extent; - /* * As btrfs supports shared space, this information * can be exported to userspace tools via From a528a24150870c5c16cbbbec69dba7e992b08456 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Wed, 6 Jun 2018 19:54:44 +0530 Subject: [PATCH 3/4] btrfs: change return type of btrfs_page_mkwrite to vm_fault_t Use the new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. Reference commit 1c8f422059ae ("mm: change return type to vm_fault_t") vmf_error() is the newly introduced inline function in 4.17-rc6. Signed-off-by: Souptick Joarder Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 26 ++++++++++++-------------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f4bf7874c24a..118346aceea9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3197,7 +3197,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, size_t size, struct bio *bio, unsigned long bio_flags); void btrfs_set_range_writeback(void *private_data, u64 start, u64 end); -int btrfs_page_mkwrite(struct vm_fault *vmf); +vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 89b208201783..c12b7a6e534a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8872,7 +8872,7 @@ again: * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. */ -int btrfs_page_mkwrite(struct vm_fault *vmf) +vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); @@ -8884,7 +8884,8 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) char *kaddr; unsigned long zero_start; loff_t size; - int ret; + vm_fault_t ret; + int ret2; int reserved = 0; u64 reserved_space; u64 page_start; @@ -8906,17 +8907,14 @@ int btrfs_page_mkwrite(struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, + ret2 = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, reserved_space); - if (!ret) { - ret = file_update_time(vmf->vma->vm_file); + if (!ret2) { + ret2 = file_update_time(vmf->vma->vm_file); reserved = 1; } - if (ret) { - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else /* -ENOSPC, -EIO, etc */ - ret = VM_FAULT_SIGBUS; + if (ret2) { + ret = vmf_error(ret2); if (reserved) goto out; goto out_noreserve; @@ -8975,15 +8973,15 @@ again: EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, &cached_state); - ret = btrfs_set_extent_delalloc(inode, page_start, end, 0, + ret2 = btrfs_set_extent_delalloc(inode, page_start, end, 0, &cached_state, 0); - if (ret) { + if (ret2) { unlock_extent_cached(io_tree, page_start, page_end, &cached_state); ret = VM_FAULT_SIGBUS; goto out_unlock; } - ret = 0; + ret2 = 0; /* page is wholly or partially inside EOF */ if (page_start + PAGE_SIZE > size) @@ -9008,7 +9006,7 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state); out_unlock: - if (!ret) { + if (!ret2) { btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); From ac0b4145d662a3b9e34085dea460fb06ede9b69b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 5 Jun 2018 12:36:56 +0800 Subject: [PATCH 4/4] btrfs: scrub: Don't use inode pages for device replace [BUG] Btrfs can create compressed extent without checksum (even though it shouldn't), and if we then try to replace device containing such extent, the result device will contain all the uncompressed data instead of the compressed one. Test case already submitted to fstests: https://patchwork.kernel.org/patch/10442353/ [CAUSE] When handling compressed extent without checksum, device replace will goe into copy_nocow_pages() function. In that function, btrfs will get all inodes referring to this data extents and then use find_or_create_page() to get pages direct from that inode. The problem here is, pages directly from inode are always uncompressed. And for compressed data extent, they mismatch with on-disk data. Thus this leads to corrupted compressed data extent written to replace device. [FIX] In this attempt, we could just remove the "optimization" branch, and let unified scrub_pages() to handle it. Although scrub_pages() won't bother reusing page cache, it will be a little slower, but it does the correct csum checking and won't cause such data corruption caused by "optimization". Note about the fix: this is the minimal fix that can be backported to older stable trees without conflicts. The whole callchain from copy_nocow_pages() can be deleted, and will be in followup patches. Fixes: ff023aac3119 ("Btrfs: add code to scrub to copy read data to another disk") CC: stable@vger.kernel.org # 4.4+ Reported-by: James Harvey Reviewed-by: James Harvey Signed-off-by: Qu Wenruo [ remove code removal, add note why ] Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index a59005862010..572306036477 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2799,7 +2799,7 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, have_csum = scrub_find_csum(sctx, logical, csum); if (have_csum == 0) ++sctx->stat.no_csum; - if (sctx->is_dev_replace && !have_csum) { + if (0 && sctx->is_dev_replace && !have_csum) { ret = copy_nocow_pages(sctx, logical, l, mirror_num, physical_for_dev_replace);