OpenCloudOS-Kernel/fs/btrfs/ioctl.c

5610 lines
138 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/fsnotify.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/security.h>
#include <linux/xattr.h>
#include <linux/mm.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/uuid.h>
#include <linux/btrfs.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "volumes.h"
#include "locking.h"
#include "inode-map.h"
#include "backref.h"
#include "rcu-string.h"
#include "send.h"
#include "dev-replace.h"
Btrfs: add support for inode properties This change adds infrastructure to allow for generic properties for inodes. Properties are name/value pairs that can be associated with inodes for different purposes. They are stored as xattrs with the prefix "btrfs." Properties can be inherited - this means when a directory inode has inheritable properties set, these are added to new inodes created under that directory. Further, subvolumes can also have properties associated with them, and they can be inherited from their parent subvolume. Naturally, directory properties have priority over subvolume properties (in practice a subvolume property is just a regular property associated with the root inode, objectid 256, of the subvolume's fs tree). This change also adds one specific property implementation, named "compression", whose values can be "lzo" or "zlib" and it's an inheritable property. The corresponding changes to btrfs-progs were also implemented. A patch with xfstests for this feature will follow once there's agreement on this change/feature. Further, the script at the bottom of this commit message was used to do some benchmarks to measure any performance penalties of this feature. Basically the tests correspond to: Test 1 - create a filesystem and mount it with compress-force=lzo, then sequentially create N files of 64Kb each, measure how long it took to create the files, unmount the filesystem, mount the filesystem and perform an 'ls -lha' against the test directory holding the N files, and report the time the command took. Test 2 - create a filesystem and don't use any compression option when mounting it - instead set the compression property of the subvolume's root to 'lzo'. Then create N files of 64Kb, and report the time it took. The unmount the filesystem, mount it again and perform an 'ls -lha' like in the former test. This means every single file ends up with a property (xattr) associated to it. Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the compression property, have no real effect other than adding more work when inheriting properties and taking more btree leaf space. Test 4 - same as test 3 but with 10 properties per file. Results (in seconds, and averages of 5 runs each), for different N numbers of files follow. * Without properties (test 1) file creation time ls -lha time 10 000 files 3.49 0.76 100 000 files 47.19 8.37 1 000 000 files 518.51 107.06 * With 1 property (compression property set to lzo - test 2) file creation time ls -lha time 10 000 files 3.63 0.93 100 000 files 48.56 9.74 1 000 000 files 537.72 125.11 * With 4 properties (test 3) file creation time ls -lha time 10 000 files 3.94 1.20 100 000 files 52.14 11.48 1 000 000 files 572.70 142.13 * With 10 properties (test 4) file creation time ls -lha time 10 000 files 4.61 1.35 100 000 files 58.86 13.83 1 000 000 files 656.01 177.61 The increased latencies with properties are essencialy because of: *) When creating an inode, we now synchronously write 1 more item (an xattr item) for each property inherited from the parent dir (or subvolume). This could be done in an asynchronous way such as we do for dir intex items (delayed-inode.c), which could help reduce the file creation latency; *) With properties, we now have larger fs trees. For this particular test each xattr item uses 75 bytes of leaf space in the fs tree. This could be less by using a new item for xattr items, instead of the current btrfs_dir_item, since we could cut the 'location' and 'type' fields (saving 18 bytes) and maybe 'transid' too (saving a total of 26 bytes per xattr item) from the btrfs_dir_item type. Also tried batching the xattr insertions (ignoring proper hash collision handling, since it didn't exist) when creating files that inherit properties from their parent inode/subvolume, but the end results were (surprisingly) essentially the same. Test script: $ cat test.pl #!/usr/bin/perl -w use strict; use Time::HiRes qw(time); use constant NUM_FILES => 10_000; use constant FILE_SIZES => (64 * 1024); use constant DEV => '/dev/sdb4'; use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev'; use constant TEST_DIR => (MNT_POINT . '/testdir'); system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!"; # following line for testing without properties #system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!"; # following 2 lines for testing with properties system("mount", DEV, MNT_POINT) == 0 or die "mount failed!"; system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!"; system("mkdir", TEST_DIR) == 0 or die "mkdir failed!"; my ($t1, $t2); $t1 = time(); for (my $i = 1; $i <= NUM_FILES; $i++) { my $p = TEST_DIR . '/file_' . $i; open(my $f, '>', $p) or die "Error opening file!"; $f->autoflush(1); for (my $j = 0; $j < FILE_SIZES; $j += 4096) { print $f ('A' x 4096) or die "Error writing to file!"; } close($f); } $t2 = time(); print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n"; system("umount", DEV) == 0 or die "umount failed!"; system("mount", DEV, MNT_POINT) == 0 or die "mount failed!"; $t1 = time(); system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!"; $t2 = time(); print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n"; system("umount", DEV) == 0 or die "umount failed!"; Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fb.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 19:47:46 +08:00
#include "props.h"
#include "sysfs.h"
Btrfs: rework qgroup accounting Currently qgroups account for space by intercepting delayed ref updates to fs trees. It does this by adding sequence numbers to delayed ref updates so that it can figure out how the tree looked before the update so we can adjust the counters properly. The problem with this is that it does not allow delayed refs to be merged, so if you say are defragging an extent with 5k snapshots pointing to it we will thrash the delayed ref lock because we need to go back and manually merge these things together. Instead we want to process quota changes when we know they are going to happen, like when we first allocate an extent, we free a reference for an extent, we add new references etc. This patch accomplishes this by only adding qgroup operations for real ref changes. We only modify the sequence number when we need to lookup roots for bytenrs, this reduces the amount of churn on the sequence number and allows us to merge delayed refs as we add them most of the time. This patch encompasses a bunch of architectural changes 1) qgroup ref operations: instead of tracking qgroup operations through the delayed refs we simply add new ref operations whenever we notice that we need to when we've modified the refs themselves. 2) tree mod seq: we no longer have this separation of major/minor counters. this makes the sequence number stuff much more sane and we can remove some locking that was needed to protect the counter. 3) delayed ref seq: we now read the tree mod seq number and use that as our sequence. This means each new delayed ref doesn't have it's own unique sequence number, rather whenever we go to lookup backrefs we inc the sequence number so we can make sure to keep any new operations from screwing up our world view at that given point. This allows us to merge delayed refs during runtime. With all of these changes the delayed ref stuff is a little saner and the qgroup accounting stuff no longer goes negative in some cases like it was before. Thanks, Signed-off-by: Josef Bacik <jbacik@fb.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-05-14 08:30:47 +08:00
#include "qgroup.h"
Btrfs: fix unreplayable log after snapshot delete + parent dir fsync If we delete a snapshot, fsync its parent directory and crash/power fail before the next transaction commit, on the next mount when we attempt to replay the log tree of the root containing the parent directory we will fail and prevent the filesystem from mounting, which is solvable by wiping out the log trees with the btrfs-zero-log tool but very inconvenient as we will lose any data and metadata fsynced before the parent directory was fsynced. For example: $ mkfs.btrfs -f /dev/sdc $ mount /dev/sdc /mnt $ mkdir /mnt/testdir $ btrfs subvolume snapshot /mnt /mnt/testdir/snap $ btrfs subvolume delete /mnt/testdir/snap $ xfs_io -c "fsync" /mnt/testdir < crash / power failure and reboot > $ mount /dev/sdc /mnt mount: mount(2) failed: No such file or directory And in dmesg/syslog we get the following message and trace: [192066.361162] BTRFS info (device dm-0): failed to delete reference to snap, inode 257 parent 257 [192066.363010] ------------[ cut here ]------------ [192066.365268] WARNING: CPU: 4 PID: 5130 at fs/btrfs/inode.c:3986 __btrfs_unlink_inode+0x17a/0x354 [btrfs]() [192066.367250] BTRFS: Transaction aborted (error -2) [192066.368401] Modules linked in: btrfs dm_flakey dm_mod ppdev sha256_generic xor raid6_pq hmac drbg ansi_cprng aesni_intel acpi_cpufreq tpm_tis aes_x86_64 tpm ablk_helper evdev cryptd sg parport_pc i2c_piix4 psmouse lrw parport i2c_core pcspkr gf128mul processor serio_raw glue_helper button loop autofs4 ext4 crc16 mbcache jbd2 sd_mod sr_mod cdrom ata_generic virtio_scsi ata_piix libata virtio_pci virtio_ring crc32c_intel scsi_mod e1000 virtio floppy [last unloaded: btrfs] [192066.377154] CPU: 4 PID: 5130 Comm: mount Tainted: G W 4.4.0-rc6-btrfs-next-20+ #1 [192066.378875] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS by qemu-project.org 04/01/2014 [192066.380889] 0000000000000000 ffff880143923670 ffffffff81257570 ffff8801439236b8 [192066.382561] ffff8801439236a8 ffffffff8104ec07 ffffffffa039dc2c 00000000fffffffe [192066.384191] ffff8801ed31d000 ffff8801b9fc9c88 ffff8801086875e0 ffff880143923710 [192066.385827] Call Trace: [192066.386373] [<ffffffff81257570>] dump_stack+0x4e/0x79 [192066.387387] [<ffffffff8104ec07>] warn_slowpath_common+0x99/0xb2 [192066.388429] [<ffffffffa039dc2c>] ? __btrfs_unlink_inode+0x17a/0x354 [btrfs] [192066.389236] [<ffffffff8104ec68>] warn_slowpath_fmt+0x48/0x50 [192066.389884] [<ffffffffa039dc2c>] __btrfs_unlink_inode+0x17a/0x354 [btrfs] [192066.390621] [<ffffffff81184b55>] ? iput+0xb0/0x266 [192066.391200] [<ffffffffa039ea25>] btrfs_unlink_inode+0x1c/0x3d [btrfs] [192066.391930] [<ffffffffa03ca623>] check_item_in_log+0x1fe/0x29b [btrfs] [192066.392715] [<ffffffffa03ca827>] replay_dir_deletes+0x167/0x1cf [btrfs] [192066.393510] [<ffffffffa03cccc7>] replay_one_buffer+0x417/0x570 [btrfs] [192066.394241] [<ffffffffa03ca164>] walk_up_log_tree+0x10e/0x1dc [btrfs] [192066.394958] [<ffffffffa03cac72>] walk_log_tree+0xa5/0x190 [btrfs] [192066.395628] [<ffffffffa03ce8b8>] btrfs_recover_log_trees+0x239/0x32c [btrfs] [192066.396790] [<ffffffffa03cc8b0>] ? replay_one_extent+0x50a/0x50a [btrfs] [192066.397891] [<ffffffffa0394041>] open_ctree+0x1d8b/0x2167 [btrfs] [192066.398897] [<ffffffffa03706e1>] btrfs_mount+0x5ef/0x729 [btrfs] [192066.399823] [<ffffffff8108ad98>] ? trace_hardirqs_on+0xd/0xf [192066.400739] [<ffffffff8108959b>] ? lockdep_init_map+0xb9/0x1b3 [192066.401700] [<ffffffff811714b9>] mount_fs+0x67/0x131 [192066.402482] [<ffffffff81188560>] vfs_kern_mount+0x6c/0xde [192066.403930] [<ffffffffa03702bd>] btrfs_mount+0x1cb/0x729 [btrfs] [192066.404831] [<ffffffff8108ad98>] ? trace_hardirqs_on+0xd/0xf [192066.405726] [<ffffffff8108959b>] ? lockdep_init_map+0xb9/0x1b3 [192066.406621] [<ffffffff811714b9>] mount_fs+0x67/0x131 [192066.407401] [<ffffffff81188560>] vfs_kern_mount+0x6c/0xde [192066.408247] [<ffffffff8118ae36>] do_mount+0x893/0x9d2 [192066.409047] [<ffffffff8113009b>] ? strndup_user+0x3f/0x8c [192066.409842] [<ffffffff8118b187>] SyS_mount+0x75/0xa1 [192066.410621] [<ffffffff8147e517>] entry_SYSCALL_64_fastpath+0x12/0x6b [192066.411572] ---[ end trace 2de42126c1e0a0f0 ]--- [192066.412344] BTRFS: error (device dm-0) in __btrfs_unlink_inode:3986: errno=-2 No such entry [192066.413748] BTRFS: error (device dm-0) in btrfs_replay_log:2464: errno=-2 No such entry (Failed to recover log tree) [192066.415458] BTRFS error (device dm-0): cleaner transaction attach returned -30 [192066.444613] BTRFS: open_ctree failed This happens because when we are replaying the log and processing the directory entry pointing to the snapshot in the subvolume tree, we treat its btrfs_dir_item item as having a location with a key type matching BTRFS_INODE_ITEM_KEY, which is wrong because the type matches BTRFS_ROOT_ITEM_KEY and therefore must be processed differently, as the object id refers to a root number and not to an inode in the root containing the parent directory. So fix this by triggering a transaction commit if an fsync against the parent directory is requested after deleting a snapshot. This is the simplest approach for a rare use case. Some alternative that avoids the transaction commit would require more code to explicitly delete the snapshot at log replay time (factoring out common code from ioctl.c: btrfs_ioctl_snap_destroy()), special care at fsync time to remove the log tree of the snapshot's root from the log root of the root of tree roots, amongst other steps. A test case for xfstests that triggers the issue follows. seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { _cleanup_flakey cd / rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter . ./common/dmflakey # real QA test starts here _need_to_be_root _supported_fs btrfs _supported_os Linux _require_scratch _require_dm_target flakey _require_metadata_journaling $SCRATCH_DEV rm -f $seqres.full _scratch_mkfs >>$seqres.full 2>&1 _init_flakey _mount_flakey # Create a snapshot at the root of our filesystem (mount point path), delete it, # fsync the mount point path, crash and mount to replay the log. This should # succeed and after the filesystem is mounted the snapshot should not be visible # anymore. _run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT $SCRATCH_MNT/snap1 _run_btrfs_util_prog subvolume delete $SCRATCH_MNT/snap1 $XFS_IO_PROG -c "fsync" $SCRATCH_MNT _flakey_drop_and_remount [ -e $SCRATCH_MNT/snap1 ] && \ echo "Snapshot snap1 still exists after log replay" # Similar scenario as above, but this time the snapshot is created inside a # directory and not directly under the root (mount point path). mkdir $SCRATCH_MNT/testdir _run_btrfs_util_prog subvolume snapshot $SCRATCH_MNT $SCRATCH_MNT/testdir/snap2 _run_btrfs_util_prog subvolume delete $SCRATCH_MNT/testdir/snap2 $XFS_IO_PROG -c "fsync" $SCRATCH_MNT/testdir _flakey_drop_and_remount [ -e $SCRATCH_MNT/testdir/snap2 ] && \ echo "Snapshot snap2 still exists after log replay" _unmount_flakey echo "Silence is golden" status=0 exit Signed-off-by: Filipe Manana <fdmanana@suse.com> Tested-by: Liu Bo <bo.li.liu@oracle.com> Reviewed-by: Liu Bo <bo.li.liu@oracle.com> Signed-off-by: Chris Mason <clm@fb.com>
2016-02-10 18:42:25 +08:00
#include "tree-log.h"
#include "compression.h"
#include "space-info.h"
#include "delalloc-space.h"
#include "block-group.h"
#ifdef CONFIG_64BIT
/* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
* structures are incorrect, as the timespec structure from userspace
* is 4 bytes too small. We define these alternatives here to teach
* the kernel about the 32-bit struct packing.
*/
struct btrfs_ioctl_timespec_32 {
__u64 sec;
__u32 nsec;
} __attribute__ ((__packed__));
struct btrfs_ioctl_received_subvol_args_32 {
char uuid[BTRFS_UUID_SIZE]; /* in */
__u64 stransid; /* in */
__u64 rtransid; /* out */
struct btrfs_ioctl_timespec_32 stime; /* in */
struct btrfs_ioctl_timespec_32 rtime; /* out */
__u64 flags; /* in */
__u64 reserved[16]; /* in */
} __attribute__ ((__packed__));
#define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
struct btrfs_ioctl_received_subvol_args_32)
#endif
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
struct btrfs_ioctl_send_args_32 {
__s64 send_fd; /* in */
__u64 clone_sources_count; /* in */
compat_uptr_t clone_sources; /* in */
__u64 parent_root; /* in */
__u64 flags; /* in */
__u64 reserved[4]; /* in */
} __attribute__ ((__packed__));
#define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
struct btrfs_ioctl_send_args_32)
#endif
static int btrfs_clone(struct inode *src, struct inode *inode,
u64 off, u64 olen, u64 olen_aligned, u64 destoff,
int no_time_update);
/* Mask out flags that are inappropriate for the given type of inode. */
static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
unsigned int flags)
{
if (S_ISDIR(inode->i_mode))
return flags;
else if (S_ISREG(inode->i_mode))
return flags & ~FS_DIRSYNC_FL;
else
return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
}
/*
* Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
* ioctl.
*/
static unsigned int btrfs_inode_flags_to_fsflags(unsigned int flags)
{
unsigned int iflags = 0;
if (flags & BTRFS_INODE_SYNC)
iflags |= FS_SYNC_FL;
if (flags & BTRFS_INODE_IMMUTABLE)
iflags |= FS_IMMUTABLE_FL;
if (flags & BTRFS_INODE_APPEND)
iflags |= FS_APPEND_FL;
if (flags & BTRFS_INODE_NODUMP)
iflags |= FS_NODUMP_FL;
if (flags & BTRFS_INODE_NOATIME)
iflags |= FS_NOATIME_FL;
if (flags & BTRFS_INODE_DIRSYNC)
iflags |= FS_DIRSYNC_FL;
if (flags & BTRFS_INODE_NODATACOW)
iflags |= FS_NOCOW_FL;
if (flags & BTRFS_INODE_NOCOMPRESS)
iflags |= FS_NOCOMP_FL;
else if (flags & BTRFS_INODE_COMPRESS)
iflags |= FS_COMPR_FL;
return iflags;
}
/*
* Update inode->i_flags based on the btrfs internal flags.
*/
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
{
struct btrfs_inode *binode = BTRFS_I(inode);
unsigned int new_fl = 0;
if (binode->flags & BTRFS_INODE_SYNC)
new_fl |= S_SYNC;
if (binode->flags & BTRFS_INODE_IMMUTABLE)
new_fl |= S_IMMUTABLE;
if (binode->flags & BTRFS_INODE_APPEND)
new_fl |= S_APPEND;
if (binode->flags & BTRFS_INODE_NOATIME)
new_fl |= S_NOATIME;
if (binode->flags & BTRFS_INODE_DIRSYNC)
new_fl |= S_DIRSYNC;
set_mask_bits(&inode->i_flags,
S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC,
new_fl);
}
static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
{
struct btrfs_inode *binode = BTRFS_I(file_inode(file));
unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags);
if (copy_to_user(arg, &flags, sizeof(flags)))
return -EFAULT;
return 0;
}
/* Check if @flags are a supported and valid set of FS_*_FL flags */
static int check_fsflags(unsigned int flags)
{
if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
FS_NOATIME_FL | FS_NODUMP_FL | \
FS_SYNC_FL | FS_DIRSYNC_FL | \
FS_NOCOMP_FL | FS_COMPR_FL |
FS_NOCOW_FL))
return -EOPNOTSUPP;
if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
return -EINVAL;
return 0;
}
static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_inode *binode = BTRFS_I(inode);
struct btrfs_root *root = binode->root;
struct btrfs_trans_handle *trans;
unsigned int fsflags, old_fsflags;
int ret;
const char *comp = NULL;
u32 binode_flags = binode->flags;
if (!inode_owner_or_capable(inode))
return -EPERM;
if (btrfs_root_readonly(root))
return -EROFS;
if (copy_from_user(&fsflags, arg, sizeof(fsflags)))
return -EFAULT;
ret = check_fsflags(fsflags);
if (ret)
return ret;
ret = mnt_want_write_file(file);
if (ret)
return ret;
inode_lock(inode);
fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
if (ret)
goto out_unlock;
if (fsflags & FS_SYNC_FL)
binode_flags |= BTRFS_INODE_SYNC;
else
binode_flags &= ~BTRFS_INODE_SYNC;
if (fsflags & FS_IMMUTABLE_FL)
binode_flags |= BTRFS_INODE_IMMUTABLE;
else
binode_flags &= ~BTRFS_INODE_IMMUTABLE;
if (fsflags & FS_APPEND_FL)
binode_flags |= BTRFS_INODE_APPEND;
else
binode_flags &= ~BTRFS_INODE_APPEND;
if (fsflags & FS_NODUMP_FL)
binode_flags |= BTRFS_INODE_NODUMP;
else
binode_flags &= ~BTRFS_INODE_NODUMP;
if (fsflags & FS_NOATIME_FL)
binode_flags |= BTRFS_INODE_NOATIME;
else
binode_flags &= ~BTRFS_INODE_NOATIME;
if (fsflags & FS_DIRSYNC_FL)
binode_flags |= BTRFS_INODE_DIRSYNC;
else
binode_flags &= ~BTRFS_INODE_DIRSYNC;
if (fsflags & FS_NOCOW_FL) {
if (S_ISREG(inode->i_mode)) {
btrfs: allow setting NOCOW for a zero sized file via ioctl Hi, the patch si simple, but it has user visible impact and I'm not quite sure how to resolve it. In short, $subj says it, chattr -C supports it and we want to use it. The conditions that acutally allow to change the NOCOW flag are clear. What if I try to set the flag on a file that is not empty? Options: 1) whole ioctl will fail, EINVAL 2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file will stay COW-ed and checksummed 2.2) ioctl will succeed, flag will not be removed and a syslog message will warn that the COW flag has not been changed 2.2.1) dtto, no syslog message Man page of chattr states that "If it is set on a file which already has data blocks, it is undefined when the blocks assigned to the file will be fully stable." Yes, it's undefined and with current implementation it'll never happen. So from this end, the user cannot expect anything. I'm trying to find a reasonable behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of flags in a deep directory does not fail unnecessarily and does not pollute the log. My personal preference is 2.2.1, but my dev's oppinion is skewed, not counting the fact that I know the code and otherwise would look there before consulting the documentation. The patch implements 2.2.1. david -------------8<------------------- From: David Sterba <dsterba@suse.cz> It's safe to turn off checksums for a zero sized file. http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030 "We cannot switch on NODATASUM for a file that already has extents that are checksummed. The invariant here is that either all the extents or none are checksummed. Theoretically it's possible to add/remove all checksums from a given file, but it's a potentially longtime operation, the file has to be in some intermediate state where the checksums partially exist but have to be ignored (for the csum->nocsum) until the file is fully converted, this brings more special cases to extent handling, it has to survive power failure and remain consistent, and probably needs to be restarted after next mount." Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 19:56:55 +08:00
/*
* It's safe to turn csums off here, no extents exist.
* Otherwise we want the flag to reflect the real COW
* status of the file and will not set it.
*/
if (inode->i_size == 0)
binode_flags |= BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM;
btrfs: allow setting NOCOW for a zero sized file via ioctl Hi, the patch si simple, but it has user visible impact and I'm not quite sure how to resolve it. In short, $subj says it, chattr -C supports it and we want to use it. The conditions that acutally allow to change the NOCOW flag are clear. What if I try to set the flag on a file that is not empty? Options: 1) whole ioctl will fail, EINVAL 2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file will stay COW-ed and checksummed 2.2) ioctl will succeed, flag will not be removed and a syslog message will warn that the COW flag has not been changed 2.2.1) dtto, no syslog message Man page of chattr states that "If it is set on a file which already has data blocks, it is undefined when the blocks assigned to the file will be fully stable." Yes, it's undefined and with current implementation it'll never happen. So from this end, the user cannot expect anything. I'm trying to find a reasonable behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of flags in a deep directory does not fail unnecessarily and does not pollute the log. My personal preference is 2.2.1, but my dev's oppinion is skewed, not counting the fact that I know the code and otherwise would look there before consulting the documentation. The patch implements 2.2.1. david -------------8<------------------- From: David Sterba <dsterba@suse.cz> It's safe to turn off checksums for a zero sized file. http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030 "We cannot switch on NODATASUM for a file that already has extents that are checksummed. The invariant here is that either all the extents or none are checksummed. Theoretically it's possible to add/remove all checksums from a given file, but it's a potentially longtime operation, the file has to be in some intermediate state where the checksums partially exist but have to be ignored (for the csum->nocsum) until the file is fully converted, this brings more special cases to extent handling, it has to survive power failure and remain consistent, and probably needs to be restarted after next mount." Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 19:56:55 +08:00
} else {
binode_flags |= BTRFS_INODE_NODATACOW;
btrfs: allow setting NOCOW for a zero sized file via ioctl Hi, the patch si simple, but it has user visible impact and I'm not quite sure how to resolve it. In short, $subj says it, chattr -C supports it and we want to use it. The conditions that acutally allow to change the NOCOW flag are clear. What if I try to set the flag on a file that is not empty? Options: 1) whole ioctl will fail, EINVAL 2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file will stay COW-ed and checksummed 2.2) ioctl will succeed, flag will not be removed and a syslog message will warn that the COW flag has not been changed 2.2.1) dtto, no syslog message Man page of chattr states that "If it is set on a file which already has data blocks, it is undefined when the blocks assigned to the file will be fully stable." Yes, it's undefined and with current implementation it'll never happen. So from this end, the user cannot expect anything. I'm trying to find a reasonable behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of flags in a deep directory does not fail unnecessarily and does not pollute the log. My personal preference is 2.2.1, but my dev's oppinion is skewed, not counting the fact that I know the code and otherwise would look there before consulting the documentation. The patch implements 2.2.1. david -------------8<------------------- From: David Sterba <dsterba@suse.cz> It's safe to turn off checksums for a zero sized file. http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030 "We cannot switch on NODATASUM for a file that already has extents that are checksummed. The invariant here is that either all the extents or none are checksummed. Theoretically it's possible to add/remove all checksums from a given file, but it's a potentially longtime operation, the file has to be in some intermediate state where the checksums partially exist but have to be ignored (for the csum->nocsum) until the file is fully converted, this brings more special cases to extent handling, it has to survive power failure and remain consistent, and probably needs to be restarted after next mount." Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 19:56:55 +08:00
}
} else {
/*
* Revert back under same assumptions as above
btrfs: allow setting NOCOW for a zero sized file via ioctl Hi, the patch si simple, but it has user visible impact and I'm not quite sure how to resolve it. In short, $subj says it, chattr -C supports it and we want to use it. The conditions that acutally allow to change the NOCOW flag are clear. What if I try to set the flag on a file that is not empty? Options: 1) whole ioctl will fail, EINVAL 2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file will stay COW-ed and checksummed 2.2) ioctl will succeed, flag will not be removed and a syslog message will warn that the COW flag has not been changed 2.2.1) dtto, no syslog message Man page of chattr states that "If it is set on a file which already has data blocks, it is undefined when the blocks assigned to the file will be fully stable." Yes, it's undefined and with current implementation it'll never happen. So from this end, the user cannot expect anything. I'm trying to find a reasonable behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of flags in a deep directory does not fail unnecessarily and does not pollute the log. My personal preference is 2.2.1, but my dev's oppinion is skewed, not counting the fact that I know the code and otherwise would look there before consulting the documentation. The patch implements 2.2.1. david -------------8<------------------- From: David Sterba <dsterba@suse.cz> It's safe to turn off checksums for a zero sized file. http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030 "We cannot switch on NODATASUM for a file that already has extents that are checksummed. The invariant here is that either all the extents or none are checksummed. Theoretically it's possible to add/remove all checksums from a given file, but it's a potentially longtime operation, the file has to be in some intermediate state where the checksums partially exist but have to be ignored (for the csum->nocsum) until the file is fully converted, this brings more special cases to extent handling, it has to survive power failure and remain consistent, and probably needs to be restarted after next mount." Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 19:56:55 +08:00
*/
if (S_ISREG(inode->i_mode)) {
btrfs: allow setting NOCOW for a zero sized file via ioctl Hi, the patch si simple, but it has user visible impact and I'm not quite sure how to resolve it. In short, $subj says it, chattr -C supports it and we want to use it. The conditions that acutally allow to change the NOCOW flag are clear. What if I try to set the flag on a file that is not empty? Options: 1) whole ioctl will fail, EINVAL 2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file will stay COW-ed and checksummed 2.2) ioctl will succeed, flag will not be removed and a syslog message will warn that the COW flag has not been changed 2.2.1) dtto, no syslog message Man page of chattr states that "If it is set on a file which already has data blocks, it is undefined when the blocks assigned to the file will be fully stable." Yes, it's undefined and with current implementation it'll never happen. So from this end, the user cannot expect anything. I'm trying to find a reasonable behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of flags in a deep directory does not fail unnecessarily and does not pollute the log. My personal preference is 2.2.1, but my dev's oppinion is skewed, not counting the fact that I know the code and otherwise would look there before consulting the documentation. The patch implements 2.2.1. david -------------8<------------------- From: David Sterba <dsterba@suse.cz> It's safe to turn off checksums for a zero sized file. http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030 "We cannot switch on NODATASUM for a file that already has extents that are checksummed. The invariant here is that either all the extents or none are checksummed. Theoretically it's possible to add/remove all checksums from a given file, but it's a potentially longtime operation, the file has to be in some intermediate state where the checksums partially exist but have to be ignored (for the csum->nocsum) until the file is fully converted, this brings more special cases to extent handling, it has to survive power failure and remain consistent, and probably needs to be restarted after next mount." Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 19:56:55 +08:00
if (inode->i_size == 0)
binode_flags &= ~(BTRFS_INODE_NODATACOW |
BTRFS_INODE_NODATASUM);
btrfs: allow setting NOCOW for a zero sized file via ioctl Hi, the patch si simple, but it has user visible impact and I'm not quite sure how to resolve it. In short, $subj says it, chattr -C supports it and we want to use it. The conditions that acutally allow to change the NOCOW flag are clear. What if I try to set the flag on a file that is not empty? Options: 1) whole ioctl will fail, EINVAL 2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file will stay COW-ed and checksummed 2.2) ioctl will succeed, flag will not be removed and a syslog message will warn that the COW flag has not been changed 2.2.1) dtto, no syslog message Man page of chattr states that "If it is set on a file which already has data blocks, it is undefined when the blocks assigned to the file will be fully stable." Yes, it's undefined and with current implementation it'll never happen. So from this end, the user cannot expect anything. I'm trying to find a reasonable behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of flags in a deep directory does not fail unnecessarily and does not pollute the log. My personal preference is 2.2.1, but my dev's oppinion is skewed, not counting the fact that I know the code and otherwise would look there before consulting the documentation. The patch implements 2.2.1. david -------------8<------------------- From: David Sterba <dsterba@suse.cz> It's safe to turn off checksums for a zero sized file. http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030 "We cannot switch on NODATASUM for a file that already has extents that are checksummed. The invariant here is that either all the extents or none are checksummed. Theoretically it's possible to add/remove all checksums from a given file, but it's a potentially longtime operation, the file has to be in some intermediate state where the checksums partially exist but have to be ignored (for the csum->nocsum) until the file is fully converted, this brings more special cases to extent handling, it has to survive power failure and remain consistent, and probably needs to be restarted after next mount." Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 19:56:55 +08:00
} else {
binode_flags &= ~BTRFS_INODE_NODATACOW;
btrfs: allow setting NOCOW for a zero sized file via ioctl Hi, the patch si simple, but it has user visible impact and I'm not quite sure how to resolve it. In short, $subj says it, chattr -C supports it and we want to use it. The conditions that acutally allow to change the NOCOW flag are clear. What if I try to set the flag on a file that is not empty? Options: 1) whole ioctl will fail, EINVAL 2.1) ioctl will succeed, the NOCOW flag will be silently removed, but the file will stay COW-ed and checksummed 2.2) ioctl will succeed, flag will not be removed and a syslog message will warn that the COW flag has not been changed 2.2.1) dtto, no syslog message Man page of chattr states that "If it is set on a file which already has data blocks, it is undefined when the blocks assigned to the file will be fully stable." Yes, it's undefined and with current implementation it'll never happen. So from this end, the user cannot expect anything. I'm trying to find a reasonable behaviour, so that a command like 'chattr -R -aijS +C' to tweak a broad set of flags in a deep directory does not fail unnecessarily and does not pollute the log. My personal preference is 2.2.1, but my dev's oppinion is skewed, not counting the fact that I know the code and otherwise would look there before consulting the documentation. The patch implements 2.2.1. david -------------8<------------------- From: David Sterba <dsterba@suse.cz> It's safe to turn off checksums for a zero sized file. http://thread.gmane.org/gmane.comp.file-systems.btrfs/18030 "We cannot switch on NODATASUM for a file that already has extents that are checksummed. The invariant here is that either all the extents or none are checksummed. Theoretically it's possible to add/remove all checksums from a given file, but it's a potentially longtime operation, the file has to be in some intermediate state where the checksums partially exist but have to be ignored (for the csum->nocsum) until the file is fully converted, this brings more special cases to extent handling, it has to survive power failure and remain consistent, and probably needs to be restarted after next mount." Signed-off-by: David Sterba <dsterba@suse.cz>
2012-09-07 19:56:55 +08:00
}
}
/*
* The COMPRESS flag can only be changed by users, while the NOCOMPRESS
* flag may be changed automatically if compression code won't make
* things smaller.
*/
if (fsflags & FS_NOCOMP_FL) {
binode_flags &= ~BTRFS_INODE_COMPRESS;
binode_flags |= BTRFS_INODE_NOCOMPRESS;
} else if (fsflags & FS_COMPR_FL) {
Btrfs: add support for inode properties This change adds infrastructure to allow for generic properties for inodes. Properties are name/value pairs that can be associated with inodes for different purposes. They are stored as xattrs with the prefix "btrfs." Properties can be inherited - this means when a directory inode has inheritable properties set, these are added to new inodes created under that directory. Further, subvolumes can also have properties associated with them, and they can be inherited from their parent subvolume. Naturally, directory properties have priority over subvolume properties (in practice a subvolume property is just a regular property associated with the root inode, objectid 256, of the subvolume's fs tree). This change also adds one specific property implementation, named "compression", whose values can be "lzo" or "zlib" and it's an inheritable property. The corresponding changes to btrfs-progs were also implemented. A patch with xfstests for this feature will follow once there's agreement on this change/feature. Further, the script at the bottom of this commit message was used to do some benchmarks to measure any performance penalties of this feature. Basically the tests correspond to: Test 1 - create a filesystem and mount it with compress-force=lzo, then sequentially create N files of 64Kb each, measure how long it took to create the files, unmount the filesystem, mount the filesystem and perform an 'ls -lha' against the test directory holding the N files, and report the time the command took. Test 2 - create a filesystem and don't use any compression option when mounting it - instead set the compression property of the subvolume's root to 'lzo'. Then create N files of 64Kb, and report the time it took. The unmount the filesystem, mount it again and perform an 'ls -lha' like in the former test. This means every single file ends up with a property (xattr) associated to it. Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the compression property, have no real effect other than adding more work when inheriting properties and taking more btree leaf space. Test 4 - same as test 3 but with 10 properties per file. Results (in seconds, and averages of 5 runs each), for different N numbers of files follow. * Without properties (test 1) file creation time ls -lha time 10 000 files 3.49 0.76 100 000 files 47.19 8.37 1 000 000 files 518.51 107.06 * With 1 property (compression property set to lzo - test 2) file creation time ls -lha time 10 000 files 3.63 0.93 100 000 files 48.56 9.74 1 000 000 files 537.72 125.11 * With 4 properties (test 3) file creation time ls -lha time 10 000 files 3.94 1.20 100 000 files 52.14 11.48 1 000 000 files 572.70 142.13 * With 10 properties (test 4) file creation time ls -lha time 10 000 files 4.61 1.35 100 000 files 58.86 13.83 1 000 000 files 656.01 177.61 The increased latencies with properties are essencialy because of: *) When creating an inode, we now synchronously write 1 more item (an xattr item) for each property inherited from the parent dir (or subvolume). This could be done in an asynchronous way such as we do for dir intex items (delayed-inode.c), which could help reduce the file creation latency; *) With properties, we now have larger fs trees. For this particular test each xattr item uses 75 bytes of leaf space in the fs tree. This could be less by using a new item for xattr items, instead of the current btrfs_dir_item, since we could cut the 'location' and 'type' fields (saving 18 bytes) and maybe 'transid' too (saving a total of 26 bytes per xattr item) from the btrfs_dir_item type. Also tried batching the xattr insertions (ignoring proper hash collision handling, since it didn't exist) when creating files that inherit properties from their parent inode/subvolume, but the end results were (surprisingly) essentially the same. Test script: $ cat test.pl #!/usr/bin/perl -w use strict; use Time::HiRes qw(time); use constant NUM_FILES => 10_000; use constant FILE_SIZES => (64 * 1024); use constant DEV => '/dev/sdb4'; use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev'; use constant TEST_DIR => (MNT_POINT . '/testdir'); system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!"; # following line for testing without properties #system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!"; # following 2 lines for testing with properties system("mount", DEV, MNT_POINT) == 0 or die "mount failed!"; system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!"; system("mkdir", TEST_DIR) == 0 or die "mkdir failed!"; my ($t1, $t2); $t1 = time(); for (my $i = 1; $i <= NUM_FILES; $i++) { my $p = TEST_DIR . '/file_' . $i; open(my $f, '>', $p) or die "Error opening file!"; $f->autoflush(1); for (my $j = 0; $j < FILE_SIZES; $j += 4096) { print $f ('A' x 4096) or die "Error writing to file!"; } close($f); } $t2 = time(); print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n"; system("umount", DEV) == 0 or die "umount failed!"; system("mount", DEV, MNT_POINT) == 0 or die "mount failed!"; $t1 = time(); system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!"; $t2 = time(); print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n"; system("umount", DEV) == 0 or die "umount failed!"; Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fb.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 19:47:46 +08:00
if (IS_SWAPFILE(inode)) {
ret = -ETXTBSY;
goto out_unlock;
}
binode_flags |= BTRFS_INODE_COMPRESS;
binode_flags &= ~BTRFS_INODE_NOCOMPRESS;
Btrfs: add support for inode properties This change adds infrastructure to allow for generic properties for inodes. Properties are name/value pairs that can be associated with inodes for different purposes. They are stored as xattrs with the prefix "btrfs." Properties can be inherited - this means when a directory inode has inheritable properties set, these are added to new inodes created under that directory. Further, subvolumes can also have properties associated with them, and they can be inherited from their parent subvolume. Naturally, directory properties have priority over subvolume properties (in practice a subvolume property is just a regular property associated with the root inode, objectid 256, of the subvolume's fs tree). This change also adds one specific property implementation, named "compression", whose values can be "lzo" or "zlib" and it's an inheritable property. The corresponding changes to btrfs-progs were also implemented. A patch with xfstests for this feature will follow once there's agreement on this change/feature. Further, the script at the bottom of this commit message was used to do some benchmarks to measure any performance penalties of this feature. Basically the tests correspond to: Test 1 - create a filesystem and mount it with compress-force=lzo, then sequentially create N files of 64Kb each, measure how long it took to create the files, unmount the filesystem, mount the filesystem and perform an 'ls -lha' against the test directory holding the N files, and report the time the command took. Test 2 - create a filesystem and don't use any compression option when mounting it - instead set the compression property of the subvolume's root to 'lzo'. Then create N files of 64Kb, and report the time it took. The unmount the filesystem, mount it again and perform an 'ls -lha' like in the former test. This means every single file ends up with a property (xattr) associated to it. Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the compression property, have no real effect other than adding more work when inheriting properties and taking more btree leaf space. Test 4 - same as test 3 but with 10 properties per file. Results (in seconds, and averages of 5 runs each), for different N numbers of files follow. * Without properties (test 1) file creation time ls -lha time 10 000 files 3.49 0.76 100 000 files 47.19 8.37 1 000 000 files 518.51 107.06 * With 1 property (compression property set to lzo - test 2) file creation time ls -lha time 10 000 files 3.63 0.93 100 000 files 48.56 9.74 1 000 000 files 537.72 125.11 * With 4 properties (test 3) file creation time ls -lha time 10 000 files 3.94 1.20 100 000 files 52.14 11.48 1 000 000 files 572.70 142.13 * With 10 properties (test 4) file creation time ls -lha time 10 000 files 4.61 1.35 100 000 files 58.86 13.83 1 000 000 files 656.01 177.61 The increased latencies with properties are essencialy because of: *) When creating an inode, we now synchronously write 1 more item (an xattr item) for each property inherited from the parent dir (or subvolume). This could be done in an asynchronous way such as we do for dir intex items (delayed-inode.c), which could help reduce the file creation latency; *) With properties, we now have larger fs trees. For this particular test each xattr item uses 75 bytes of leaf space in the fs tree. This could be less by using a new item for xattr items, instead of the current btrfs_dir_item, since we could cut the 'location' and 'type' fields (saving 18 bytes) and maybe 'transid' too (saving a total of 26 bytes per xattr item) from the btrfs_dir_item type. Also tried batching the xattr insertions (ignoring proper hash collision handling, since it didn't exist) when creating files that inherit properties from their parent inode/subvolume, but the end results were (surprisingly) essentially the same. Test script: $ cat test.pl #!/usr/bin/perl -w use strict; use Time::HiRes qw(time); use constant NUM_FILES => 10_000; use constant FILE_SIZES => (64 * 1024); use constant DEV => '/dev/sdb4'; use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev'; use constant TEST_DIR => (MNT_POINT . '/testdir'); system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!"; # following line for testing without properties #system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!"; # following 2 lines for testing with properties system("mount", DEV, MNT_POINT) == 0 or die "mount failed!"; system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!"; system("mkdir", TEST_DIR) == 0 or die "mkdir failed!"; my ($t1, $t2); $t1 = time(); for (my $i = 1; $i <= NUM_FILES; $i++) { my $p = TEST_DIR . '/file_' . $i; open(my $f, '>', $p) or die "Error opening file!"; $f->autoflush(1); for (my $j = 0; $j < FILE_SIZES; $j += 4096) { print $f ('A' x 4096) or die "Error writing to file!"; } close($f); } $t2 = time(); print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n"; system("umount", DEV) == 0 or die "umount failed!"; system("mount", DEV, MNT_POINT) == 0 or die "mount failed!"; $t1 = time(); system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!"; $t2 = time(); print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n"; system("umount", DEV) == 0 or die "umount failed!"; Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fb.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 19:47:46 +08:00
comp = btrfs_compress_type2str(fs_info->compress_type);
if (!comp || comp[0] == 0)
comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
} else {
binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
}
/*
* 1 for inode item
* 2 for properties
*/
trans = btrfs_start_transaction(root, 3);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_unlock;
}
if (comp) {
ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp,
strlen(comp), 0);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
} else {
ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL,
0, 0);
if (ret && ret != -ENODATA) {
btrfs_abort_transaction(trans, ret);
goto out_end_trans;
}
}
binode->flags = binode_flags;
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, inode);
out_end_trans:
btrfs_end_transaction(trans);
out_unlock:
inode_unlock(inode);
mnt_drop_write_file(file);
return ret;
}
/*
* Translate btrfs internal inode flags to xflags as expected by the
* FS_IOC_FSGETXATT ioctl. Filter only the supported ones, unknown flags are
* silently dropped.
*/
static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags)
{
unsigned int xflags = 0;
if (flags & BTRFS_INODE_APPEND)
xflags |= FS_XFLAG_APPEND;
if (flags & BTRFS_INODE_IMMUTABLE)
xflags |= FS_XFLAG_IMMUTABLE;
if (flags & BTRFS_INODE_NOATIME)
xflags |= FS_XFLAG_NOATIME;
if (flags & BTRFS_INODE_NODUMP)
xflags |= FS_XFLAG_NODUMP;
if (flags & BTRFS_INODE_SYNC)
xflags |= FS_XFLAG_SYNC;
return xflags;
}
/* Check if @flags are a supported and valid set of FS_XFLAGS_* flags */
static int check_xflags(unsigned int flags)
{
if (flags & ~(FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE | FS_XFLAG_NOATIME |
FS_XFLAG_NODUMP | FS_XFLAG_SYNC))
return -EOPNOTSUPP;
return 0;
}
/*
* Set the xflags from the internal inode flags. The remaining items of fsxattr
* are zeroed.
*/
static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg)
{
struct btrfs_inode *binode = BTRFS_I(file_inode(file));
struct fsxattr fa;
simple_fill_fsxattr(&fa, btrfs_inode_flags_to_xflags(binode->flags));
if (copy_to_user(arg, &fa, sizeof(fa)))
return -EFAULT;
return 0;
}
static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_inode *binode = BTRFS_I(inode);
struct btrfs_root *root = binode->root;
struct btrfs_trans_handle *trans;
struct fsxattr fa, old_fa;
unsigned old_flags;
unsigned old_i_flags;
int ret = 0;
if (!inode_owner_or_capable(inode))
return -EPERM;
if (btrfs_root_readonly(root))
return -EROFS;
if (copy_from_user(&fa, arg, sizeof(fa)))
return -EFAULT;
ret = check_xflags(fa.fsx_xflags);
if (ret)
return ret;
if (fa.fsx_extsize != 0 || fa.fsx_projid != 0 || fa.fsx_cowextsize != 0)
return -EOPNOTSUPP;
ret = mnt_want_write_file(file);
if (ret)
return ret;
inode_lock(inode);
old_flags = binode->flags;
old_i_flags = inode->i_flags;
simple_fill_fsxattr(&old_fa,
btrfs_inode_flags_to_xflags(binode->flags));
ret = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
if (ret)
goto out_unlock;
if (fa.fsx_xflags & FS_XFLAG_SYNC)
binode->flags |= BTRFS_INODE_SYNC;
else
binode->flags &= ~BTRFS_INODE_SYNC;
if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE)
binode->flags |= BTRFS_INODE_IMMUTABLE;
else
binode->flags &= ~BTRFS_INODE_IMMUTABLE;
if (fa.fsx_xflags & FS_XFLAG_APPEND)
binode->flags |= BTRFS_INODE_APPEND;
else
binode->flags &= ~BTRFS_INODE_APPEND;
if (fa.fsx_xflags & FS_XFLAG_NODUMP)
binode->flags |= BTRFS_INODE_NODUMP;
else
binode->flags &= ~BTRFS_INODE_NODUMP;
if (fa.fsx_xflags & FS_XFLAG_NOATIME)
binode->flags |= BTRFS_INODE_NOATIME;
else
binode->flags &= ~BTRFS_INODE_NOATIME;
/* 1 item for the inode */
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_unlock;
}
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
inode->i_ctime = current_time(inode);
ret = btrfs_update_inode(trans, root, inode);
btrfs_end_transaction(trans);
out_unlock:
if (ret) {
binode->flags = old_flags;
inode->i_flags = old_i_flags;
}
inode_unlock(inode);
mnt_drop_write_file(file);
return ret;
}
static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
{
struct inode *inode = file_inode(file);
return put_user(inode->i_generation, arg);
}
static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_device *device;
struct request_queue *q;
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
/*
* If the fs is mounted with nologreplay, which requires it to be
* mounted in RO mode as well, we can not allow discard on free space
* inside block groups, because log trees refer to extents that are not
* pinned in a block group's free space cache (pinning the extents is
* precisely the first phase of replaying a log tree).
*/
if (btrfs_test_opt(fs_info, NOLOGREPLAY))
return -EROFS;
rcu_read_lock();
list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
dev_list) {
if (!device->bdev)
continue;
q = bdev_get_queue(device->bdev);
if (blk_queue_discard(q)) {
num_devices++;
minlen = min_t(u64, q->limits.discard_granularity,
minlen);
}
}
rcu_read_unlock();
if (!num_devices)
return -EOPNOTSUPP;
if (copy_from_user(&range, arg, sizeof(range)))
return -EFAULT;
/*
* NOTE: Don't truncate the range using super->total_bytes. Bytenr of
* block group is in the logical address space, which can be any
* sectorsize aligned bytenr in the range [0, U64_MAX].
*/
if (range.len < fs_info->sb->s_blocksize)
return -EINVAL;
range.minlen = max(range.minlen, minlen);
ret = btrfs_trim_fs(fs_info, &range);
if (ret < 0)
return ret;
if (copy_to_user(arg, &range, sizeof(range)))
return -EFAULT;
return 0;
}
int __pure btrfs_is_empty_uuid(u8 *uuid)
{
int i;
for (i = 0; i < BTRFS_UUID_SIZE; i++) {
if (uuid[i])
return 0;
}
return 1;
}
static noinline int create_subvol(struct inode *dir,
struct dentry *dentry,
const char *name, int namelen,
u64 *async_transid,
struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct btrfs_trans_handle *trans;
struct btrfs_key key;
struct btrfs_root_item *root_item;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *new_root;
struct btrfs_block_rsv block_rsv;
vfs: change inode times to use struct timespec64 struct timespec is not y2038 safe. Transition vfs to use y2038 safe struct timespec64 instead. The change was made with the help of the following cocinelle script. This catches about 80% of the changes. All the header file and logic changes are included in the first 5 rules. The rest are trivial substitutions. I avoid changing any of the function signatures or any other filesystem specific data structures to keep the patch simple for review. The script can be a little shorter by combining different cases. But, this version was sufficient for my usecase. virtual patch @ depends on patch @ identifier now; @@ - struct timespec + struct timespec64 current_time ( ... ) { - struct timespec now = current_kernel_time(); + struct timespec64 now = current_kernel_time64(); ... - return timespec_trunc( + return timespec64_trunc( ... ); } @ depends on patch @ identifier xtime; @@ struct \( iattr \| inode \| kstat \) { ... - struct timespec xtime; + struct timespec64 xtime; ... } @ depends on patch @ identifier t; @@ struct inode_operations { ... int (*update_time) (..., - struct timespec t, + struct timespec64 t, ...); ... } @ depends on patch @ identifier t; identifier fn_update_time =~ "update_time$"; @@ fn_update_time (..., - struct timespec *t, + struct timespec64 *t, ...) { ... } @ depends on patch @ identifier t; @@ lease_get_mtime( ... , - struct timespec *t + struct timespec64 *t ) { ... } @te depends on patch forall@ identifier ts; local idexpression struct inode *inode_node; identifier i_xtime =~ "^i_[acm]time$"; identifier ia_xtime =~ "^ia_[acm]time$"; identifier fn_update_time =~ "update_time$"; identifier fn; expression e, E3; local idexpression struct inode *node1; local idexpression struct inode *node2; local idexpression struct iattr *attr1; local idexpression struct iattr *attr2; local idexpression struct iattr attr; identifier i_xtime1 =~ "^i_[acm]time$"; identifier i_xtime2 =~ "^i_[acm]time$"; identifier ia_xtime1 =~ "^ia_[acm]time$"; identifier ia_xtime2 =~ "^ia_[acm]time$"; @@ ( ( - struct timespec ts; + struct timespec64 ts; | - struct timespec ts = current_time(inode_node); + struct timespec64 ts = current_time(inode_node); ) <+... when != ts ( - timespec_equal(&inode_node->i_xtime, &ts) + timespec64_equal(&inode_node->i_xtime, &ts) | - timespec_equal(&ts, &inode_node->i_xtime) + timespec64_equal(&ts, &inode_node->i_xtime) | - timespec_compare(&inode_node->i_xtime, &ts) + timespec64_compare(&inode_node->i_xtime, &ts) | - timespec_compare(&ts, &inode_node->i_xtime) + timespec64_compare(&ts, &inode_node->i_xtime) | ts = current_time(e) | fn_update_time(..., &ts,...) | inode_node->i_xtime = ts | node1->i_xtime = ts | ts = inode_node->i_xtime | <+... attr1->ia_xtime ...+> = ts | ts = attr1->ia_xtime | ts.tv_sec | ts.tv_nsec | btrfs_set_stack_timespec_sec(..., ts.tv_sec) | btrfs_set_stack_timespec_nsec(..., ts.tv_nsec) | - ts = timespec64_to_timespec( + ts = ... -) | - ts = ktime_to_timespec( + ts = ktime_to_timespec64( ...) | - ts = E3 + ts = timespec_to_timespec64(E3) | - ktime_get_real_ts(&ts) + ktime_get_real_ts64(&ts) | fn(..., - ts + timespec64_to_timespec(ts) ,...) ) ...+> ( <... when != ts - return ts; + return timespec64_to_timespec(ts); ...> ) | - timespec_equal(&node1->i_xtime1, &node2->i_xtime2) + timespec64_equal(&node1->i_xtime2, &node2->i_xtime2) | - timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2) + timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2) | - timespec_compare(&node1->i_xtime1, &node2->i_xtime2) + timespec64_compare(&node1->i_xtime1, &node2->i_xtime2) | node1->i_xtime1 = - timespec_trunc(attr1->ia_xtime1, + timespec64_trunc(attr1->ia_xtime1, ...) | - attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2, + attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2, ...) | - ktime_get_real_ts(&attr1->ia_xtime1) + ktime_get_real_ts64(&attr1->ia_xtime1) | - ktime_get_real_ts(&attr.ia_xtime1) + ktime_get_real_ts64(&attr.ia_xtime1) ) @ depends on patch @ struct inode *node; struct iattr *attr; identifier fn; identifier i_xtime =~ "^i_[acm]time$"; identifier ia_xtime =~ "^ia_[acm]time$"; expression e; @@ ( - fn(node->i_xtime); + fn(timespec64_to_timespec(node->i_xtime)); | fn(..., - node->i_xtime); + timespec64_to_timespec(node->i_xtime)); | - e = fn(attr->ia_xtime); + e = fn(timespec64_to_timespec(attr->ia_xtime)); ) @ depends on patch forall @ struct inode *node; struct iattr *attr; identifier i_xtime =~ "^i_[acm]time$"; identifier ia_xtime =~ "^ia_[acm]time$"; identifier fn; @@ { + struct timespec ts; <+... ( + ts = timespec64_to_timespec(node->i_xtime); fn (..., - &node->i_xtime, + &ts, ...); | + ts = timespec64_to_timespec(attr->ia_xtime); fn (..., - &attr->ia_xtime, + &ts, ...); ) ...+> } @ depends on patch forall @ struct inode *node; struct iattr *attr; struct kstat *stat; identifier ia_xtime =~ "^ia_[acm]time$"; identifier i_xtime =~ "^i_[acm]time$"; identifier xtime =~ "^[acm]time$"; identifier fn, ret; @@ { + struct timespec ts; <+... ( + ts = timespec64_to_timespec(node->i_xtime); ret = fn (..., - &node->i_xtime, + &ts, ...); | + ts = timespec64_to_timespec(node->i_xtime); ret = fn (..., - &node->i_xtime); + &ts); | + ts = timespec64_to_timespec(attr->ia_xtime); ret = fn (..., - &attr->ia_xtime, + &ts, ...); | + ts = timespec64_to_timespec(attr->ia_xtime); ret = fn (..., - &attr->ia_xtime); + &ts); | + ts = timespec64_to_timespec(stat->xtime); ret = fn (..., - &stat->xtime); + &ts); ) ...+> } @ depends on patch @ struct inode *node; struct inode *node2; identifier i_xtime1 =~ "^i_[acm]time$"; identifier i_xtime2 =~ "^i_[acm]time$"; identifier i_xtime3 =~ "^i_[acm]time$"; struct iattr *attrp; struct iattr *attrp2; struct iattr attr ; identifier ia_xtime1 =~ "^ia_[acm]time$"; identifier ia_xtime2 =~ "^ia_[acm]time$"; struct kstat *stat; struct kstat stat1; struct timespec64 ts; identifier xtime =~ "^[acmb]time$"; expression e; @@ ( ( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ; | node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \); | node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \); | node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \); | stat->xtime = node2->i_xtime1; | stat1.xtime = node2->i_xtime1; | ( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ; | ( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2; | - e = node->i_xtime1; + e = timespec64_to_timespec( node->i_xtime1 ); | - e = attrp->ia_xtime1; + e = timespec64_to_timespec( attrp->ia_xtime1 ); | node->i_xtime1 = current_time(...); | node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = - e; + timespec_to_timespec64(e); | node->i_xtime1 = node->i_xtime3 = - e; + timespec_to_timespec64(e); | - node->i_xtime1 = e; + node->i_xtime1 = timespec_to_timespec64(e); ) Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com> Cc: <anton@tuxera.com> Cc: <balbi@kernel.org> Cc: <bfields@fieldses.org> Cc: <darrick.wong@oracle.com> Cc: <dhowells@redhat.com> Cc: <dsterba@suse.com> Cc: <dwmw2@infradead.org> Cc: <hch@lst.de> Cc: <hirofumi@mail.parknet.co.jp> Cc: <hubcap@omnibond.com> Cc: <jack@suse.com> Cc: <jaegeuk@kernel.org> Cc: <jaharkes@cs.cmu.edu> Cc: <jslaby@suse.com> Cc: <keescook@chromium.org> Cc: <mark@fasheh.com> Cc: <miklos@szeredi.hu> Cc: <nico@linaro.org> Cc: <reiserfs-devel@vger.kernel.org> Cc: <richard@nod.at> Cc: <sage@redhat.com> Cc: <sfrench@samba.org> Cc: <swhiteho@redhat.com> Cc: <tj@kernel.org> Cc: <trond.myklebust@primarydata.com> Cc: <tytso@mit.edu> Cc: <viro@zeniv.linux.org.uk>
2018-05-09 10:36:02 +08:00
struct timespec64 cur_time = current_time(dir);
struct inode *inode;
int ret;
int err;
u64 objectid;
u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
u64 index = 0;
uuid_le new_uuid;
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
if (!root_item)
return -ENOMEM;
ret = btrfs_find_free_objectid(fs_info->tree_root, &objectid);
if (ret)
goto fail_free;
/*
* Don't create subvolume whose level is not zero. Or qgroup will be
* screwed up since it assumes subvolume qgroup's level to be 0.
*/
if (btrfs_qgroup_level(objectid)) {
ret = -ENOSPC;
goto fail_free;
}
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
Btrfs: proper -ENOSPC handling At the start of a transaction we do a btrfs_reserve_metadata_space() and specify how many items we plan on modifying. Then once we've done our modifications and such, just call btrfs_unreserve_metadata_space() for the same number of items we reserved. For keeping track of metadata needed for data I've had to add an extent_io op for when we merge extents. This lets us track space properly when we are doing sequential writes, so we don't end up reserving way more metadata space than what we need. The only place where the metadata space accounting is not done is in the relocation code. This is because Yan is going to be reworking that code in the near future, so running btrfs-vol -b could still possibly result in a ENOSPC related panic. This patch also turns off the metadata_ratio stuff in order to allow users to more efficiently use their disk space. This patch makes it so we track how much metadata we need for an inode's delayed allocation extents by tracking how many extents are currently waiting for allocation. It introduces two new callbacks for the extent_io tree's, merge_extent_hook and split_extent_hook. These help us keep track of when we merge delalloc extents together and split them up. Reservations are handled prior to any actually dirty'ing occurs, and then we unreserve after we dirty. btrfs_unreserve_metadata_for_delalloc() will make the appropriate unreservations as needed based on the number of reservations we currently have and the number of extents we currently have. Doing the reservation outside of doing any of the actual dirty'ing lets us do things like filemap_flush() the inode to try and force delalloc to happen, or as a last resort actually start allocation on all delalloc inodes in the fs. This has survived dbench, fs_mark and an fsx torture test. Signed-off-by: Josef Bacik <jbacik@redhat.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:12:44 +08:00
/*
* The same as the snapshot creation, please see the comment
* of create_snapshot().
Btrfs: proper -ENOSPC handling At the start of a transaction we do a btrfs_reserve_metadata_space() and specify how many items we plan on modifying. Then once we've done our modifications and such, just call btrfs_unreserve_metadata_space() for the same number of items we reserved. For keeping track of metadata needed for data I've had to add an extent_io op for when we merge extents. This lets us track space properly when we are doing sequential writes, so we don't end up reserving way more metadata space than what we need. The only place where the metadata space accounting is not done is in the relocation code. This is because Yan is going to be reworking that code in the near future, so running btrfs-vol -b could still possibly result in a ENOSPC related panic. This patch also turns off the metadata_ratio stuff in order to allow users to more efficiently use their disk space. This patch makes it so we track how much metadata we need for an inode's delayed allocation extents by tracking how many extents are currently waiting for allocation. It introduces two new callbacks for the extent_io tree's, merge_extent_hook and split_extent_hook. These help us keep track of when we merge delalloc extents together and split them up. Reservations are handled prior to any actually dirty'ing occurs, and then we unreserve after we dirty. btrfs_unreserve_metadata_for_delalloc() will make the appropriate unreservations as needed based on the number of reservations we currently have and the number of extents we currently have. Doing the reservation outside of doing any of the actual dirty'ing lets us do things like filemap_flush() the inode to try and force delalloc to happen, or as a last resort actually start allocation on all delalloc inodes in the fs. This has survived dbench, fs_mark and an fsx torture test. Signed-off-by: Josef Bacik <jbacik@redhat.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-09-12 04:12:44 +08:00
*/
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false);
if (ret)
goto fail_free;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_subvolume_release_metadata(fs_info, &block_rsv);
goto fail_free;
}
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
if (ret)
goto fail;
leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto fail;
}
btrfs_mark_buffer_dirty(leaf);
inode_item = &root_item->inode;
btrfs_set_stack_inode_generation(inode_item, 1);
btrfs_set_stack_inode_size(inode_item, 3);
btrfs_set_stack_inode_nlink(inode_item, 1);
btrfs_set_stack_inode_nbytes(inode_item,
fs_info->nodesize);
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
btrfs_set_root_flags(root_item, 0);
btrfs_set_root_limit(root_item, 0);
btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
btrfs_set_root_bytenr(root_item, leaf->start);
btrfs_set_root_generation(root_item, trans->transid);
btrfs_set_root_level(root_item, 0);
btrfs_set_root_refs(root_item, 1);
btrfs_set_root_used(root_item, leaf->len);
btrfs_set_root_last_snapshot(root_item, 0);
btrfs_set_root_generation_v2(root_item,
btrfs_root_generation(root_item));
uuid_le_gen(&new_uuid);
memcpy(root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
root_item->ctime = root_item->otime;
btrfs_set_root_ctransid(root_item, trans->transid);
btrfs_set_root_otransid(root_item, trans->transid);
btrfs_tree_unlock(leaf);
free_extent_buffer(leaf);
leaf = NULL;
btrfs_set_root_dirid(root_item, new_dirid);
key.objectid = objectid;
Btrfs: Mixed back reference (FORWARD ROLLING FORMAT CHANGE) This commit introduces a new kind of back reference for btrfs metadata. Once a filesystem has been mounted with this commit, IT WILL NO LONGER BE MOUNTABLE BY OLDER KERNELS. When a tree block in subvolume tree is cow'd, the reference counts of all extents it points to are increased by one. At transaction commit time, the old root of the subvolume is recorded in a "dead root" data structure, and the btree it points to is later walked, dropping reference counts and freeing any blocks where the reference count goes to 0. The increments done during cow and decrements done after commit cancel out, and the walk is a very expensive way to go about freeing the blocks that are no longer referenced by the new btree root. This commit reduces the transaction overhead by avoiding the need for dead root records. When a non-shared tree block is cow'd, we free the old block at once, and the new block inherits old block's references. When a tree block with reference count > 1 is cow'd, we increase the reference counts of all extents the new block points to by one, and decrease the old block's reference count by one. This dead tree avoidance code removes the need to modify the reference counts of lower level extents when a non-shared tree block is cow'd. But we still need to update back ref for all pointers in the block. This is because the location of the block is recorded in the back ref item. We can solve this by introducing a new type of back ref. The new back ref provides information about pointer's key, level and in which tree the pointer lives. This information allow us to find the pointer by searching the tree. The shortcoming of the new back ref is that it only works for pointers in tree blocks referenced by their owner trees. This is mostly a problem for snapshots, where resolving one of these fuzzy back references would be O(number_of_snapshots) and quite slow. The solution used here is to use the fuzzy back references in the common case where a given tree block is only referenced by one root, and use the full back references when multiple roots have a reference on a given block. This commit adds per subvolume red-black tree to keep trace of cached inodes. The red-black tree helps the balancing code to find cached inodes whose inode numbers within a given range. This commit improves the balancing code by introducing several data structures to keep the state of balancing. The most important one is the back ref cache. It caches how the upper level tree blocks are referenced. This greatly reduce the overhead of checking back ref. The improved balancing code scales significantly better with a large number of snapshots. This is a very large commit and was written in a number of pieces. But, they depend heavily on the disk format change and were squashed together to make sure git bisect didn't end up in a bad state wrt space balancing or the format change. Signed-off-by: Yan Zheng <zheng.yan@oracle.com> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2009-06-10 22:45:14 +08:00
key.offset = 0;
key.type = BTRFS_ROOT_ITEM_KEY;
ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
root_item);
if (ret)
goto fail;
key.offset = (u64)-1;
new_root = btrfs_read_fs_root_no_name(fs_info, &key);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
goto fail;
}
btrfs_record_root_in_trans(trans, new_root);
Btrfs: add support for inode properties This change adds infrastructure to allow for generic properties for inodes. Properties are name/value pairs that can be associated with inodes for different purposes. They are stored as xattrs with the prefix "btrfs." Properties can be inherited - this means when a directory inode has inheritable properties set, these are added to new inodes created under that directory. Further, subvolumes can also have properties associated with them, and they can be inherited from their parent subvolume. Naturally, directory properties have priority over subvolume properties (in practice a subvolume property is just a regular property associated with the root inode, objectid 256, of the subvolume's fs tree). This change also adds one specific property implementation, named "compression", whose values can be "lzo" or "zlib" and it's an inheritable property. The corresponding changes to btrfs-progs were also implemented. A patch with xfstests for this feature will follow once there's agreement on this change/feature. Further, the script at the bottom of this commit message was used to do some benchmarks to measure any performance penalties of this feature. Basically the tests correspond to: Test 1 - create a filesystem and mount it with compress-force=lzo, then sequentially create N files of 64Kb each, measure how long it took to create the files, unmount the filesystem, mount the filesystem and perform an 'ls -lha' against the test directory holding the N files, and report the time the command took. Test 2 - create a filesystem and don't use any compression option when mounting it - instead set the compression property of the subvolume's root to 'lzo'. Then create N files of 64Kb, and report the time it took. The unmount the filesystem, mount it again and perform an 'ls -lha' like in the former test. This means every single file ends up with a property (xattr) associated to it. Test 3 - same as test 2, but uses 4 properties - 3 are duplicates of the compression property, have no real effect other than adding more work when inheriting properties and taking more btree leaf space. Test 4 - same as test 3 but with 10 properties per file. Results (in seconds, and averages of 5 runs each), for different N numbers of files follow. * Without properties (test 1) file creation time ls -lha time 10 000 files 3.49 0.76 100 000 files 47.19 8.37 1 000 000 files 518.51 107.06 * With 1 property (compression property set to lzo - test 2) file creation time ls -lha time 10 000 files 3.63 0.93 100 000 files 48.56 9.74 1 000 000 files 537.72 125.11 * With 4 properties (test 3) file creation time ls -lha time 10 000 files 3.94 1.20 100 000 files 52.14 11.48 1 000 000 files 572.70 142.13 * With 10 properties (test 4) file creation time ls -lha time 10 000 files 4.61 1.35 100 000 files 58.86 13.83 1 000 000 files 656.01 177.61 The increased latencies with properties are essencialy because of: *) When creating an inode, we now synchronously write 1 more item (an xattr item) for each property inherited from the parent dir (or subvolume). This could be done in an asynchronous way such as we do for dir intex items (delayed-inode.c), which could help reduce the file creation latency; *) With properties, we now have larger fs trees. For this particular test each xattr item uses 75 bytes of leaf space in the fs tree. This could be less by using a new item for xattr items, instead of the current btrfs_dir_item, since we could cut the 'location' and 'type' fields (saving 18 bytes) and maybe 'transid' too (saving a total of 26 bytes per xattr item) from the btrfs_dir_item type. Also tried batching the xattr insertions (ignoring proper hash collision handling, since it didn't exist) when creating files that inherit properties from their parent inode/subvolume, but the end results were (surprisingly) essentially the same. Test script: $ cat test.pl #!/usr/bin/perl -w use strict; use Time::HiRes qw(time); use constant NUM_FILES => 10_000; use constant FILE_SIZES => (64 * 1024); use constant DEV => '/dev/sdb4'; use constant MNT_POINT => '/home/fdmanana/btrfs-tests/dev'; use constant TEST_DIR => (MNT_POINT . '/testdir'); system("mkfs.btrfs", "-l", "16384", "-f", DEV) == 0 or die "mkfs.btrfs failed!"; # following line for testing without properties #system("mount", "-o", "compress-force=lzo", DEV, MNT_POINT) == 0 or die "mount failed!"; # following 2 lines for testing with properties system("mount", DEV, MNT_POINT) == 0 or die "mount failed!"; system("btrfs", "prop", "set", MNT_POINT, "compression", "lzo") == 0 or die "set prop failed!"; system("mkdir", TEST_DIR) == 0 or die "mkdir failed!"; my ($t1, $t2); $t1 = time(); for (my $i = 1; $i <= NUM_FILES; $i++) { my $p = TEST_DIR . '/file_' . $i; open(my $f, '>', $p) or die "Error opening file!"; $f->autoflush(1); for (my $j = 0; $j < FILE_SIZES; $j += 4096) { print $f ('A' x 4096) or die "Error writing to file!"; } close($f); } $t2 = time(); print "Time to create " . NUM_FILES . ": " . ($t2 - $t1) . " seconds.\n"; system("umount", DEV) == 0 or die "umount failed!"; system("mount", DEV, MNT_POINT) == 0 or die "mount failed!"; $t1 = time(); system("bash -c 'ls -lha " . TEST_DIR . " > /dev/null'") == 0 or die "ls failed!"; $t2 = time(); print "Time to ls -lha all files: " . ($t2 - $t1) . " seconds.\n"; system("umount", DEV) == 0 or die "umount failed!"; Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Josef Bacik <jbacik@fb.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-01-07 19:47:46 +08:00
ret = btrfs_create_subvol_root(trans, new_root, root, new_dirid);
if (ret) {
/* We potentially lose an unused inode item here */
btrfs_abort_transaction(trans, ret);
goto fail;
}
Btrfs: Initialize btrfs_root->highest_objectid when loading tree root and subvolume roots The following call trace is seen when btrfs/031 test is executed in a loop, [ 158.661848] ------------[ cut here ]------------ [ 158.662634] WARNING: CPU: 2 PID: 890 at /home/chandan/repos/linux/fs/btrfs/ioctl.c:558 create_subvol+0x3d1/0x6ea() [ 158.664102] BTRFS: Transaction aborted (error -2) [ 158.664774] Modules linked in: [ 158.665266] CPU: 2 PID: 890 Comm: btrfs Not tainted 4.4.0-rc6-g511711a #2 [ 158.666251] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 [ 158.667392] ffffffff81c0a6b0 ffff8806c7c4f8e8 ffffffff81431fc8 ffff8806c7c4f930 [ 158.668515] ffff8806c7c4f920 ffffffff81051aa1 ffff880c85aff000 ffff8800bb44d000 [ 158.669647] ffff8808863b5c98 0000000000000000 00000000fffffffe ffff8806c7c4f980 [ 158.670769] Call Trace: [ 158.671153] [<ffffffff81431fc8>] dump_stack+0x44/0x5c [ 158.671884] [<ffffffff81051aa1>] warn_slowpath_common+0x81/0xc0 [ 158.672769] [<ffffffff81051b27>] warn_slowpath_fmt+0x47/0x50 [ 158.673620] [<ffffffff813bc98d>] create_subvol+0x3d1/0x6ea [ 158.674440] [<ffffffff813777c9>] btrfs_mksubvol.isra.30+0x369/0x520 [ 158.675376] [<ffffffff8108a4aa>] ? percpu_down_read+0x1a/0x50 [ 158.676235] [<ffffffff81377a81>] btrfs_ioctl_snap_create_transid+0x101/0x180 [ 158.677268] [<ffffffff81377b52>] btrfs_ioctl_snap_create+0x52/0x70 [ 158.678183] [<ffffffff8137afb4>] btrfs_ioctl+0x474/0x2f90 [ 158.678975] [<ffffffff81144b8e>] ? vma_merge+0xee/0x300 [ 158.679751] [<ffffffff8115be31>] ? alloc_pages_vma+0x91/0x170 [ 158.680599] [<ffffffff81123f62>] ? lru_cache_add_active_or_unevictable+0x22/0x70 [ 158.681686] [<ffffffff813d99cf>] ? selinux_file_ioctl+0xff/0x1d0 [ 158.682581] [<ffffffff8117b791>] do_vfs_ioctl+0x2c1/0x490 [ 158.683399] [<ffffffff813d3cde>] ? security_file_ioctl+0x3e/0x60 [ 158.684297] [<ffffffff8117b9d4>] SyS_ioctl+0x74/0x80 [ 158.685051] [<ffffffff819b2bd7>] entry_SYSCALL_64_fastpath+0x12/0x6a [ 158.685958] ---[ end trace 4b63312de5a2cb76 ]--- [ 158.686647] BTRFS: error (device loop0) in create_subvol:558: errno=-2 No such entry [ 158.709508] BTRFS info (device loop0): forced readonly [ 158.737113] BTRFS info (device loop0): disk space caching is enabled [ 158.738096] BTRFS error (device loop0): Remounting read-write after error is not allowed [ 158.851303] BTRFS error (device loop0): cleaner transaction attach returned -30 This occurs because, Mount filesystem Create subvol with ID 257 Unmount filesystem Mount filesystem Delete subvol with ID 257 btrfs_drop_snapshot() Add root corresponding to subvol 257 into btrfs_transaction->dropped_roots list Create new subvol (i.e. create_subvol()) 257 is returned as the next free objectid btrfs_read_fs_root_no_name() Finds the btrfs_root instance corresponding to the old subvol with ID 257 in btrfs_fs_info->fs_roots_radix. Returns error since btrfs_root_item->refs has the value of 0. To fix the issue the commit initializes tree root's and subvolume root's highest_objectid when loading the roots from disk. Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com> Signed-off-by: David Sterba <dsterba@suse.com>
2016-01-07 21:26:59 +08:00
mutex_lock(&new_root->objectid_mutex);
new_root->highest_objectid = new_dirid;
mutex_unlock(&new_root->objectid_mutex);
/*
* insert the directory item
*/
ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
BTRFS_FT_DIR, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
ret = btrfs_update_inode(trans, root, dir);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
btrfs_ino(BTRFS_I(dir)), index, name, namelen);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_uuid_tree_add(trans, root_item->uuid,
BTRFS_UUID_KEY_SUBVOL, objectid);
if (ret)
btrfs_abort_transaction(trans, ret);
fail:
kfree(root_item);
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
btrfs_subvolume_release_metadata(fs_info, &block_rsv);
if (async_transid) {
*async_transid = trans->transid;
err = btrfs_commit_transaction_async(trans, 1);
if (err)
err = btrfs_commit_transaction(trans);
} else {
err = btrfs_commit_transaction(trans);
}
if (err && !ret)
ret = err;
if (!ret) {
inode = btrfs_lookup_dentry(dir, dentry);
if (IS_ERR(inode))
return PTR_ERR(inode);
d_instantiate(dentry, inode);
}
return ret;
fail_free:
kfree(root_item);
return ret;
}
static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct dentry *dentry,
u64 *async_transid, bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
struct btrfs_trans_handle *trans;
int ret;
Btrfs: fix unexpected failure of nocow buffered writes after snapshotting when low on space Commit e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting") forced nocow writes to fallback to COW, during writeback, when a snapshot is created. This resulted in writes made before creating the snapshot to unexpectedly fail with ENOSPC during writeback when success (0) was returned to user space through the write system call. The steps leading to this problem are: 1. When it's not possible to allocate data space for a write, the buffered write path checks if a NOCOW write is possible. If it is, it will not reserve space and success (0) is returned to user space. 2. Then when a snapshot is created, the root's will_be_snapshotted atomic is incremented and writeback is triggered for all inode's that belong to the root being snapshotted. Incrementing that atomic forces all previous writes to fallback to COW during writeback (running delalloc). 3. This results in the writeback for the inodes to fail and therefore setting the ENOSPC error in their mappings, so that a subsequent fsync on them will report the error to user space. So it's not a completely silent data loss (since fsync will report ENOSPC) but it's a very unexpected and undesirable behaviour, because if a clean shutdown/unmount of the filesystem happens without previous calls to fsync, it is expected to have the data present in the files after mounting the filesystem again. So fix this by adding a new atomic named snapshot_force_cow to the root structure which prevents this behaviour and works the following way: 1. It is incremented when we start to create a snapshot after triggering writeback and before waiting for writeback to finish. 2. This new atomic is now what is used by writeback (running delalloc) to decide whether we need to fallback to COW or not. Because we incremented this new atomic after triggering writeback in the snapshot creation ioctl, we ensure that all buffered writes that happened before snapshot creation will succeed and not fallback to COW (which would make them fail with ENOSPC). 3. The existing atomic, will_be_snapshotted, is kept because it is used to force new buffered writes, that start after we started snapshotting, to reserve data space even when NOCOW is possible. This makes these writes fail early with ENOSPC when there's no available space to allocate, preventing the unexpected behaviour of writeback later failing with ENOSPC due to a fallback to COW mode. Fixes: e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting") Signed-off-by: Robbie Ko <robbieko@synology.com> Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-08-06 10:30:30 +08:00
bool snapshot_force_cow = false;
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return -EINVAL;
if (atomic_read(&root->nr_swapfiles)) {
btrfs_warn(fs_info,
"cannot snapshot subvolume with active swapfile");
return -ETXTBSY;
}
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL);
if (!pending_snapshot)
return -ENOMEM;
pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
GFP_KERNEL);
pending_snapshot->path = btrfs_alloc_path();
if (!pending_snapshot->root_item || !pending_snapshot->path) {
ret = -ENOMEM;
goto free_pending;
}
Btrfs: fix unexpected failure of nocow buffered writes after snapshotting when low on space Commit e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting") forced nocow writes to fallback to COW, during writeback, when a snapshot is created. This resulted in writes made before creating the snapshot to unexpectedly fail with ENOSPC during writeback when success (0) was returned to user space through the write system call. The steps leading to this problem are: 1. When it's not possible to allocate data space for a write, the buffered write path checks if a NOCOW write is possible. If it is, it will not reserve space and success (0) is returned to user space. 2. Then when a snapshot is created, the root's will_be_snapshotted atomic is incremented and writeback is triggered for all inode's that belong to the root being snapshotted. Incrementing that atomic forces all previous writes to fallback to COW during writeback (running delalloc). 3. This results in the writeback for the inodes to fail and therefore setting the ENOSPC error in their mappings, so that a subsequent fsync on them will report the error to user space. So it's not a completely silent data loss (since fsync will report ENOSPC) but it's a very unexpected and undesirable behaviour, because if a clean shutdown/unmount of the filesystem happens without previous calls to fsync, it is expected to have the data present in the files after mounting the filesystem again. So fix this by adding a new atomic named snapshot_force_cow to the root structure which prevents this behaviour and works the following way: 1. It is incremented when we start to create a snapshot after triggering writeback and before waiting for writeback to finish. 2. This new atomic is now what is used by writeback (running delalloc) to decide whether we need to fallback to COW or not. Because we incremented this new atomic after triggering writeback in the snapshot creation ioctl, we ensure that all buffered writes that happened before snapshot creation will succeed and not fallback to COW (which would make them fail with ENOSPC). 3. The existing atomic, will_be_snapshotted, is kept because it is used to force new buffered writes, that start after we started snapshotting, to reserve data space even when NOCOW is possible. This makes these writes fail early with ENOSPC when there's no available space to allocate, preventing the unexpected behaviour of writeback later failing with ENOSPC due to a fallback to COW mode. Fixes: e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting") Signed-off-by: Robbie Ko <robbieko@synology.com> Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-08-06 10:30:30 +08:00
/*
* Force new buffered writes to reserve space even when NOCOW is
* possible. This is to avoid later writeback (running dealloc) to
* fallback to COW mode and unexpectedly fail with ENOSPC.
*/
atomic_inc(&root->will_be_snapshotted);
smp_mb__after_atomic();
/* wait for no snapshot writes */
wait_event(root->subv_writers->wait,
percpu_counter_sum(&root->subv_writers->counter) == 0);
btrfs: use tagged writepage to mitigate livelock of snapshot Snapshot is expected to be fast. But if there are writers steadily creating dirty pages in our subvolume, the snapshot may take a very long time to complete. To fix the problem, we use tagged writepage for snapshot flusher as we do in the generic write_cache_pages(), so we can omit pages dirtied after the snapshot command. This does not change the semantics regarding which data get to the snapshot, if there are pages being dirtied during the snapshotting operation. There's a sync called before snapshot is taken in old/new case, any IO in flight just after that may be in the snapshot but this depends on other system effects that might still sync the IO. We do a simple snapshot speed test on a Intel D-1531 box: fio --ioengine=libaio --iodepth=32 --bs=4k --rw=write --size=64G --direct=0 --thread=1 --numjobs=1 --time_based --runtime=120 --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5; time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio original: 1m58sec patched: 6.54sec This is the best case for this patch since for a sequential write case, we omit nearly all pages dirtied after the snapshot command. For a multi writers, random write test: fio --ioengine=libaio --iodepth=32 --bs=4k --rw=randwrite --size=64G --direct=0 --thread=1 --numjobs=4 --time_based --runtime=120 --filename=/mnt/sub/testfile --name=job1 --group_reporting & sleep 5; time btrfs sub snap -r /mnt/sub /mnt/snap; killall fio original: 15.83sec patched: 10.35sec The improvement is smaller compared to the sequential write case, since we omit only half of the pages dirtied after snapshot command. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Ethan Lien <ethanlien@synology.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-11-01 14:49:03 +08:00
ret = btrfs_start_delalloc_snapshot(root);
if (ret)
goto dec_and_free;
Btrfs: fix unexpected failure of nocow buffered writes after snapshotting when low on space Commit e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting") forced nocow writes to fallback to COW, during writeback, when a snapshot is created. This resulted in writes made before creating the snapshot to unexpectedly fail with ENOSPC during writeback when success (0) was returned to user space through the write system call. The steps leading to this problem are: 1. When it's not possible to allocate data space for a write, the buffered write path checks if a NOCOW write is possible. If it is, it will not reserve space and success (0) is returned to user space. 2. Then when a snapshot is created, the root's will_be_snapshotted atomic is incremented and writeback is triggered for all inode's that belong to the root being snapshotted. Incrementing that atomic forces all previous writes to fallback to COW during writeback (running delalloc). 3. This results in the writeback for the inodes to fail and therefore setting the ENOSPC error in their mappings, so that a subsequent fsync on them will report the error to user space. So it's not a completely silent data loss (since fsync will report ENOSPC) but it's a very unexpected and undesirable behaviour, because if a clean shutdown/unmount of the filesystem happens without previous calls to fsync, it is expected to have the data present in the files after mounting the filesystem again. So fix this by adding a new atomic named snapshot_force_cow to the root structure which prevents this behaviour and works the following way: 1. It is incremented when we start to create a snapshot after triggering writeback and before waiting for writeback to finish. 2. This new atomic is now what is used by writeback (running delalloc) to decide whether we need to fallback to COW or not. Because we incremented this new atomic after triggering writeback in the snapshot creation ioctl, we ensure that all buffered writes that happened before snapshot creation will succeed and not fallback to COW (which would make them fail with ENOSPC). 3. The existing atomic, will_be_snapshotted, is kept because it is used to force new buffered writes, that start after we started snapshotting, to reserve data space even when NOCOW is possible. This makes these writes fail early with ENOSPC when there's no available space to allocate, preventing the unexpected behaviour of writeback later failing with ENOSPC due to a fallback to COW mode. Fixes: e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting") Signed-off-by: Robbie Ko <robbieko@synology.com> Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-08-06 10:30:30 +08:00
/*
* All previous writes have started writeback in NOCOW mode, so now
* we force future writes to fallback to COW mode during snapshot
* creation.
*/
atomic_inc(&root->snapshot_force_cow);
snapshot_force_cow = true;
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
/*
* 1 - parent dir inode
* 2 - dir entries
* 1 - root item
* 2 - root ref/backref
* 1 - root of snapshot
* 1 - UUID item
*/
ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
&pending_snapshot->block_rsv, 8,
false);
if (ret)
goto dec_and_free;
pending_snapshot->dentry = dentry;
pending_snapshot->root = root;
pending_snapshot->readonly = readonly;
pending_snapshot->dir = dir;
pending_snapshot->inherit = inherit;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto fail;
}
spin_lock(&fs_info->trans_lock);
list_add(&pending_snapshot->list,
&trans->transaction->pending_snapshots);
spin_unlock(&fs_info->trans_lock);
if (async_transid) {
*async_transid = trans->transid;
ret = btrfs_commit_transaction_async(trans, 1);
if (ret)
ret = btrfs_commit_transaction(trans);
} else {
ret = btrfs_commit_transaction(trans);
}
if (ret)
goto fail;
ret = pending_snapshot->error;
if (ret)
goto fail;
ret = btrfs_orphan_cleanup(pending_snapshot->snap);
if (ret)
goto fail;
inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto fail;
}
d_instantiate(dentry, inode);
ret = 0;
fail:
btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
dec_and_free:
Btrfs: fix unexpected failure of nocow buffered writes after snapshotting when low on space Commit e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting") forced nocow writes to fallback to COW, during writeback, when a snapshot is created. This resulted in writes made before creating the snapshot to unexpectedly fail with ENOSPC during writeback when success (0) was returned to user space through the write system call. The steps leading to this problem are: 1. When it's not possible to allocate data space for a write, the buffered write path checks if a NOCOW write is possible. If it is, it will not reserve space and success (0) is returned to user space. 2. Then when a snapshot is created, the root's will_be_snapshotted atomic is incremented and writeback is triggered for all inode's that belong to the root being snapshotted. Incrementing that atomic forces all previous writes to fallback to COW during writeback (running delalloc). 3. This results in the writeback for the inodes to fail and therefore setting the ENOSPC error in their mappings, so that a subsequent fsync on them will report the error to user space. So it's not a completely silent data loss (since fsync will report ENOSPC) but it's a very unexpected and undesirable behaviour, because if a clean shutdown/unmount of the filesystem happens without previous calls to fsync, it is expected to have the data present in the files after mounting the filesystem again. So fix this by adding a new atomic named snapshot_force_cow to the root structure which prevents this behaviour and works the following way: 1. It is incremented when we start to create a snapshot after triggering writeback and before waiting for writeback to finish. 2. This new atomic is now what is used by writeback (running delalloc) to decide whether we need to fallback to COW or not. Because we incremented this new atomic after triggering writeback in the snapshot creation ioctl, we ensure that all buffered writes that happened before snapshot creation will succeed and not fallback to COW (which would make them fail with ENOSPC). 3. The existing atomic, will_be_snapshotted, is kept because it is used to force new buffered writes, that start after we started snapshotting, to reserve data space even when NOCOW is possible. This makes these writes fail early with ENOSPC when there's no available space to allocate, preventing the unexpected behaviour of writeback later failing with ENOSPC due to a fallback to COW mode. Fixes: e9894fd3e3b3 ("Btrfs: fix snapshot vs nocow writting") Signed-off-by: Robbie Ko <robbieko@synology.com> Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-08-06 10:30:30 +08:00
if (snapshot_force_cow)
atomic_dec(&root->snapshot_force_cow);
if (atomic_dec_and_test(&root->will_be_snapshotted))
wake_up_var(&root->will_be_snapshotted);
free_pending:
kfree(pending_snapshot->root_item);
btrfs_free_path(pending_snapshot->path);
kfree(pending_snapshot);
return ret;
}
/* copy of may_delete in fs/namei.c()
* Check whether we can remove a link victim from directory dir, check
* whether the type of victim is right.
* 1. We can't do it if dir is read-only (done in permission())
* 2. We should have write and exec permissions on dir
* 3. We can't remove anything from append-only dir
* 4. We can't do anything with immutable dir (done in permission())
* 5. If the sticky bit on dir is set we should either
* a. be owner of dir, or
* b. be owner of victim, or
* c. have CAP_FOWNER capability
* 6. If the victim is append-only or immutable we can't do anything with
* links pointing to it.
* 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
* 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
* 9. We can't remove a root or mountpoint.
* 10. We don't allow removal of NFS sillyrenamed files; it's handled by
* nfs_async_unlink().
*/
static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
{
int error;
if (d_really_is_negative(victim))
return -ENOENT;
BUG_ON(d_inode(victim->d_parent) != dir);
audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
if (error)
return error;
if (IS_APPEND(dir))
return -EPERM;
if (check_sticky(dir, d_inode(victim)) || IS_APPEND(d_inode(victim)) ||
IS_IMMUTABLE(d_inode(victim)) || IS_SWAPFILE(d_inode(victim)))
return -EPERM;
if (isdir) {
VFS: (Scripted) Convert S_ISLNK/DIR/REG(dentry->d_inode) to d_is_*(dentry) Convert the following where appropriate: (1) S_ISLNK(dentry->d_inode) to d_is_symlink(dentry). (2) S_ISREG(dentry->d_inode) to d_is_reg(dentry). (3) S_ISDIR(dentry->d_inode) to d_is_dir(dentry). This is actually more complicated than it appears as some calls should be converted to d_can_lookup() instead. The difference is whether the directory in question is a real dir with a ->lookup op or whether it's a fake dir with a ->d_automount op. In some circumstances, we can subsume checks for dentry->d_inode not being NULL into this, provided we the code isn't in a filesystem that expects d_inode to be NULL if the dirent really *is* negative (ie. if we're going to use d_inode() rather than d_backing_inode() to get the inode pointer). Note that the dentry type field may be set to something other than DCACHE_MISS_TYPE when d_inode is NULL in the case of unionmount, where the VFS manages the fall-through from a negative dentry to a lower layer. In such a case, the dentry type of the negative union dentry is set to the same as the type of the lower dentry. However, if you know d_inode is not NULL at the call site, then you can use the d_is_xxx() functions even in a filesystem. There is one further complication: a 0,0 chardev dentry may be labelled DCACHE_WHITEOUT_TYPE rather than DCACHE_SPECIAL_TYPE. Strictly, this was intended for special directory entry types that don't have attached inodes. The following perl+coccinelle script was used: use strict; my @callers; open($fd, 'git grep -l \'S_IS[A-Z].*->d_inode\' |') || die "Can't grep for S_ISDIR and co. callers"; @callers = <$fd>; close($fd); unless (@callers) { print "No matches\n"; exit(0); } my @cocci = ( '@@', 'expression E;', '@@', '', '- S_ISLNK(E->d_inode->i_mode)', '+ d_is_symlink(E)', '', '@@', 'expression E;', '@@', '', '- S_ISDIR(E->d_inode->i_mode)', '+ d_is_dir(E)', '', '@@', 'expression E;', '@@', '', '- S_ISREG(E->d_inode->i_mode)', '+ d_is_reg(E)' ); my $coccifile = "tmp.sp.cocci"; open($fd, ">$coccifile") || die $coccifile; print($fd "$_\n") || die $coccifile foreach (@cocci); close($fd); foreach my $file (@callers) { chomp $file; print "Processing ", $file, "\n"; system("spatch", "--sp-file", $coccifile, $file, "--in-place", "--no-show-diff") == 0 || die "spatch failed"; } [AV: overlayfs parts skipped] Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-01-29 20:02:35 +08:00
if (!d_is_dir(victim))
return -ENOTDIR;
if (IS_ROOT(victim))
return -EBUSY;
VFS: (Scripted) Convert S_ISLNK/DIR/REG(dentry->d_inode) to d_is_*(dentry) Convert the following where appropriate: (1) S_ISLNK(dentry->d_inode) to d_is_symlink(dentry). (2) S_ISREG(dentry->d_inode) to d_is_reg(dentry). (3) S_ISDIR(dentry->d_inode) to d_is_dir(dentry). This is actually more complicated than it appears as some calls should be converted to d_can_lookup() instead. The difference is whether the directory in question is a real dir with a ->lookup op or whether it's a fake dir with a ->d_automount op. In some circumstances, we can subsume checks for dentry->d_inode not being NULL into this, provided we the code isn't in a filesystem that expects d_inode to be NULL if the dirent really *is* negative (ie. if we're going to use d_inode() rather than d_backing_inode() to get the inode pointer). Note that the dentry type field may be set to something other than DCACHE_MISS_TYPE when d_inode is NULL in the case of unionmount, where the VFS manages the fall-through from a negative dentry to a lower layer. In such a case, the dentry type of the negative union dentry is set to the same as the type of the lower dentry. However, if you know d_inode is not NULL at the call site, then you can use the d_is_xxx() functions even in a filesystem. There is one further complication: a 0,0 chardev dentry may be labelled DCACHE_WHITEOUT_TYPE rather than DCACHE_SPECIAL_TYPE. Strictly, this was intended for special directory entry types that don't have attached inodes. The following perl+coccinelle script was used: use strict; my @callers; open($fd, 'git grep -l \'S_IS[A-Z].*->d_inode\' |') || die "Can't grep for S_ISDIR and co. callers"; @callers = <$fd>; close($fd); unless (@callers) { print "No matches\n"; exit(0); } my @cocci = ( '@@', 'expression E;', '@@', '', '- S_ISLNK(E->d_inode->i_mode)', '+ d_is_symlink(E)', '', '@@', 'expression E;', '@@', '', '- S_ISDIR(E->d_inode->i_mode)', '+ d_is_dir(E)', '', '@@', 'expression E;', '@@', '', '- S_ISREG(E->d_inode->i_mode)', '+ d_is_reg(E)' ); my $coccifile = "tmp.sp.cocci"; open($fd, ">$coccifile") || die $coccifile; print($fd "$_\n") || die $coccifile foreach (@cocci); close($fd); foreach my $file (@callers) { chomp $file; print "Processing ", $file, "\n"; system("spatch", "--sp-file", $coccifile, $file, "--in-place", "--no-show-diff") == 0 || die "spatch failed"; } [AV: overlayfs parts skipped] Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2015-01-29 20:02:35 +08:00
} else if (d_is_dir(victim))
return -EISDIR;
if (IS_DEADDIR(dir))
return -ENOENT;
if (victim->d_flags & DCACHE_NFSFS_RENAMED)
return -EBUSY;
return 0;
}
/* copy of may_create in fs/namei.c() */
static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
{
if (d_really_is_positive(child))
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}
/*
* Create a new subvolume below @parent. This is largely modeled after
* sys_mkdirat and vfs_mkdir, but we only do a single component lookup
* inside this filesystem so it's quite a bit simpler.
*/
static noinline int btrfs_mksubvol(const struct path *parent,
const char *name, int namelen,
struct btrfs_root *snap_src,
u64 *async_transid, bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
struct inode *dir = d_inode(parent->dentry);
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct dentry *dentry;
int error;
error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
if (error == -EINTR)
return error;
dentry = lookup_one_len(name, parent->dentry, namelen);
error = PTR_ERR(dentry);
if (IS_ERR(dentry))
goto out_unlock;
error = btrfs_may_create(dir, dentry);
if (error)
goto out_dput;
/*
* even if this name doesn't exist, we may get hash collisions.
* check for them now when we can safely fail
*/
error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
dir->i_ino, name,
namelen);
if (error)
goto out_dput;
down_read(&fs_info->subvol_sem);
if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
goto out_up_read;
if (snap_src) {
error = create_snapshot(snap_src, dir, dentry,
async_transid, readonly, inherit);
} else {
error = create_subvol(dir, dentry, name, namelen,
async_transid, inherit);
}
if (!error)
fsnotify_mkdir(dir, dentry);
out_up_read:
up_read(&fs_info->subvol_sem);
out_dput:
dput(dentry);
out_unlock:
inode_unlock(dir);
return error;
}
/*
* When we're defragging a range, we don't want to kick it off again
* if it is really just waiting for delalloc to send it down.
* If we find a nice big extent or delalloc range for the bytes in the
* file you want to defrag, we return 0 to let you know to skip this
* part of the file
*/
static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh)
{
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em = NULL;
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
u64 end;
read_lock(&em_tree->lock);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
em = lookup_extent_mapping(em_tree, offset, PAGE_SIZE);
read_unlock(&em_tree->lock);
if (em) {
end = extent_map_end(em);
free_extent_map(em);
if (end - offset > thresh)
return 0;
}
/* if we already have a nice delalloc here, just stop */
thresh /= 2;
end = count_range_bits(io_tree, &offset, offset + thresh,
thresh, EXTENT_DELALLOC, 1);
if (end >= thresh)
return 0;
return 1;
}
/*
* helper function to walk through a file and find extents
* newer than a specific transid, and smaller than thresh.
*
* This is used by the defragging code to find new and small
* extents
*/
static int find_new_extents(struct btrfs_root *root,
struct inode *inode, u64 newer_than,
u64 *off, u32 thresh)
{
struct btrfs_path *path;
struct btrfs_key min_key;
struct extent_buffer *leaf;
struct btrfs_file_extent_item *extent;
int type;
int ret;
u64 ino = btrfs_ino(BTRFS_I(inode));
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
min_key.objectid = ino;
min_key.type = BTRFS_EXTENT_DATA_KEY;
min_key.offset = *off;
while (1) {
ret = btrfs_search_forward(root, &min_key, path, newer_than);
if (ret != 0)
goto none;
process_slot:
if (min_key.objectid != ino)
goto none;
if (min_key.type != BTRFS_EXTENT_DATA_KEY)
goto none;
leaf = path->nodes[0];
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
type = btrfs_file_extent_type(leaf, extent);
if (type == BTRFS_FILE_EXTENT_REG &&
btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
check_defrag_in_cache(inode, min_key.offset, thresh)) {
*off = min_key.offset;
btrfs_free_path(path);
return 0;
}
path->slots[0]++;
if (path->slots[0] < btrfs_header_nritems(leaf)) {
btrfs_item_key_to_cpu(leaf, &min_key, path->slots[0]);
goto process_slot;
}
if (min_key.offset == (u64)-1)
goto none;
min_key.offset++;
btrfs_release_path(path);
}
none:
btrfs_free_path(path);
return -ENOENT;
}
static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
{
struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_map *em;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
u64 len = PAGE_SIZE;
/*
* hopefully we have this extent in the tree already, try without
* the full extent lock
*/
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
read_unlock(&em_tree->lock);
if (!em) {
struct extent_state *cached = NULL;
u64 end = start + len - 1;
/* get the big lock and read metadata off disk */
lock_extent_bits(io_tree, start, end, &cached);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
unlock_extent_cached(io_tree, start, end, &cached);
if (IS_ERR(em))
return NULL;
}
return em;
}
static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
{
struct extent_map *next;
bool ret = true;
/* this is the last extent */
if (em->start + em->len >= i_size_read(inode))
return false;
next = defrag_lookup_extent(inode, em->start + em->len);
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
ret = false;
else if ((em->block_start + em->block_len == next->block_start) &&
(em->block_len > SZ_128K && next->block_len > SZ_128K))
ret = false;
free_extent_map(next);
return ret;
}
static int should_defrag_range(struct inode *inode, u64 start, u32 thresh,
u64 *last_len, u64 *skip, u64 *defrag_end,
int compress)
{
struct extent_map *em;
int ret = 1;
bool next_mergeable = true;
bool prev_mergeable = true;
/*
* make sure that once we start defragging an extent, we keep on
* defragging it
*/
if (start < *defrag_end)
return 1;
*skip = 0;
em = defrag_lookup_extent(inode, start);
if (!em)
return 0;
/* this will cover holes, and inline extents */
if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
ret = 0;
goto out;
}
if (!*defrag_end)
prev_mergeable = false;
next_mergeable = defrag_check_next_extent(inode, em);
/*
* we hit a real extent, if it is big or the next extent is not a
* real extent, don't bother defragging it
*/
if (!compress && (*last_len == 0 || *last_len >= thresh) &&
(em->len >= thresh || (!next_mergeable && !prev_mergeable)))
ret = 0;
out:
/*
* last_len ends up being a counter of how many bytes we've defragged.
* every time we choose not to defrag an extent, we reset *last_len
* so that the next tiny extent will force a defrag.
*
* The end result of this is that tiny extents before a single big
* extent will force at least part of that big extent to be defragged.
*/
if (ret) {
*defrag_end = extent_map_end(em);
} else {
*last_len = 0;
*skip = extent_map_end(em);
*defrag_end = 0;
}
free_extent_map(em);
return ret;
}
/*
* it doesn't do much good to defrag one or two pages
* at a time. This pulls in a nice chunk of pages
* to COW and defrag.
*
* It also makes sure the delalloc code has enough
* dirty data to avoid making new small extents as part
* of the defrag
*
* It's a good idea to start RA on this range
* before calling this.
*/
static int cluster_pages_for_defrag(struct inode *inode,
struct page **pages,
unsigned long start_index,
unsigned long num_pages)
{
unsigned long file_end;
u64 isize = i_size_read(inode);
u64 page_start;
u64 page_end;
u64 page_cnt;
int ret;
int i;
int i_done;
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_io_tree *tree;
struct extent_changeset *data_reserved = NULL;
gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
file_end = (isize - 1) >> PAGE_SHIFT;
if (!isize || start_index > file_end)
return 0;
page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT);
if (ret)
return ret;
i_done = 0;
tree = &BTRFS_I(inode)->io_tree;
/* step one, lock all the pages */
for (i = 0; i < page_cnt; i++) {
struct page *page;
again:
page = find_or_create_page(inode->i_mapping,
start_index + i, mask);
if (!page)
break;
page_start = page_offset(page);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
page_end = page_start + PAGE_SIZE - 1;
while (1) {
lock_extent_bits(tree, page_start, page_end,
&cached_state);
ordered = btrfs_lookup_ordered_extent(inode,
page_start);
unlock_extent_cached(tree, page_start, page_end,
&cached_state);
if (!ordered)
break;
unlock_page(page);
btrfs_start_ordered_extent(inode, ordered, 1);
btrfs_put_ordered_extent(ordered);
lock_page(page);
/*
* we unlocked the page above, so we need check if
* it was released or not.
*/
if (page->mapping != inode->i_mapping) {
unlock_page(page);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
put_page(page);
goto again;
}
}
if (!PageUptodate(page)) {
btrfs_readpage(NULL, page);
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
put_page(page);
ret = -EIO;
break;
}
}
if (page->mapping != inode->i_mapping) {
unlock_page(page);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
put_page(page);
goto again;
}
pages[i] = page;
i_done++;
}
if (!i_done || ret)
goto out;
Rename superblock flags (MS_xyz -> SB_xyz) This is a pure automated search-and-replace of the internal kernel superblock flags. The s_flags are now called SB_*, with the names and the values for the moment mirroring the MS_* flags that they're equivalent to. Note how the MS_xyz flags are the ones passed to the mount system call, while the SB_xyz flags are what we then use in sb->s_flags. The script to do this was: # places to look in; re security/*: it generally should *not* be # touched (that stuff parses mount(2) arguments directly), but # there are two places where we really deal with superblock flags. FILES="drivers/mtd drivers/staging/lustre fs ipc mm \ include/linux/fs.h include/uapi/linux/bfs_fs.h \ security/apparmor/apparmorfs.c security/apparmor/include/lib.h" # the list of MS_... constants SYMS="RDONLY NOSUID NODEV NOEXEC SYNCHRONOUS REMOUNT MANDLOCK \ DIRSYNC NOATIME NODIRATIME BIND MOVE REC VERBOSE SILENT \ POSIXACL UNBINDABLE PRIVATE SLAVE SHARED RELATIME KERNMOUNT \ I_VERSION STRICTATIME LAZYTIME SUBMOUNT NOREMOTELOCK NOSEC BORN \ ACTIVE NOUSER" SED_PROG= for i in $SYMS; do SED_PROG="$SED_PROG -e s/MS_$i/SB_$i/g"; done # we want files that contain at least one of MS_..., # with fs/namespace.c and fs/pnode.c excluded. L=$(for i in $SYMS; do git grep -w -l MS_$i $FILES; done| sort|uniq|grep -v '^fs/namespace.c'|grep -v '^fs/pnode.c') for f in $L; do sed -i $f $SED_PROG; done Requested-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-11-28 05:05:09 +08:00
if (!(inode->i_sb->s_flags & SB_ACTIVE))
goto out;
/*
* so now we have a nice long stream of locked
* and up to date pages, lets wait on them
*/
for (i = 0; i < i_done; i++)
wait_on_page_writeback(pages[i]);
page_start = page_offset(pages[0]);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
page_end = page_offset(pages[i_done - 1]) + PAGE_SIZE;
lock_extent_bits(&BTRFS_I(inode)->io_tree,
page_start, page_end - 1, &cached_state);
clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
page_end - 1, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 0, 0, &cached_state);
if (i_done != page_cnt) {
spin_lock(&BTRFS_I(inode)->lock);
btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
spin_unlock(&BTRFS_I(inode)->lock);
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges [BUG] For the following case, btrfs can underflow qgroup reserved space at an error path: (Page size 4K, function name without "btrfs_" prefix) Task A | Task B ---------------------------------------------------------------------- Buffered_write [0, 2K) | |- check_data_free_space() | | |- qgroup_reserve_data() | | Range aligned to page | | range [0, 4K) <<< | | 4K bytes reserved <<< | |- copy pages to page cache | | Buffered_write [2K, 4K) | |- check_data_free_space() | | |- qgroup_reserved_data() | | Range alinged to page | | range [0, 4K) | | Already reserved by A <<< | | 0 bytes reserved <<< | |- delalloc_reserve_metadata() | | And it *FAILED* (Maybe EQUOTA) | |- free_reserved_data_space() |- qgroup_free_data() Range aligned to page range [0, 4K) Freeing 4K (Special thanks to Chandan for the detailed report and analyse) [CAUSE] Above Task B is freeing reserved data range [0, 4K) which is actually reserved by Task A. And at writeback time, page dirty by Task A will go through writeback routine, which will free 4K reserved data space at file extent insert time, causing the qgroup underflow. [FIX] For btrfs_qgroup_free_data(), add @reserved parameter to only free data ranges reserved by previous btrfs_qgroup_reserve_data(). So in above case, Task B will try to free 0 byte, so no underflow. Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com> Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com> Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 15:10:39 +08:00
btrfs_delalloc_release_space(inode, data_reserved,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
start_index << PAGE_SHIFT,
btrfs: qgroup: Use separate meta reservation type for delalloc Before this patch, btrfs qgroup is mixing per-transcation meta rsv with preallocated meta rsv, making it quite easy to underflow qgroup meta reservation. Since we have the new qgroup meta rsv types, apply it to delalloc reservation. Now for delalloc, most of its reserved space will use META_PREALLOC qgroup rsv type. And for callers reducing outstanding extent like btrfs_finish_ordered_io(), they will convert corresponding META_PREALLOC reservation to META_PERTRANS. This is mainly due to the fact that current qgroup numbers will only be updated in btrfs_commit_transaction(), that's to say if we don't keep such placeholder reservation, we can exceed qgroup limitation. And for callers freeing outstanding extent in error handler, we will just free META_PREALLOC bytes. This behavior makes callers of btrfs_qgroup_release_meta() or btrfs_qgroup_convert_meta() to be aware of which type they are. So in this patch, btrfs_delalloc_release_metadata() and its callers get an extra parameter to info qgroup to do correct meta convert/release. The good news is, even we use the wrong type (convert or free), it won't cause obvious bug, as prealloc type is always in good shape, and the type only affects how per-trans meta is increased or not. So the worst case will be at most metadata limitation can be sometimes exceeded (no convert at all) or metadata limitation is reached too soon (no free at all). Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:32 +08:00
(page_cnt - i_done) << PAGE_SHIFT, true);
}
set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
&cached_state);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
page_start, page_end - 1, &cached_state);
for (i = 0; i < i_done; i++) {
clear_page_dirty_for_io(pages[i]);
ClearPageChecked(pages[i]);
set_page_extent_mapped(pages[i]);
set_page_dirty(pages[i]);
unlock_page(pages[i]);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
put_page(pages[i]);
}
btrfs: qgroup: Always free PREALLOC META reserve in btrfs_delalloc_release_extents() [Background] Btrfs qgroup uses two types of reserved space for METADATA space, PERTRANS and PREALLOC. PERTRANS is metadata space reserved for each transaction started by btrfs_start_transaction(). While PREALLOC is for delalloc, where we reserve space before joining a transaction, and finally it will be converted to PERTRANS after the writeback is done. [Inconsistency] However there is inconsistency in how we handle PREALLOC metadata space. The most obvious one is: In btrfs_buffered_write(): btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, true); We always free qgroup PREALLOC meta space. While in btrfs_truncate_block(): btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0)); We only free qgroup PREALLOC meta space when something went wrong. [The Correct Behavior] The correct behavior should be the one in btrfs_buffered_write(), we should always free PREALLOC metadata space. The reason is, the btrfs_delalloc_* mechanism works by: - Reserve metadata first, even it's not necessary In btrfs_delalloc_reserve_metadata() - Free the unused metadata space Normally in: btrfs_delalloc_release_extents() |- btrfs_inode_rsv_release() Here we do calculation on whether we should release or not. E.g. for 64K buffered write, the metadata rsv works like: /* The first page */ reserve_meta: num_bytes=calc_inode_reservations() free_meta: num_bytes=0 total: num_bytes=calc_inode_reservations() /* The first page caused one outstanding extent, thus needs metadata rsv */ /* The 2nd page */ reserve_meta: num_bytes=calc_inode_reservations() free_meta: num_bytes=calc_inode_reservations() total: not changed /* The 2nd page doesn't cause new outstanding extent, needs no new meta rsv, so we free what we have reserved */ /* The 3rd~16th pages */ reserve_meta: num_bytes=calc_inode_reservations() free_meta: num_bytes=calc_inode_reservations() total: not changed (still space for one outstanding extent) This means, if btrfs_delalloc_release_extents() determines to free some space, then those space should be freed NOW. So for qgroup, we should call btrfs_qgroup_free_meta_prealloc() other than btrfs_qgroup_convert_reserved_meta(). The good news is: - The callers are not that hot The hottest caller is in btrfs_buffered_write(), which is already fixed by commit 336a8bb8e36a ("btrfs: Fix wrong btrfs_delalloc_release_extents parameter"). Thus it's not that easy to cause false EDQUOT. - The trans commit in advance for qgroup would hide the bug Since commit f5fef4593653 ("btrfs: qgroup: Make qgroup async transaction commit more aggressive"), when btrfs qgroup metadata free space is slow, it will try to commit transaction and free the wrongly converted PERTRANS space, so it's not that easy to hit such bug. [FIX] So to fix the problem, remove the @qgroup_free parameter for btrfs_delalloc_release_extents(), and always pass true to btrfs_inode_rsv_release(). Reported-by: Filipe Manana <fdmanana@suse.com> Fixes: 43b18595d660 ("btrfs: qgroup: Use separate meta reservation type for delalloc") CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-14 14:34:51 +08:00
btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return i_done;
out:
for (i = 0; i < i_done; i++) {
unlock_page(pages[i]);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
put_page(pages[i]);
}
btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges [BUG] For the following case, btrfs can underflow qgroup reserved space at an error path: (Page size 4K, function name without "btrfs_" prefix) Task A | Task B ---------------------------------------------------------------------- Buffered_write [0, 2K) | |- check_data_free_space() | | |- qgroup_reserve_data() | | Range aligned to page | | range [0, 4K) <<< | | 4K bytes reserved <<< | |- copy pages to page cache | | Buffered_write [2K, 4K) | |- check_data_free_space() | | |- qgroup_reserved_data() | | Range alinged to page | | range [0, 4K) | | Already reserved by A <<< | | 0 bytes reserved <<< | |- delalloc_reserve_metadata() | | And it *FAILED* (Maybe EQUOTA) | |- free_reserved_data_space() |- qgroup_free_data() Range aligned to page range [0, 4K) Freeing 4K (Special thanks to Chandan for the detailed report and analyse) [CAUSE] Above Task B is freeing reserved data range [0, 4K) which is actually reserved by Task A. And at writeback time, page dirty by Task A will go through writeback routine, which will free 4K reserved data space at file extent insert time, causing the qgroup underflow. [FIX] For btrfs_qgroup_free_data(), add @reserved parameter to only free data ranges reserved by previous btrfs_qgroup_reserve_data(). So in above case, Task B will try to free 0 byte, so no underflow. Reported-by: Chandan Rajendra <chandan@linux.vnet.ibm.com> Signed-off-by: Qu Wenruo <quwenruo@cn.fujitsu.com> Reviewed-by: Chandan Rajendra <chandan@linux.vnet.ibm.com> Tested-by: Chandan Rajendra <chandan@linux.vnet.ibm.com> Signed-off-by: David Sterba <dsterba@suse.com>
2017-02-27 15:10:39 +08:00
btrfs_delalloc_release_space(inode, data_reserved,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
start_index << PAGE_SHIFT,
btrfs: qgroup: Use separate meta reservation type for delalloc Before this patch, btrfs qgroup is mixing per-transcation meta rsv with preallocated meta rsv, making it quite easy to underflow qgroup meta reservation. Since we have the new qgroup meta rsv types, apply it to delalloc reservation. Now for delalloc, most of its reserved space will use META_PREALLOC qgroup rsv type. And for callers reducing outstanding extent like btrfs_finish_ordered_io(), they will convert corresponding META_PREALLOC reservation to META_PERTRANS. This is mainly due to the fact that current qgroup numbers will only be updated in btrfs_commit_transaction(), that's to say if we don't keep such placeholder reservation, we can exceed qgroup limitation. And for callers freeing outstanding extent in error handler, we will just free META_PREALLOC bytes. This behavior makes callers of btrfs_qgroup_release_meta() or btrfs_qgroup_convert_meta() to be aware of which type they are. So in this patch, btrfs_delalloc_release_metadata() and its callers get an extra parameter to info qgroup to do correct meta convert/release. The good news is, even we use the wrong type (convert or free), it won't cause obvious bug, as prealloc type is always in good shape, and the type only affects how per-trans meta is increased or not. So the worst case will be at most metadata limitation can be sometimes exceeded (no convert at all) or metadata limitation is reached too soon (no free at all). Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2017-12-12 15:34:32 +08:00
page_cnt << PAGE_SHIFT, true);
btrfs: qgroup: Always free PREALLOC META reserve in btrfs_delalloc_release_extents() [Background] Btrfs qgroup uses two types of reserved space for METADATA space, PERTRANS and PREALLOC. PERTRANS is metadata space reserved for each transaction started by btrfs_start_transaction(). While PREALLOC is for delalloc, where we reserve space before joining a transaction, and finally it will be converted to PERTRANS after the writeback is done. [Inconsistency] However there is inconsistency in how we handle PREALLOC metadata space. The most obvious one is: In btrfs_buffered_write(): btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes, true); We always free qgroup PREALLOC meta space. While in btrfs_truncate_block(): btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0)); We only free qgroup PREALLOC meta space when something went wrong. [The Correct Behavior] The correct behavior should be the one in btrfs_buffered_write(), we should always free PREALLOC metadata space. The reason is, the btrfs_delalloc_* mechanism works by: - Reserve metadata first, even it's not necessary In btrfs_delalloc_reserve_metadata() - Free the unused metadata space Normally in: btrfs_delalloc_release_extents() |- btrfs_inode_rsv_release() Here we do calculation on whether we should release or not. E.g. for 64K buffered write, the metadata rsv works like: /* The first page */ reserve_meta: num_bytes=calc_inode_reservations() free_meta: num_bytes=0 total: num_bytes=calc_inode_reservations() /* The first page caused one outstanding extent, thus needs metadata rsv */ /* The 2nd page */ reserve_meta: num_bytes=calc_inode_reservations() free_meta: num_bytes=calc_inode_reservations() total: not changed /* The 2nd page doesn't cause new outstanding extent, needs no new meta rsv, so we free what we have reserved */ /* The 3rd~16th pages */ reserve_meta: num_bytes=calc_inode_reservations() free_meta: num_bytes=calc_inode_reservations() total: not changed (still space for one outstanding extent) This means, if btrfs_delalloc_release_extents() determines to free some space, then those space should be freed NOW. So for qgroup, we should call btrfs_qgroup_free_meta_prealloc() other than btrfs_qgroup_convert_reserved_meta(). The good news is: - The callers are not that hot The hottest caller is in btrfs_buffered_write(), which is already fixed by commit 336a8bb8e36a ("btrfs: Fix wrong btrfs_delalloc_release_extents parameter"). Thus it's not that easy to cause false EDQUOT. - The trans commit in advance for qgroup would hide the bug Since commit f5fef4593653 ("btrfs: qgroup: Make qgroup async transaction commit more aggressive"), when btrfs qgroup metadata free space is slow, it will try to commit transaction and free the wrongly converted PERTRANS space, so it's not that easy to hit such bug. [FIX] So to fix the problem, remove the @qgroup_free parameter for btrfs_delalloc_release_extents(), and always pass true to btrfs_inode_rsv_release(). Reported-by: Filipe Manana <fdmanana@suse.com> Fixes: 43b18595d660 ("btrfs: qgroup: Use separate meta reservation type for delalloc") CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-10-14 14:34:51 +08:00
btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return ret;
}
int btrfs_defrag_file(struct inode *inode, struct file *file,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct file_ra_state *ra = NULL;
unsigned long last_index;
u64 isize = i_size_read(inode);
u64 last_len = 0;
u64 skip = 0;
u64 defrag_end = 0;
u64 newer_off = range->start;
unsigned long i;
unsigned long ra_index = 0;
int ret;
int defrag_count = 0;
int compress_type = BTRFS_COMPRESS_ZLIB;
u32 extent_thresh = range->extent_thresh;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
unsigned long max_cluster = SZ_256K >> PAGE_SHIFT;
unsigned long cluster = max_cluster;
u64 new_align = ~((u64)SZ_128K - 1);
struct page **pages = NULL;
bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
if (isize == 0)
return 0;
if (range->start >= isize)
return -EINVAL;
if (do_compress) {
if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
return -EINVAL;
if (range->compress_type)
compress_type = range->compress_type;
}
if (extent_thresh == 0)
extent_thresh = SZ_256K;
/*
* If we were not given a file, allocate a readahead context. As
* readahead is just an optimization, defrag will work without it so
* we don't error out.
*/
if (!file) {
ra = kzalloc(sizeof(*ra), GFP_KERNEL);
if (ra)
file_ra_state_init(ra, inode->i_mapping);
} else {
ra = &file->f_ra;
}
pages = kmalloc_array(max_cluster, sizeof(struct page *), GFP_KERNEL);
if (!pages) {
ret = -ENOMEM;
goto out_ra;
}
/* find the last page to defrag */
if (range->start + range->len > range->start) {
last_index = min_t(u64, isize - 1,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
range->start + range->len - 1) >> PAGE_SHIFT;
} else {
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
last_index = (isize - 1) >> PAGE_SHIFT;
}
if (newer_than) {
ret = find_new_extents(root, inode, newer_than,
&newer_off, SZ_64K);
if (!ret) {
range->start = newer_off;
/*
* we always align our defrag to help keep
* the extents in the file evenly spaced
*/
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
i = (newer_off & new_align) >> PAGE_SHIFT;
} else
goto out_ra;
} else {
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
i = range->start >> PAGE_SHIFT;
}
if (!max_to_defrag)
max_to_defrag = last_index - i + 1;
/*
* make writeback starts from i, so the defrag range can be
* written sequentially.
*/
if (i < inode->i_mapping->writeback_index)
inode->i_mapping->writeback_index = i;
while (i <= last_index && defrag_count < max_to_defrag &&
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
(i < DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE))) {
/*
* make sure we stop running if someone unmounts
* the FS
*/
Rename superblock flags (MS_xyz -> SB_xyz) This is a pure automated search-and-replace of the internal kernel superblock flags. The s_flags are now called SB_*, with the names and the values for the moment mirroring the MS_* flags that they're equivalent to. Note how the MS_xyz flags are the ones passed to the mount system call, while the SB_xyz flags are what we then use in sb->s_flags. The script to do this was: # places to look in; re security/*: it generally should *not* be # touched (that stuff parses mount(2) arguments directly), but # there are two places where we really deal with superblock flags. FILES="drivers/mtd drivers/staging/lustre fs ipc mm \ include/linux/fs.h include/uapi/linux/bfs_fs.h \ security/apparmor/apparmorfs.c security/apparmor/include/lib.h" # the list of MS_... constants SYMS="RDONLY NOSUID NODEV NOEXEC SYNCHRONOUS REMOUNT MANDLOCK \ DIRSYNC NOATIME NODIRATIME BIND MOVE REC VERBOSE SILENT \ POSIXACL UNBINDABLE PRIVATE SLAVE SHARED RELATIME KERNMOUNT \ I_VERSION STRICTATIME LAZYTIME SUBMOUNT NOREMOTELOCK NOSEC BORN \ ACTIVE NOUSER" SED_PROG= for i in $SYMS; do SED_PROG="$SED_PROG -e s/MS_$i/SB_$i/g"; done # we want files that contain at least one of MS_..., # with fs/namespace.c and fs/pnode.c excluded. L=$(for i in $SYMS; do git grep -w -l MS_$i $FILES; done| sort|uniq|grep -v '^fs/namespace.c'|grep -v '^fs/pnode.c') for f in $L; do sed -i $f $SED_PROG; done Requested-by: Al Viro <viro@zeniv.linux.org.uk> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-11-28 05:05:09 +08:00
if (!(inode->i_sb->s_flags & SB_ACTIVE))
break;
if (btrfs_defrag_cancelled(fs_info)) {
btrfs_debug(fs_info, "defrag_file cancelled");
ret = -EAGAIN;
break;
}
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
if (!should_defrag_range(inode, (u64)i << PAGE_SHIFT,
extent_thresh, &last_len, &skip,
&defrag_end, do_compress)){
unsigned long next;
/*
* the should_defrag function tells us how much to skip
* bump our counter by the suggested amount
*/
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
next = DIV_ROUND_UP(skip, PAGE_SIZE);
i = max(i + 1, next);
continue;
}
if (!newer_than) {
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
cluster = (PAGE_ALIGN(defrag_end) >>
PAGE_SHIFT) - i;
cluster = min(cluster, max_cluster);
} else {
cluster = max_cluster;
}
if (i + cluster > ra_index) {
ra_index = max(i, ra_index);
if (ra)
page_cache_sync_readahead(inode->i_mapping, ra,
file, ra_index, cluster);
ra_index += cluster;
}
inode_lock(inode);
if (IS_SWAPFILE(inode)) {
ret = -ETXTBSY;
} else {
if (do_compress)
BTRFS_I(inode)->defrag_compress = compress_type;
ret = cluster_pages_for_defrag(inode, pages, i, cluster);
}
if (ret < 0) {
inode_unlock(inode);
goto out_ra;
}
defrag_count += ret;
balance_dirty_pages_ratelimited(inode->i_mapping);
inode_unlock(inode);
if (newer_than) {
if (newer_off == (u64)-1)
break;
if (ret > 0)
i += ret;
newer_off = max(newer_off + 1,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
(u64)i << PAGE_SHIFT);
ret = find_new_extents(root, inode, newer_than,
&newer_off, SZ_64K);
if (!ret) {
range->start = newer_off;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
i = (newer_off & new_align) >> PAGE_SHIFT;
} else {
break;
}
} else {
if (ret > 0) {
i += ret;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
last_len += ret << PAGE_SHIFT;
} else {
i++;
last_len = 0;
}
}
}
if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) {
filemap_flush(inode->i_mapping);
if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
&BTRFS_I(inode)->runtime_flags))
filemap_flush(inode->i_mapping);
}
if (range->compress_type == BTRFS_COMPRESS_LZO) {
btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
btrfs: Add zstd support Add zstd compression and decompression support to BtrFS. zstd at its fastest level compresses almost as well as zlib, while offering much faster compression and decompression, approaching lzo speeds. I benchmarked btrfs with zstd compression against no compression, lzo compression, and zlib compression. I benchmarked two scenarios. Copying a set of files to btrfs, and then reading the files. Copying a tarball to btrfs, extracting it to btrfs, and then reading the extracted files. After every operation, I call `sync` and include the sync time. Between every pair of operations I unmount and remount the filesystem to avoid caching. The benchmark files can be found in the upstream zstd source repository under `contrib/linux-kernel/{btrfs-benchmark.sh,btrfs-extract-benchmark.sh}` [1] [2]. I ran the benchmarks on a Ubuntu 14.04 VM with 2 cores and 4 GiB of RAM. The VM is running on a MacBook Pro with a 3.1 GHz Intel Core i7 processor, 16 GB of RAM, and a SSD. The first compression benchmark is copying 10 copies of the unzipped Silesia corpus [3] into a BtrFS filesystem mounted with `-o compress-force=Method`. The decompression benchmark times how long it takes to `tar` all 10 copies into `/dev/null`. The compression ratio is measured by comparing the output of `df` and `du`. See the benchmark file [1] for details. I benchmarked multiple zstd compression levels, although the patch uses zstd level 1. | Method | Ratio | Compression MB/s | Decompression speed | |---------|-------|------------------|---------------------| | None | 0.99 | 504 | 686 | | lzo | 1.66 | 398 | 442 | | zlib | 2.58 | 65 | 241 | | zstd 1 | 2.57 | 260 | 383 | | zstd 3 | 2.71 | 174 | 408 | | zstd 6 | 2.87 | 70 | 398 | | zstd 9 | 2.92 | 43 | 406 | | zstd 12 | 2.93 | 21 | 408 | | zstd 15 | 3.01 | 11 | 354 | The next benchmark first copies `linux-4.11.6.tar` [4] to btrfs. Then it measures the compression ratio, extracts the tar, and deletes the tar. Then it measures the compression ratio again, and `tar`s the extracted files into `/dev/null`. See the benchmark file [2] for details. | Method | Tar Ratio | Extract Ratio | Copy (s) | Extract (s)| Read (s) | |--------|-----------|---------------|----------|------------|----------| | None | 0.97 | 0.78 | 0.981 | 5.501 | 8.807 | | lzo | 2.06 | 1.38 | 1.631 | 8.458 | 8.585 | | zlib | 3.40 | 1.86 | 7.750 | 21.544 | 11.744 | | zstd 1 | 3.57 | 1.85 | 2.579 | 11.479 | 9.389 | [1] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/btrfs-benchmark.sh [2] https://github.com/facebook/zstd/blob/dev/contrib/linux-kernel/btrfs-extract-benchmark.sh [3] http://sun.aei.polsl.pl/~sdeor/index.php?page=silesia [4] https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-4.11.6.tar.xz zstd source repository: https://github.com/facebook/zstd Signed-off-by: Nick Terrell <terrelln@fb.com> Signed-off-by: Chris Mason <clm@fb.com>
2017-08-10 10:39:02 +08:00
} else if (range->compress_type == BTRFS_COMPRESS_ZSTD) {
btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
}
ret = defrag_count;
out_ra:
if (do_compress) {
inode_lock(inode);
BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
inode_unlock(inode);
}
if (!file)
kfree(ra);
kfree(pages);
return ret;
}
static noinline int btrfs_ioctl_resize(struct file *file,
void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
u64 new_size;
u64 old_size;
u64 devid = 1;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_vol_args *vol_args;
struct btrfs_trans_handle *trans;
struct btrfs_device *device = NULL;
char *sizestr;
char *retptr;
char *devstr = NULL;
int ret = 0;
int mod = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
ret = mnt_want_write_file(file);
if (ret)
return ret;
if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
mnt_drop_write_file(file);
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
}
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
goto out;
}
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
sizestr = vol_args->name;
devstr = strchr(sizestr, ':');
if (devstr) {
sizestr = devstr + 1;
*devstr = '\0';
devstr = vol_args->name;
ret = kstrtoull(devstr, 10, &devid);
if (ret)
goto out_free;
if (!devid) {
ret = -EINVAL;
goto out_free;
}
btrfs_info(fs_info, "resizing devid %llu", devid);
}
device = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!device) {
btrfs_info(fs_info, "resizer unable to find device %llu",
devid);
ret = -ENODEV;
goto out_free;
}
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
btrfs_info(fs_info,
"resizer unable to apply on readonly device %llu",
devid);
ret = -EPERM;
goto out_free;
}
if (!strcmp(sizestr, "max"))
new_size = device->bdev->bd_inode->i_size;
else {
if (sizestr[0] == '-') {
mod = -1;
sizestr++;
} else if (sizestr[0] == '+') {
mod = 1;
sizestr++;
}
new_size = memparse(sizestr, &retptr);
if (*retptr != '\0' || new_size == 0) {
ret = -EINVAL;
goto out_free;
}
}
if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = -EPERM;
goto out_free;
}
old_size = btrfs_device_get_total_bytes(device);
if (mod < 0) {
if (new_size > old_size) {
ret = -EINVAL;
goto out_free;
}
new_size = old_size - new_size;
} else if (mod > 0) {
if (new_size > ULLONG_MAX - old_size) {
ret = -ERANGE;
goto out_free;
}
new_size = old_size + new_size;
}
if (new_size < SZ_256M) {
ret = -EINVAL;
goto out_free;
}
if (new_size > device->bdev->bd_inode->i_size) {
ret = -EFBIG;
goto out_free;
}
new_size = round_down(new_size, fs_info->sectorsize);
btrfs_info_in_rcu(fs_info, "new size for %s is %llu",
rcu_str_deref(device->name), new_size);
if (new_size > old_size) {
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_free;
}
ret = btrfs_grow_device(trans, device, new_size);
btrfs_commit_transaction(trans);
} else if (new_size < old_size) {
ret = btrfs_shrink_device(device, new_size);
} /* equal, nothing need to do */
out_free:
kfree(vol_args);
out:
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
mnt_drop_write_file(file);
return ret;
}
static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
const char *name, unsigned long fd, int subvol,
u64 *transid, bool readonly,
struct btrfs_qgroup_inherit *inherit)
{
int namelen;
int ret = 0;
if (!S_ISDIR(file_inode(file)->i_mode))
return -ENOTDIR;
ret = mnt_want_write_file(file);
if (ret)
goto out;
namelen = strlen(name);
if (strchr(name, '/')) {
ret = -EINVAL;
goto out_drop_write;
}
if (name[0] == '.' &&
(namelen == 1 || (name[1] == '.' && namelen == 2))) {
ret = -EEXIST;
goto out_drop_write;
}
if (subvol) {
ret = btrfs_mksubvol(&file->f_path, name, namelen,
NULL, transid, readonly, inherit);
} else {
struct fd src = fdget(fd);
struct inode *src_inode;
if (!src.file) {
ret = -EINVAL;
goto out_drop_write;
}
src_inode = file_inode(src.file);
if (src_inode->i_sb != file_inode(file)->i_sb) {
btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
"Snapshot src from another FS");
ret = -EXDEV;
} else if (!inode_owner_or_capable(src_inode)) {
/*
* Subvolume creation is not restricted, but snapshots
* are limited to own subvolumes only
*/
ret = -EPERM;
} else {
ret = btrfs_mksubvol(&file->f_path, name, namelen,
BTRFS_I(src_inode)->root,
transid, readonly, inherit);
}
fdput(src);
}
out_drop_write:
mnt_drop_write_file(file);
out:
return ret;
}
static noinline int btrfs_ioctl_snap_create(struct file *file,
void __user *arg, int subvol)
{
struct btrfs_ioctl_vol_args *vol_args;
int ret;
if (!S_ISDIR(file_inode(file)->i_mode))
return -ENOTDIR;
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
vol_args->fd, subvol,
NULL, false, NULL);
kfree(vol_args);
return ret;
}
static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
void __user *arg, int subvol)
{
struct btrfs_ioctl_vol_args_v2 *vol_args;
int ret;
u64 transid = 0;
u64 *ptr = NULL;
bool readonly = false;
struct btrfs_qgroup_inherit *inherit = NULL;
if (!S_ISDIR(file_inode(file)->i_mode))
return -ENOTDIR;
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
if (vol_args->flags &
~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
BTRFS_SUBVOL_QGROUP_INHERIT)) {
ret = -EOPNOTSUPP;
goto free_args;
}
if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) {
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
btrfs_warn(fs_info,
"SNAP_CREATE_V2 ioctl with CREATE_ASYNC is deprecated and will be removed in kernel 5.7");
ptr = &transid;
}
if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
readonly = true;
if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
if (vol_args->size > PAGE_SIZE) {
ret = -EINVAL;
goto free_args;
}
inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
if (IS_ERR(inherit)) {
ret = PTR_ERR(inherit);
goto free_args;
}
}
ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
vol_args->fd, subvol, ptr,
readonly, inherit);
if (ret)
goto free_inherit;
if (ptr && copy_to_user(arg +
offsetof(struct btrfs_ioctl_vol_args_v2,
transid),
ptr, sizeof(*ptr)))
ret = -EFAULT;
free_inherit:
kfree(inherit);
free_args:
kfree(vol_args);
return ret;
}
static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret = 0;
u64 flags = 0;
if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID)
return -EINVAL;
down_read(&fs_info->subvol_sem);
if (btrfs_root_readonly(root))
flags |= BTRFS_SUBVOL_RDONLY;
up_read(&fs_info->subvol_sem);
if (copy_to_user(arg, &flags, sizeof(flags)))
ret = -EFAULT;
return ret;
}
static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
u64 root_flags;
u64 flags;
int ret = 0;
if (!inode_owner_or_capable(inode))
return -EPERM;
ret = mnt_want_write_file(file);
if (ret)
goto out;
if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
ret = -EINVAL;
goto out_drop_write;
}
if (copy_from_user(&flags, arg, sizeof(flags))) {
ret = -EFAULT;
goto out_drop_write;
}
if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
ret = -EINVAL;
goto out_drop_write;
}
if (flags & ~BTRFS_SUBVOL_RDONLY) {
ret = -EOPNOTSUPP;
goto out_drop_write;
}
down_write(&fs_info->subvol_sem);
/* nothing to do */
if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
goto out_drop_sem;
root_flags = btrfs_root_flags(&root->root_item);
if (flags & BTRFS_SUBVOL_RDONLY) {
btrfs_set_root_flags(&root->root_item,
root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
} else {
/*
* Block RO -> RW transition if this subvolume is involved in
* send
*/
spin_lock(&root->root_item_lock);
if (root->send_in_progress == 0) {
btrfs_set_root_flags(&root->root_item,
root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
spin_unlock(&root->root_item_lock);
} else {
spin_unlock(&root->root_item_lock);
btrfs_warn(fs_info,
"Attempt to set subvolume %llu read-write during send",
root->root_key.objectid);
ret = -EPERM;
goto out_drop_sem;
}
}
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_reset;
}
ret = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key, &root->root_item);
if (ret < 0) {
btrfs_end_transaction(trans);
goto out_reset;
}
ret = btrfs_commit_transaction(trans);
out_reset:
if (ret)
btrfs_set_root_flags(&root->root_item, root_flags);
out_drop_sem:
up_write(&fs_info->subvol_sem);
out_drop_write:
mnt_drop_write_file(file);
out:
return ret;
}
static noinline int key_in_sk(struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk)
{
struct btrfs_key test;
int ret;
test.objectid = sk->min_objectid;
test.type = sk->min_type;
test.offset = sk->min_offset;
ret = btrfs_comp_cpu_keys(key, &test);
if (ret < 0)
return 0;
test.objectid = sk->max_objectid;
test.type = sk->max_type;
test.offset = sk->max_offset;
ret = btrfs_comp_cpu_keys(key, &test);
if (ret > 0)
return 0;
return 1;
}
static noinline int copy_to_sk(struct btrfs_path *path,
struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk,
size_t *buf_size,
char __user *ubuf,
unsigned long *sk_offset,
int *num_found)
{
u64 found_transid;
struct extent_buffer *leaf;
struct btrfs_ioctl_search_header sh;
struct btrfs_key test;
unsigned long item_off;
unsigned long item_len;
int nritems;
int i;
int slot;
int ret = 0;
leaf = path->nodes[0];
slot = path->slots[0];
nritems = btrfs_header_nritems(leaf);
if (btrfs_header_generation(leaf) > sk->max_transid) {
i = nritems;
goto advance_key;
}
found_transid = btrfs_header_generation(leaf);
for (i = slot; i < nritems; i++) {
item_off = btrfs_item_ptr_offset(leaf, i);
item_len = btrfs_item_size_nr(leaf, i);
btrfs_item_key_to_cpu(leaf, key, i);
if (!key_in_sk(key, sk))
continue;
if (sizeof(sh) + item_len > *buf_size) {
if (*num_found) {
ret = 1;
goto out;
}
/*
* return one empty item back for v1, which does not
* handle -EOVERFLOW
*/
*buf_size = sizeof(sh) + item_len;
item_len = 0;
ret = -EOVERFLOW;
}
if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
ret = 1;
goto out;
}
sh.objectid = key->objectid;
sh.offset = key->offset;
sh.type = key->type;
sh.len = item_len;
sh.transid = found_transid;
/* copy search result header */
if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
ret = -EFAULT;
goto out;
}
*sk_offset += sizeof(sh);
if (item_len) {
char __user *up = ubuf + *sk_offset;
/* copy the item */
if (read_extent_buffer_to_user(leaf, up,
item_off, item_len)) {
ret = -EFAULT;
goto out;
}
*sk_offset += item_len;
}
(*num_found)++;
if (ret) /* -EOVERFLOW from above */
goto out;
if (*num_found >= sk->nr_items) {
ret = 1;
goto out;
}
}
advance_key:
ret = 0;
test.objectid = sk->max_objectid;
test.type = sk->max_type;
test.offset = sk->max_offset;
if (btrfs_comp_cpu_keys(key, &test) >= 0)
ret = 1;
else if (key->offset < (u64)-1)
key->offset++;
else if (key->type < (u8)-1) {
key->offset = 0;
key->type++;
} else if (key->objectid < (u64)-1) {
key->offset = 0;
key->type = 0;
key->objectid++;
} else
ret = 1;
out:
/*
* 0: all items from this leaf copied, continue with next
* 1: * more items can be copied, but unused buffer is too small
* * all items were found
* Either way, it will stops the loop which iterates to the next
* leaf
* -EOVERFLOW: item was to large for buffer
* -EFAULT: could not copy extent buffer back to userspace
*/
return ret;
}
static noinline int search_ioctl(struct inode *inode,
struct btrfs_ioctl_search_key *sk,
size_t *buf_size,
char __user *ubuf)
{
struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
struct btrfs_root *root;
struct btrfs_key key;
struct btrfs_path *path;
int ret;
int num_found = 0;
unsigned long sk_offset = 0;
if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
*buf_size = sizeof(struct btrfs_ioctl_search_header);
return -EOVERFLOW;
}
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
if (sk->tree_id == 0) {
/* search the root of the inode that was passed */
root = BTRFS_I(inode)->root;
} else {
key.objectid = sk->tree_id;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
btrfs_free_path(path);
return PTR_ERR(root);
}
}
key.objectid = sk->min_objectid;
key.type = sk->min_type;
key.offset = sk->min_offset;
while (1) {
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
if (ret != 0) {
if (ret > 0)
ret = 0;
goto err;
}
ret = copy_to_sk(path, &key, sk, buf_size, ubuf,
&sk_offset, &num_found);
btrfs_release_path(path);
if (ret)
break;
}
if (ret > 0)
ret = 0;
err:
sk->nr_items = num_found;
btrfs_free_path(path);
return ret;
}
static noinline int btrfs_ioctl_tree_search(struct file *file,
void __user *argp)
{
struct btrfs_ioctl_search_args __user *uargs;
struct btrfs_ioctl_search_key sk;
struct inode *inode;
int ret;
size_t buf_size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
uargs = (struct btrfs_ioctl_search_args __user *)argp;
if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
return -EFAULT;
buf_size = sizeof(uargs->buf);
inode = file_inode(file);
ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
/*
* In the origin implementation an overflow is handled by returning a
* search header with a len of zero, so reset ret.
*/
if (ret == -EOVERFLOW)
ret = 0;
if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
ret = -EFAULT;
return ret;
}
static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
void __user *argp)
{
struct btrfs_ioctl_search_args_v2 __user *uarg;
struct btrfs_ioctl_search_args_v2 args;
struct inode *inode;
int ret;
size_t buf_size;
const size_t buf_limit = SZ_16M;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
/* copy search header and buffer size */
uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
if (copy_from_user(&args, uarg, sizeof(args)))
return -EFAULT;
buf_size = args.buf_size;
/* limit result size to 16MB */
if (buf_size > buf_limit)
buf_size = buf_limit;
inode = file_inode(file);
ret = search_ioctl(inode, &args.key, &buf_size,
(char __user *)(&uarg->buf[0]));
if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
ret = -EFAULT;
else if (ret == -EOVERFLOW &&
copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
ret = -EFAULT;
return ret;
}
/*
* Search INODE_REFs to identify path name of 'dirid' directory
* in a 'tree_id' tree. and sets path name to 'name'.
*/
static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
u64 tree_id, u64 dirid, char *name)
{
struct btrfs_root *root;
struct btrfs_key key;
char *ptr;
int ret = -1;
int slot;
int len;
int total_len = 0;
struct btrfs_inode_ref *iref;
struct extent_buffer *l;
struct btrfs_path *path;
if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
name[0]='\0';
return 0;
}
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1];
key.objectid = tree_id;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(info, &key);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
goto out;
}
key.objectid = dirid;
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
else if (ret > 0) {
ret = btrfs_previous_item(root, path, dirid,
BTRFS_INODE_REF_KEY);
if (ret < 0)
goto out;
else if (ret > 0) {
ret = -ENOENT;
goto out;
}
}
l = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(l, &key, slot);
iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
len = btrfs_inode_ref_name_len(l, iref);
ptr -= len + 1;
total_len += len + 1;
if (ptr < name) {
ret = -ENAMETOOLONG;
goto out;
}
*(ptr + len) = '/';
read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
break;
btrfs_release_path(path);
key.objectid = key.offset;
key.offset = (u64)-1;
dirid = key.objectid;
}
memmove(name, ptr, total_len);
name[total_len] = '\0';
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
static int btrfs_search_path_in_tree_user(struct inode *inode,
struct btrfs_ioctl_ino_lookup_user_args *args)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct super_block *sb = inode->i_sb;
struct btrfs_key upper_limit = BTRFS_I(inode)->location;
u64 treeid = BTRFS_I(inode)->root->root_key.objectid;
u64 dirid = args->dirid;
unsigned long item_off;
unsigned long item_len;
struct btrfs_inode_ref *iref;
struct btrfs_root_ref *rref;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key, key2;
struct extent_buffer *leaf;
struct inode *temp_inode;
char *ptr;
int slot;
int len;
int total_len = 0;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
/*
* If the bottom subvolume does not exist directly under upper_limit,
* construct the path in from the bottom up.
*/
if (dirid != upper_limit.objectid) {
ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
key.objectid = treeid;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(fs_info, &key);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
goto out;
}
key.objectid = dirid;
key.type = BTRFS_INODE_REF_KEY;
key.offset = (u64)-1;
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
goto out;
} else if (ret > 0) {
ret = btrfs_previous_item(root, path, dirid,
BTRFS_INODE_REF_KEY);
if (ret < 0) {
goto out;
} else if (ret > 0) {
ret = -ENOENT;
goto out;
}
}
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
len = btrfs_inode_ref_name_len(leaf, iref);
ptr -= len + 1;
total_len += len + 1;
if (ptr < args->path) {
ret = -ENAMETOOLONG;
goto out;
}
*(ptr + len) = '/';
read_extent_buffer(leaf, ptr,
(unsigned long)(iref + 1), len);
/* Check the read+exec permission of this directory */
ret = btrfs_previous_item(root, path, dirid,
BTRFS_INODE_ITEM_KEY);
if (ret < 0) {
goto out;
} else if (ret > 0) {
ret = -ENOENT;
goto out;
}
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key2, slot);
if (key2.objectid != dirid) {
ret = -ENOENT;
goto out;
}
temp_inode = btrfs_iget(sb, &key2, root);
if (IS_ERR(temp_inode)) {
ret = PTR_ERR(temp_inode);
goto out;
}
ret = inode_permission(temp_inode, MAY_READ | MAY_EXEC);
iput(temp_inode);
if (ret) {
ret = -EACCES;
goto out;
}
if (key.offset == upper_limit.objectid)
break;
if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
ret = -EACCES;
goto out;
}
btrfs_release_path(path);
key.objectid = key.offset;
key.offset = (u64)-1;
dirid = key.objectid;
}
memmove(args->path, ptr, total_len);
args->path[total_len] = '\0';
btrfs_release_path(path);
}
/* Get the bottom subvolume's name from ROOT_REF */
root = fs_info->tree_root;
key.objectid = treeid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = args->treeid;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
goto out;
} else if (ret > 0) {
ret = -ENOENT;
goto out;
}
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
item_off = btrfs_item_ptr_offset(leaf, slot);
item_len = btrfs_item_size_nr(leaf, slot);
/* Check if dirid in ROOT_REF corresponds to passed dirid */
rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
ret = -EINVAL;
goto out;
}
/* Copy subvolume's name */
item_off += sizeof(struct btrfs_root_ref);
item_len -= sizeof(struct btrfs_root_ref);
read_extent_buffer(leaf, args->name, item_off, item_len);
args->name[item_len] = 0;
out:
btrfs_free_path(path);
return ret;
}
static noinline int btrfs_ioctl_ino_lookup(struct file *file,
void __user *argp)
{
struct btrfs_ioctl_ino_lookup_args *args;
struct inode *inode;
int ret = 0;
args = memdup_user(argp, sizeof(*args));
if (IS_ERR(args))
return PTR_ERR(args);
inode = file_inode(file);
/*
* Unprivileged query to obtain the containing subvolume root id. The
* path is reset so it's consistent with btrfs_search_path_in_tree.
*/
if (args->treeid == 0)
args->treeid = BTRFS_I(inode)->root->root_key.objectid;
if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
args->name[0] = 0;
goto out;
}
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
goto out;
}
ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
args->treeid, args->objectid,
args->name);
out:
if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
ret = -EFAULT;
kfree(args);
return ret;
}
/*
* Version of ino_lookup ioctl (unprivileged)
*
* The main differences from ino_lookup ioctl are:
*
* 1. Read + Exec permission will be checked using inode_permission() during
* path construction. -EACCES will be returned in case of failure.
* 2. Path construction will be stopped at the inode number which corresponds
* to the fd with which this ioctl is called. If constructed path does not
* exist under fd's inode, -EACCES will be returned.
* 3. The name of bottom subvolume is also searched and filled.
*/
static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
{
struct btrfs_ioctl_ino_lookup_user_args *args;
struct inode *inode;
int ret;
args = memdup_user(argp, sizeof(*args));
if (IS_ERR(args))
return PTR_ERR(args);
inode = file_inode(file);
if (args->dirid == BTRFS_FIRST_FREE_OBJECTID &&
BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) {
/*
* The subvolume does not exist under fd with which this is
* called
*/
kfree(args);
return -EACCES;
}
ret = btrfs_search_path_in_tree_user(inode, args);
if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
ret = -EFAULT;
kfree(args);
return ret;
}
/* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
{
struct btrfs_ioctl_get_subvol_info_args *subvol_info;
struct btrfs_fs_info *fs_info;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_root_item *root_item;
struct btrfs_root_ref *rref;
struct extent_buffer *leaf;
unsigned long item_off;
unsigned long item_len;
struct inode *inode;
int slot;
int ret = 0;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
if (!subvol_info) {
btrfs_free_path(path);
return -ENOMEM;
}
inode = file_inode(file);
fs_info = BTRFS_I(inode)->root->fs_info;
/* Get root_item of inode's subvolume */
key.objectid = BTRFS_I(inode)->root->root_key.objectid;
key.type = BTRFS_ROOT_ITEM_KEY;
key.offset = (u64)-1;
root = btrfs_read_fs_root_no_name(fs_info, &key);
if (IS_ERR(root)) {
ret = PTR_ERR(root);
goto out;
}
root_item = &root->root_item;
subvol_info->treeid = key.objectid;
subvol_info->generation = btrfs_root_generation(root_item);
subvol_info->flags = btrfs_root_flags(root_item);
memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE);
memcpy(subvol_info->parent_uuid, root_item->parent_uuid,
BTRFS_UUID_SIZE);
memcpy(subvol_info->received_uuid, root_item->received_uuid,
BTRFS_UUID_SIZE);
subvol_info->ctransid = btrfs_root_ctransid(root_item);
subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime);
subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime);
subvol_info->otransid = btrfs_root_otransid(root_item);
subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime);
subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime);
subvol_info->stransid = btrfs_root_stransid(root_item);
subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime);
subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime);
subvol_info->rtransid = btrfs_root_rtransid(root_item);
subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime);
subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime);
if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
/* Search root tree for ROOT_BACKREF of this subvolume */
root = fs_info->tree_root;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
goto out;
} else if (path->slots[0] >=
btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0) {
goto out;
} else if (ret > 0) {
ret = -EUCLEAN;
goto out;
}
}
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid == subvol_info->treeid &&
key.type == BTRFS_ROOT_BACKREF_KEY) {
subvol_info->parent_id = key.offset;
rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref);
item_off = btrfs_item_ptr_offset(leaf, slot)
+ sizeof(struct btrfs_root_ref);
item_len = btrfs_item_size_nr(leaf, slot)
- sizeof(struct btrfs_root_ref);
read_extent_buffer(leaf, subvol_info->name,
item_off, item_len);
} else {
ret = -ENOENT;
goto out;
}
}
if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
ret = -EFAULT;
out:
btrfs_free_path(path);
kzfree(subvol_info);
return ret;
}
/*
* Return ROOT_REF information of the subvolume containing this inode
* except the subvolume name.
*/
static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
{
struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
struct btrfs_root_ref *rref;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *leaf;
struct inode *inode;
u64 objectid;
int slot;
int ret;
u8 found;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
rootrefs = memdup_user(argp, sizeof(*rootrefs));
if (IS_ERR(rootrefs)) {
btrfs_free_path(path);
return PTR_ERR(rootrefs);
}
inode = file_inode(file);
root = BTRFS_I(inode)->root->fs_info->tree_root;
objectid = BTRFS_I(inode)->root->root_key.objectid;
key.objectid = objectid;
key.type = BTRFS_ROOT_REF_KEY;
key.offset = rootrefs->min_treeid;
found = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
goto out;
} else if (path->slots[0] >=
btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0) {
goto out;
} else if (ret > 0) {
ret = -EUCLEAN;
goto out;
}
}
while (1) {
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) {
ret = 0;
goto out;
}
if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
ret = -EOVERFLOW;
goto out;
}
rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
rootrefs->rootref[found].treeid = key.offset;
rootrefs->rootref[found].dirid =
btrfs_root_ref_dirid(leaf, rref);
found++;
ret = btrfs_next_item(root, path);
if (ret < 0) {
goto out;
} else if (ret > 0) {
ret = -EUCLEAN;
goto out;
}
}
out:
if (!ret || ret == -EOVERFLOW) {
rootrefs->num_items = found;
/* update min_treeid for next search */
if (found)
rootrefs->min_treeid =
rootrefs->rootref[found - 1].treeid + 1;
if (copy_to_user(argp, rootrefs, sizeof(*rootrefs)))
ret = -EFAULT;
}
kfree(rootrefs);
btrfs_free_path(path);
return ret;
}
static noinline int btrfs_ioctl_snap_destroy(struct file *file,
void __user *arg)
{
struct dentry *parent = file->f_path.dentry;
struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
struct dentry *dentry;
struct inode *dir = d_inode(parent);
struct inode *inode;
struct btrfs_root *root = BTRFS_I(dir)->root;
struct btrfs_root *dest = NULL;
struct btrfs_ioctl_vol_args *vol_args;
int namelen;
int err = 0;
if (!S_ISDIR(dir->i_mode))
return -ENOTDIR;
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args))
return PTR_ERR(vol_args);
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
namelen = strlen(vol_args->name);
if (strchr(vol_args->name, '/') ||
strncmp(vol_args->name, "..", namelen) == 0) {
err = -EINVAL;
goto out;
}
err = mnt_want_write_file(file);
if (err)
goto out;
err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
if (err == -EINTR)
goto out_drop_write;
dentry = lookup_one_len(vol_args->name, parent, namelen);
if (IS_ERR(dentry)) {
err = PTR_ERR(dentry);
goto out_unlock_dir;
}
if (d_really_is_negative(dentry)) {
err = -ENOENT;
goto out_dput;
}
inode = d_inode(dentry);
dest = BTRFS_I(inode)->root;
if (!capable(CAP_SYS_ADMIN)) {
/*
* Regular user. Only allow this with a special mount
* option, when the user has write+exec access to the
* subvol root, and when rmdir(2) would have been
* allowed.
*
* Note that this is _not_ check that the subvol is
* empty or doesn't contain data that we wouldn't
* otherwise be able to delete.
*
* Users who want to delete empty subvols should try
* rmdir(2).
*/
err = -EPERM;
if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
goto out_dput;
/*
* Do not allow deletion if the parent dir is the same
* as the dir to be deleted. That means the ioctl
* must be called on the dentry referencing the root
* of the subvol, not a random directory contained
* within it.
*/
err = -EINVAL;
if (root == dest)
goto out_dput;
err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
if (err)
goto out_dput;
}
/* check if subvolume may be deleted by a user */
err = btrfs_may_delete(dir, dentry, 1);
if (err)
goto out_dput;
if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
err = -EINVAL;
goto out_dput;
}
inode_lock(inode);
err = btrfs_delete_subvolume(dir, dentry);
inode_unlock(inode);
if (!err) {
fsnotify_rmdir(dir, dentry);
d_delete(dentry);
}
out_dput:
dput(dentry);
out_unlock_dir:
inode_unlock(dir);
out_drop_write:
mnt_drop_write_file(file);
out:
kfree(vol_args);
return err;
}
static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
{
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_defrag_range_args *range;
int ret;
ret = mnt_want_write_file(file);
if (ret)
return ret;
if (btrfs_root_readonly(root)) {
ret = -EROFS;
goto out;
}
switch (inode->i_mode & S_IFMT) {
case S_IFDIR:
if (!capable(CAP_SYS_ADMIN)) {
ret = -EPERM;
goto out;
}
ret = btrfs_defrag_root(root);
break;
case S_IFREG:
/*
* Note that this does not check the file descriptor for write
* access. This prevents defragmenting executables that are
* running and allows defrag on files open in read-only mode.
*/
if (!capable(CAP_SYS_ADMIN) &&
inode_permission(inode, MAY_WRITE)) {
ret = -EPERM;
goto out;
}
range = kzalloc(sizeof(*range), GFP_KERNEL);
if (!range) {
ret = -ENOMEM;
goto out;
}
if (argp) {
if (copy_from_user(range, argp,
sizeof(*range))) {
ret = -EFAULT;
kfree(range);
goto out;
}
/* compression requires us to start the IO */
if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
range->extent_thresh = (u32)-1;
}
} else {
/* the rest are all set to zero by kzalloc */
range->len = (u64)-1;
}
ret = btrfs_defrag_file(file_inode(file), file,
range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
kfree(range);
break;
default:
ret = -EINVAL;
}
out:
mnt_drop_write_file(file);
return ret;
}
static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
{
struct btrfs_ioctl_vol_args *vol_args;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
goto out;
}
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_init_new_device(fs_info, vol_args->name);
if (!ret)
btrfs_info(fs_info, "disk added %s", vol_args->name);
kfree(vol_args);
out:
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
return ret;
}
static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args_v2 *vol_args;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
ret = mnt_want_write_file(file);
if (ret)
return ret;
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
goto err_drop;
}
/* Check for compatibility reject unknown flags */
if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) {
ret = -EOPNOTSUPP;
goto out;
}
if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out;
}
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
ret = btrfs_rm_device(fs_info, NULL, vol_args->devid);
} else {
vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
}
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
if (!ret) {
if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
btrfs_info(fs_info, "device deleted: id %llu",
vol_args->devid);
else
btrfs_info(fs_info, "device deleted: %s",
vol_args->name);
}
out:
kfree(vol_args);
err_drop:
mnt_drop_write_file(file);
return ret;
}
static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_vol_args *vol_args;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
ret = mnt_want_write_file(file);
if (ret)
return ret;
if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out_drop_write;
}
vol_args = memdup_user(arg, sizeof(*vol_args));
if (IS_ERR(vol_args)) {
ret = PTR_ERR(vol_args);
goto out;
}
vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
ret = btrfs_rm_device(fs_info, vol_args->name, 0);
if (!ret)
btrfs_info(fs_info, "disk deleted %s", vol_args->name);
kfree(vol_args);
out:
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
out_drop_write:
mnt_drop_write_file(file);
return ret;
}
static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_fs_info_args *fi_args;
struct btrfs_device *device;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
int ret = 0;
fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
if (!fi_args)
return -ENOMEM;
rcu_read_lock();
fi_args->num_devices = fs_devices->num_devices;
list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
if (device->devid > fi_args->max_id)
fi_args->max_id = device->devid;
}
rcu_read_unlock();
memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid));
fi_args->nodesize = fs_info->nodesize;
fi_args->sectorsize = fs_info->sectorsize;
fi_args->clone_alignment = fs_info->sectorsize;
if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
ret = -EFAULT;
kfree(fi_args);
return ret;
}
static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_dev_info_args *di_args;
struct btrfs_device *dev;
int ret = 0;
char *s_uuid = NULL;
di_args = memdup_user(arg, sizeof(*di_args));
if (IS_ERR(di_args))
return PTR_ERR(di_args);
if (!btrfs_is_empty_uuid(di_args->uuid))
s_uuid = di_args->uuid;
rcu_read_lock();
dev = btrfs_find_device(fs_info->fs_devices, di_args->devid, s_uuid,
NULL, true);
if (!dev) {
ret = -ENODEV;
goto out;
}
di_args->devid = dev->devid;
di_args->bytes_used = btrfs_device_get_bytes_used(dev);
di_args->total_bytes = btrfs_device_get_total_bytes(dev);
memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
if (dev->name) {
strncpy(di_args->path, rcu_str_deref(dev->name),
sizeof(di_args->path) - 1);
di_args->path[sizeof(di_args->path) - 1] = 0;
} else {
di_args->path[0] = '\0';
}
out:
rcu_read_unlock();
if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
ret = -EFAULT;
kfree(di_args);
return ret;
}
Btrfs: fix race between reflink/dedupe and relocation The recent rework that makes btrfs' remap_file_range operation use the generic helper generic_remap_file_range_prep() introduced a race between relocation and reflinking (for both cloning and deduplication) the file extents between the source and destination inodes. This happens because we no longer lock the source range anymore, and we do not lock it anymore because we wait for direct IO writes and writeback to complete early on the code path right after locking the inodes, which guarantees no other file operations interfere with the reflinking. However there is one exception which is relocation, since it replaces the byte number of file extents items in the fs tree after locking the range the file extent items represent. This is a problem because after finding each file extent to clone in the fs tree, the reflink process copies the file extent item into a local buffer, releases the search path, inserts new file extent items in the destination range and then increments the reference count for the extent mentioned in the file extent item that it previously copied to the buffer. If right after copying the file extent item into the buffer and releasing the path the relocation process updates the file extent item to point to the new extent, the reflink process ends up creating a delayed reference to increment the reference count of the old extent, for which the relocation process already created a delayed reference to drop it. This results in failure to run delayed references because we will attempt to increment the count of a reference that was already dropped. This is illustrated by the following diagram: CPU 1 CPU 2 relocation is running btrfs_clone_files() btrfs_clone() --> finds extent item in source range point to extent at bytenr X --> copies it into a local buffer --> releases path replace_file_extents() --> successfully locks the range represented by the file extent item --> replaces disk_bytenr field in the file extent item with some other value Y --> creates delayed reference to increment reference count for extent at bytenr Y --> creates delayed reference to drop the extent at bytenr X --> starts transaction --> creates delayed reference to increment extent at bytenr X <delayed references are run, due to a transaction commit for example, and the transaction is aborted with -EIO because we attempt to increment reference count for the extent at bytenr X after we freed it> When this race is hit the running transaction ends up getting aborted with an -EIO error and a trace like the following is produced: [ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1 [ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202 [ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001 [ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000 [ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001 [ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000 [ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548 [ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000 [ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0 [ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4382.564887] Call Trace: [ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs] [ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs] [ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs] [ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs] [ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs] [ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs] [ 4382.568112] ? _raw_spin_unlock+0x24/0x30 [ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs] [ 4382.569006] create_subvol+0x3c8/0x830 [btrfs] [ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60 [ 4382.570822] ? __sb_start_write+0xd4/0x1c0 [ 4382.571262] ? mnt_want_write_file+0x24/0x50 [ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs] [ 4382.572155] ? _copy_from_user+0x66/0x90 [ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs] [ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs] [ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570 [ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0 [ 4382.574379] ? _raw_spin_unlock+0x24/0x30 [ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0 [ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0 [ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [ 4382.576020] do_vfs_ioctl+0xa2/0x6f0 [ 4382.576405] ksys_ioctl+0x70/0x80 [ 4382.576776] __x64_sys_ioctl+0x16/0x20 [ 4382.577137] do_syscall_64+0x60/0x1b0 [ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe (...) [ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7 [ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003 [ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044 [ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010 [ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050 [ 4382.580792] irq event stamp: 0 [ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null) [ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null) [ 4382.582413] ---[ end trace d3c188e3e9367382 ]--- [ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure [ 4382.624295] BTRFS info (device sdc): forced readonly Fix this by locking the source range before searching for the file extent items in the fs tree, since the relocation process will try to lock the range a file extent item represents before updating it with the new extent location. Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 19:43:07 +08:00
static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
struct inode *inode2, u64 loff2, u64 len)
{
unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
struct inode *inode2, u64 loff2, u64 len)
{
if (inode1 < inode2) {
swap(inode1, inode2);
swap(loff1, loff2);
} else if (inode1 == inode2 && loff2 < loff1) {
swap(loff1, loff2);
}
lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
struct inode *dst, u64 dst_loff)
{
int ret;
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
/*
Btrfs: fix race between reflink/dedupe and relocation The recent rework that makes btrfs' remap_file_range operation use the generic helper generic_remap_file_range_prep() introduced a race between relocation and reflinking (for both cloning and deduplication) the file extents between the source and destination inodes. This happens because we no longer lock the source range anymore, and we do not lock it anymore because we wait for direct IO writes and writeback to complete early on the code path right after locking the inodes, which guarantees no other file operations interfere with the reflinking. However there is one exception which is relocation, since it replaces the byte number of file extents items in the fs tree after locking the range the file extent items represent. This is a problem because after finding each file extent to clone in the fs tree, the reflink process copies the file extent item into a local buffer, releases the search path, inserts new file extent items in the destination range and then increments the reference count for the extent mentioned in the file extent item that it previously copied to the buffer. If right after copying the file extent item into the buffer and releasing the path the relocation process updates the file extent item to point to the new extent, the reflink process ends up creating a delayed reference to increment the reference count of the old extent, for which the relocation process already created a delayed reference to drop it. This results in failure to run delayed references because we will attempt to increment the count of a reference that was already dropped. This is illustrated by the following diagram: CPU 1 CPU 2 relocation is running btrfs_clone_files() btrfs_clone() --> finds extent item in source range point to extent at bytenr X --> copies it into a local buffer --> releases path replace_file_extents() --> successfully locks the range represented by the file extent item --> replaces disk_bytenr field in the file extent item with some other value Y --> creates delayed reference to increment reference count for extent at bytenr Y --> creates delayed reference to drop the extent at bytenr X --> starts transaction --> creates delayed reference to increment extent at bytenr X <delayed references are run, due to a transaction commit for example, and the transaction is aborted with -EIO because we attempt to increment reference count for the extent at bytenr X after we freed it> When this race is hit the running transaction ends up getting aborted with an -EIO error and a trace like the following is produced: [ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1 [ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202 [ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001 [ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000 [ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001 [ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000 [ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548 [ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000 [ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0 [ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4382.564887] Call Trace: [ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs] [ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs] [ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs] [ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs] [ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs] [ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs] [ 4382.568112] ? _raw_spin_unlock+0x24/0x30 [ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs] [ 4382.569006] create_subvol+0x3c8/0x830 [btrfs] [ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60 [ 4382.570822] ? __sb_start_write+0xd4/0x1c0 [ 4382.571262] ? mnt_want_write_file+0x24/0x50 [ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs] [ 4382.572155] ? _copy_from_user+0x66/0x90 [ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs] [ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs] [ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570 [ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0 [ 4382.574379] ? _raw_spin_unlock+0x24/0x30 [ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0 [ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0 [ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [ 4382.576020] do_vfs_ioctl+0xa2/0x6f0 [ 4382.576405] ksys_ioctl+0x70/0x80 [ 4382.576776] __x64_sys_ioctl+0x16/0x20 [ 4382.577137] do_syscall_64+0x60/0x1b0 [ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe (...) [ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7 [ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003 [ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044 [ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010 [ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050 [ 4382.580792] irq event stamp: 0 [ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null) [ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null) [ 4382.582413] ---[ end trace d3c188e3e9367382 ]--- [ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure [ 4382.624295] BTRFS info (device sdc): forced readonly Fix this by locking the source range before searching for the file extent items in the fs tree, since the relocation process will try to lock the range a file extent item represents before updating it with the new extent location. Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 19:43:07 +08:00
* Lock destination range to serialize with concurrent readpages() and
* source range to serialize with relocation.
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
*/
Btrfs: fix race between reflink/dedupe and relocation The recent rework that makes btrfs' remap_file_range operation use the generic helper generic_remap_file_range_prep() introduced a race between relocation and reflinking (for both cloning and deduplication) the file extents between the source and destination inodes. This happens because we no longer lock the source range anymore, and we do not lock it anymore because we wait for direct IO writes and writeback to complete early on the code path right after locking the inodes, which guarantees no other file operations interfere with the reflinking. However there is one exception which is relocation, since it replaces the byte number of file extents items in the fs tree after locking the range the file extent items represent. This is a problem because after finding each file extent to clone in the fs tree, the reflink process copies the file extent item into a local buffer, releases the search path, inserts new file extent items in the destination range and then increments the reference count for the extent mentioned in the file extent item that it previously copied to the buffer. If right after copying the file extent item into the buffer and releasing the path the relocation process updates the file extent item to point to the new extent, the reflink process ends up creating a delayed reference to increment the reference count of the old extent, for which the relocation process already created a delayed reference to drop it. This results in failure to run delayed references because we will attempt to increment the count of a reference that was already dropped. This is illustrated by the following diagram: CPU 1 CPU 2 relocation is running btrfs_clone_files() btrfs_clone() --> finds extent item in source range point to extent at bytenr X --> copies it into a local buffer --> releases path replace_file_extents() --> successfully locks the range represented by the file extent item --> replaces disk_bytenr field in the file extent item with some other value Y --> creates delayed reference to increment reference count for extent at bytenr Y --> creates delayed reference to drop the extent at bytenr X --> starts transaction --> creates delayed reference to increment extent at bytenr X <delayed references are run, due to a transaction commit for example, and the transaction is aborted with -EIO because we attempt to increment reference count for the extent at bytenr X after we freed it> When this race is hit the running transaction ends up getting aborted with an -EIO error and a trace like the following is produced: [ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1 [ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202 [ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001 [ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000 [ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001 [ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000 [ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548 [ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000 [ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0 [ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4382.564887] Call Trace: [ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs] [ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs] [ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs] [ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs] [ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs] [ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs] [ 4382.568112] ? _raw_spin_unlock+0x24/0x30 [ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs] [ 4382.569006] create_subvol+0x3c8/0x830 [btrfs] [ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60 [ 4382.570822] ? __sb_start_write+0xd4/0x1c0 [ 4382.571262] ? mnt_want_write_file+0x24/0x50 [ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs] [ 4382.572155] ? _copy_from_user+0x66/0x90 [ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs] [ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs] [ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570 [ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0 [ 4382.574379] ? _raw_spin_unlock+0x24/0x30 [ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0 [ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0 [ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [ 4382.576020] do_vfs_ioctl+0xa2/0x6f0 [ 4382.576405] ksys_ioctl+0x70/0x80 [ 4382.576776] __x64_sys_ioctl+0x16/0x20 [ 4382.577137] do_syscall_64+0x60/0x1b0 [ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe (...) [ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7 [ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003 [ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044 [ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010 [ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050 [ 4382.580792] irq event stamp: 0 [ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null) [ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null) [ 4382.582413] ---[ end trace d3c188e3e9367382 ]--- [ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure [ 4382.624295] BTRFS info (device sdc): forced readonly Fix this by locking the source range before searching for the file extent items in the fs tree, since the relocation process will try to lock the range a file extent item represents before updating it with the new extent location. Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 19:43:07 +08:00
btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1);
Btrfs: fix race between reflink/dedupe and relocation The recent rework that makes btrfs' remap_file_range operation use the generic helper generic_remap_file_range_prep() introduced a race between relocation and reflinking (for both cloning and deduplication) the file extents between the source and destination inodes. This happens because we no longer lock the source range anymore, and we do not lock it anymore because we wait for direct IO writes and writeback to complete early on the code path right after locking the inodes, which guarantees no other file operations interfere with the reflinking. However there is one exception which is relocation, since it replaces the byte number of file extents items in the fs tree after locking the range the file extent items represent. This is a problem because after finding each file extent to clone in the fs tree, the reflink process copies the file extent item into a local buffer, releases the search path, inserts new file extent items in the destination range and then increments the reference count for the extent mentioned in the file extent item that it previously copied to the buffer. If right after copying the file extent item into the buffer and releasing the path the relocation process updates the file extent item to point to the new extent, the reflink process ends up creating a delayed reference to increment the reference count of the old extent, for which the relocation process already created a delayed reference to drop it. This results in failure to run delayed references because we will attempt to increment the count of a reference that was already dropped. This is illustrated by the following diagram: CPU 1 CPU 2 relocation is running btrfs_clone_files() btrfs_clone() --> finds extent item in source range point to extent at bytenr X --> copies it into a local buffer --> releases path replace_file_extents() --> successfully locks the range represented by the file extent item --> replaces disk_bytenr field in the file extent item with some other value Y --> creates delayed reference to increment reference count for extent at bytenr Y --> creates delayed reference to drop the extent at bytenr X --> starts transaction --> creates delayed reference to increment extent at bytenr X <delayed references are run, due to a transaction commit for example, and the transaction is aborted with -EIO because we attempt to increment reference count for the extent at bytenr X after we freed it> When this race is hit the running transaction ends up getting aborted with an -EIO error and a trace like the following is produced: [ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1 [ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202 [ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001 [ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000 [ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001 [ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000 [ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548 [ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000 [ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0 [ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4382.564887] Call Trace: [ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs] [ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs] [ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs] [ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs] [ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs] [ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs] [ 4382.568112] ? _raw_spin_unlock+0x24/0x30 [ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs] [ 4382.569006] create_subvol+0x3c8/0x830 [btrfs] [ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60 [ 4382.570822] ? __sb_start_write+0xd4/0x1c0 [ 4382.571262] ? mnt_want_write_file+0x24/0x50 [ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs] [ 4382.572155] ? _copy_from_user+0x66/0x90 [ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs] [ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs] [ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570 [ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0 [ 4382.574379] ? _raw_spin_unlock+0x24/0x30 [ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0 [ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0 [ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [ 4382.576020] do_vfs_ioctl+0xa2/0x6f0 [ 4382.576405] ksys_ioctl+0x70/0x80 [ 4382.576776] __x64_sys_ioctl+0x16/0x20 [ 4382.577137] do_syscall_64+0x60/0x1b0 [ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe (...) [ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7 [ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003 [ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044 [ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010 [ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050 [ 4382.580792] irq event stamp: 0 [ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null) [ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null) [ 4382.582413] ---[ end trace d3c188e3e9367382 ]--- [ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure [ 4382.624295] BTRFS info (device sdc): forced readonly Fix this by locking the source range before searching for the file extent items in the fs tree, since the relocation process will try to lock the range a file extent item represents before updating it with the new extent location. Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 19:43:07 +08:00
btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
return ret;
}
#define BTRFS_MAX_DEDUPE_LEN SZ_16M
static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
struct inode *dst, u64 dst_loff)
{
int ret;
u64 i, tail_len, chunk_count;
Btrfs: fix race between send and deduplication that lead to failures and crashes Send operates on read only trees and expects them to never change while it is using them. This is part of its initial design, and this expection is due to two different reasons: 1) When it was introduced, no operations were allowed to modifiy read-only subvolumes/snapshots (including defrag for example). 2) It keeps send from having an impact on other filesystem operations. Namely send does not need to keep locks on the trees nor needs to hold on to transaction handles and delay transaction commits. This ends up being a consequence of the former reason. However the deduplication feature was introduced later (on September 2013, while send was introduced in July 2012) and it allowed for deduplication with destination files that belong to read-only trees (subvolumes and snapshots). That means that having a send operation (either full or incremental) running in parallel with a deduplication that has the destination inode in one of the trees used by the send operation, can result in tree nodes and leaves getting freed and reused while send is using them. This problem is similar to the problem solved for the root nodes getting freed and reused when a snapshot is made against one tree that is currenly being used by a send operation, fixed in commits [1] and [2]. These commits explain in detail how the problem happens and the explanation is valid for any node or leaf that is not the root of a tree as well. This problem was also discussed and explained recently in a thread [3]. The problem is very easy to reproduce when using send with large trees (snapshots) and just a few concurrent deduplication operations that target files in the trees used by send. A stress test case is being sent for fstests that triggers the issue easily. The most common error to hit is the send ioctl return -EIO with the following messages in dmesg/syslog: [1631617.204075] BTRFS error (device sdc): did not find backref in send_root. inode=63292, offset=0, disk_byte=5228134400 found extent=5228134400 [1631633.251754] BTRFS error (device sdc): parent transid verify failed on 32243712 wanted 24 found 27 The first one is very easy to hit while the second one happens much less frequently, except for very large trees (in that test case, snapshots with 100000 files having large xattrs to get deep and wide trees). Less frequently, at least one BUG_ON can be hit: [1631742.130080] ------------[ cut here ]------------ [1631742.130625] kernel BUG at fs/btrfs/ctree.c:1806! [1631742.131188] invalid opcode: 0000 [#6] SMP DEBUG_PAGEALLOC PTI [1631742.131726] CPU: 1 PID: 13394 Comm: btrfs Tainted: G B D W 5.0.0-rc8-btrfs-next-45 #1 [1631742.132265] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [1631742.133399] RIP: 0010:read_node_slot+0x122/0x130 [btrfs] (...) [1631742.135061] RSP: 0018:ffffb530021ebaa0 EFLAGS: 00010246 [1631742.135615] RAX: ffff93ac8912e000 RBX: 000000000000009d RCX: 0000000000000002 [1631742.136173] RDX: 000000000000009d RSI: ffff93ac564b0d08 RDI: ffff93ad5b48c000 [1631742.136759] RBP: ffffb530021ebb7d R08: 0000000000000001 R09: ffffb530021ebb7d [1631742.137324] R10: ffffb530021eba70 R11: 0000000000000000 R12: ffff93ac87d0a708 [1631742.137900] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001 [1631742.138455] FS: 00007f4cdb1528c0(0000) GS:ffff93ad76a80000(0000) knlGS:0000000000000000 [1631742.139010] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [1631742.139568] CR2: 00007f5acb3d0420 CR3: 000000012be3e006 CR4: 00000000003606e0 [1631742.140131] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [1631742.140719] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [1631742.141272] Call Trace: [1631742.141826] ? do_raw_spin_unlock+0x49/0xc0 [1631742.142390] tree_advance+0x173/0x1d0 [btrfs] [1631742.142948] btrfs_compare_trees+0x268/0x690 [btrfs] [1631742.143533] ? process_extent+0x1070/0x1070 [btrfs] [1631742.144088] btrfs_ioctl_send+0x1037/0x1270 [btrfs] [1631742.144645] _btrfs_ioctl_send+0x80/0x110 [btrfs] [1631742.145161] ? trace_sched_stick_numa+0xe0/0xe0 [1631742.145685] btrfs_ioctl+0x13fe/0x3120 [btrfs] [1631742.146179] ? account_entity_enqueue+0xd3/0x100 [1631742.146662] ? reweight_entity+0x154/0x1a0 [1631742.147135] ? update_curr+0x20/0x2a0 [1631742.147593] ? check_preempt_wakeup+0x103/0x250 [1631742.148053] ? do_vfs_ioctl+0xa2/0x6f0 [1631742.148510] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [1631742.148942] do_vfs_ioctl+0xa2/0x6f0 [1631742.149361] ? __fget+0x113/0x200 [1631742.149767] ksys_ioctl+0x70/0x80 [1631742.150159] __x64_sys_ioctl+0x16/0x20 [1631742.150543] do_syscall_64+0x60/0x1b0 [1631742.150931] entry_SYSCALL_64_after_hwframe+0x49/0xbe [1631742.151326] RIP: 0033:0x7f4cd9f5add7 (...) [1631742.152509] RSP: 002b:00007ffe91017708 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [1631742.152892] RAX: ffffffffffffffda RBX: 0000000000000105 RCX: 00007f4cd9f5add7 [1631742.153268] RDX: 00007ffe91017790 RSI: 0000000040489426 RDI: 0000000000000007 [1631742.153633] RBP: 0000000000000007 R08: 00007f4cd9e79700 R09: 00007f4cd9e79700 [1631742.153999] R10: 00007f4cd9e799d0 R11: 0000000000000202 R12: 0000000000000003 [1631742.154365] R13: 0000555dfae53020 R14: 0000000000000000 R15: 0000000000000001 (...) [1631742.156696] ---[ end trace 5dac9f96dcc3fd6b ]--- That BUG_ON happens because while send is using a node, that node is COWed by a concurrent deduplication, gets freed and gets reused as a leaf (because a transaction commit happened in between), so when it attempts to read a slot from the extent buffer, at ctree.c:read_node_slot(), the extent buffer contents were wiped out and it now matches a leaf (which can even belong to some other tree now), hitting the BUG_ON(level == 0). Fix this concurrency issue by not allowing send and deduplication to run in parallel if both operate on the same readonly trees, returning EAGAIN to user space and logging an exlicit warning in dmesg/syslog. [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be6821f82c3cc36e026f5afd10249988852b35ea [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6f2f0b394b54e2b159ef969a0b5274e9bbf82ff2 [3] https://lore.kernel.org/linux-btrfs/CAL3q7H7iqSEEyFaEtpRZw3cp613y+4k2Q8b4W7mweR3tZA05bQ@mail.gmail.com/ CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-04-22 23:43:42 +08:00
struct btrfs_root *root_dst = BTRFS_I(dst)->root;
spin_lock(&root_dst->root_item_lock);
if (root_dst->send_in_progress) {
btrfs_warn_rl(root_dst->fs_info,
"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
root_dst->root_key.objectid,
root_dst->send_in_progress);
spin_unlock(&root_dst->root_item_lock);
return -EAGAIN;
}
root_dst->dedupe_in_progress++;
spin_unlock(&root_dst->root_item_lock);
tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
for (i = 0; i < chunk_count; i++) {
ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
dst, dst_loff);
if (ret)
Btrfs: fix race between send and deduplication that lead to failures and crashes Send operates on read only trees and expects them to never change while it is using them. This is part of its initial design, and this expection is due to two different reasons: 1) When it was introduced, no operations were allowed to modifiy read-only subvolumes/snapshots (including defrag for example). 2) It keeps send from having an impact on other filesystem operations. Namely send does not need to keep locks on the trees nor needs to hold on to transaction handles and delay transaction commits. This ends up being a consequence of the former reason. However the deduplication feature was introduced later (on September 2013, while send was introduced in July 2012) and it allowed for deduplication with destination files that belong to read-only trees (subvolumes and snapshots). That means that having a send operation (either full or incremental) running in parallel with a deduplication that has the destination inode in one of the trees used by the send operation, can result in tree nodes and leaves getting freed and reused while send is using them. This problem is similar to the problem solved for the root nodes getting freed and reused when a snapshot is made against one tree that is currenly being used by a send operation, fixed in commits [1] and [2]. These commits explain in detail how the problem happens and the explanation is valid for any node or leaf that is not the root of a tree as well. This problem was also discussed and explained recently in a thread [3]. The problem is very easy to reproduce when using send with large trees (snapshots) and just a few concurrent deduplication operations that target files in the trees used by send. A stress test case is being sent for fstests that triggers the issue easily. The most common error to hit is the send ioctl return -EIO with the following messages in dmesg/syslog: [1631617.204075] BTRFS error (device sdc): did not find backref in send_root. inode=63292, offset=0, disk_byte=5228134400 found extent=5228134400 [1631633.251754] BTRFS error (device sdc): parent transid verify failed on 32243712 wanted 24 found 27 The first one is very easy to hit while the second one happens much less frequently, except for very large trees (in that test case, snapshots with 100000 files having large xattrs to get deep and wide trees). Less frequently, at least one BUG_ON can be hit: [1631742.130080] ------------[ cut here ]------------ [1631742.130625] kernel BUG at fs/btrfs/ctree.c:1806! [1631742.131188] invalid opcode: 0000 [#6] SMP DEBUG_PAGEALLOC PTI [1631742.131726] CPU: 1 PID: 13394 Comm: btrfs Tainted: G B D W 5.0.0-rc8-btrfs-next-45 #1 [1631742.132265] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [1631742.133399] RIP: 0010:read_node_slot+0x122/0x130 [btrfs] (...) [1631742.135061] RSP: 0018:ffffb530021ebaa0 EFLAGS: 00010246 [1631742.135615] RAX: ffff93ac8912e000 RBX: 000000000000009d RCX: 0000000000000002 [1631742.136173] RDX: 000000000000009d RSI: ffff93ac564b0d08 RDI: ffff93ad5b48c000 [1631742.136759] RBP: ffffb530021ebb7d R08: 0000000000000001 R09: ffffb530021ebb7d [1631742.137324] R10: ffffb530021eba70 R11: 0000000000000000 R12: ffff93ac87d0a708 [1631742.137900] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001 [1631742.138455] FS: 00007f4cdb1528c0(0000) GS:ffff93ad76a80000(0000) knlGS:0000000000000000 [1631742.139010] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [1631742.139568] CR2: 00007f5acb3d0420 CR3: 000000012be3e006 CR4: 00000000003606e0 [1631742.140131] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [1631742.140719] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [1631742.141272] Call Trace: [1631742.141826] ? do_raw_spin_unlock+0x49/0xc0 [1631742.142390] tree_advance+0x173/0x1d0 [btrfs] [1631742.142948] btrfs_compare_trees+0x268/0x690 [btrfs] [1631742.143533] ? process_extent+0x1070/0x1070 [btrfs] [1631742.144088] btrfs_ioctl_send+0x1037/0x1270 [btrfs] [1631742.144645] _btrfs_ioctl_send+0x80/0x110 [btrfs] [1631742.145161] ? trace_sched_stick_numa+0xe0/0xe0 [1631742.145685] btrfs_ioctl+0x13fe/0x3120 [btrfs] [1631742.146179] ? account_entity_enqueue+0xd3/0x100 [1631742.146662] ? reweight_entity+0x154/0x1a0 [1631742.147135] ? update_curr+0x20/0x2a0 [1631742.147593] ? check_preempt_wakeup+0x103/0x250 [1631742.148053] ? do_vfs_ioctl+0xa2/0x6f0 [1631742.148510] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [1631742.148942] do_vfs_ioctl+0xa2/0x6f0 [1631742.149361] ? __fget+0x113/0x200 [1631742.149767] ksys_ioctl+0x70/0x80 [1631742.150159] __x64_sys_ioctl+0x16/0x20 [1631742.150543] do_syscall_64+0x60/0x1b0 [1631742.150931] entry_SYSCALL_64_after_hwframe+0x49/0xbe [1631742.151326] RIP: 0033:0x7f4cd9f5add7 (...) [1631742.152509] RSP: 002b:00007ffe91017708 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [1631742.152892] RAX: ffffffffffffffda RBX: 0000000000000105 RCX: 00007f4cd9f5add7 [1631742.153268] RDX: 00007ffe91017790 RSI: 0000000040489426 RDI: 0000000000000007 [1631742.153633] RBP: 0000000000000007 R08: 00007f4cd9e79700 R09: 00007f4cd9e79700 [1631742.153999] R10: 00007f4cd9e799d0 R11: 0000000000000202 R12: 0000000000000003 [1631742.154365] R13: 0000555dfae53020 R14: 0000000000000000 R15: 0000000000000001 (...) [1631742.156696] ---[ end trace 5dac9f96dcc3fd6b ]--- That BUG_ON happens because while send is using a node, that node is COWed by a concurrent deduplication, gets freed and gets reused as a leaf (because a transaction commit happened in between), so when it attempts to read a slot from the extent buffer, at ctree.c:read_node_slot(), the extent buffer contents were wiped out and it now matches a leaf (which can even belong to some other tree now), hitting the BUG_ON(level == 0). Fix this concurrency issue by not allowing send and deduplication to run in parallel if both operate on the same readonly trees, returning EAGAIN to user space and logging an exlicit warning in dmesg/syslog. [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be6821f82c3cc36e026f5afd10249988852b35ea [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6f2f0b394b54e2b159ef969a0b5274e9bbf82ff2 [3] https://lore.kernel.org/linux-btrfs/CAL3q7H7iqSEEyFaEtpRZw3cp613y+4k2Q8b4W7mweR3tZA05bQ@mail.gmail.com/ CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-04-22 23:43:42 +08:00
goto out;
loff += BTRFS_MAX_DEDUPE_LEN;
dst_loff += BTRFS_MAX_DEDUPE_LEN;
}
if (tail_len > 0)
ret = btrfs_extent_same_range(src, loff, tail_len, dst,
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
dst_loff);
Btrfs: fix race between send and deduplication that lead to failures and crashes Send operates on read only trees and expects them to never change while it is using them. This is part of its initial design, and this expection is due to two different reasons: 1) When it was introduced, no operations were allowed to modifiy read-only subvolumes/snapshots (including defrag for example). 2) It keeps send from having an impact on other filesystem operations. Namely send does not need to keep locks on the trees nor needs to hold on to transaction handles and delay transaction commits. This ends up being a consequence of the former reason. However the deduplication feature was introduced later (on September 2013, while send was introduced in July 2012) and it allowed for deduplication with destination files that belong to read-only trees (subvolumes and snapshots). That means that having a send operation (either full or incremental) running in parallel with a deduplication that has the destination inode in one of the trees used by the send operation, can result in tree nodes and leaves getting freed and reused while send is using them. This problem is similar to the problem solved for the root nodes getting freed and reused when a snapshot is made against one tree that is currenly being used by a send operation, fixed in commits [1] and [2]. These commits explain in detail how the problem happens and the explanation is valid for any node or leaf that is not the root of a tree as well. This problem was also discussed and explained recently in a thread [3]. The problem is very easy to reproduce when using send with large trees (snapshots) and just a few concurrent deduplication operations that target files in the trees used by send. A stress test case is being sent for fstests that triggers the issue easily. The most common error to hit is the send ioctl return -EIO with the following messages in dmesg/syslog: [1631617.204075] BTRFS error (device sdc): did not find backref in send_root. inode=63292, offset=0, disk_byte=5228134400 found extent=5228134400 [1631633.251754] BTRFS error (device sdc): parent transid verify failed on 32243712 wanted 24 found 27 The first one is very easy to hit while the second one happens much less frequently, except for very large trees (in that test case, snapshots with 100000 files having large xattrs to get deep and wide trees). Less frequently, at least one BUG_ON can be hit: [1631742.130080] ------------[ cut here ]------------ [1631742.130625] kernel BUG at fs/btrfs/ctree.c:1806! [1631742.131188] invalid opcode: 0000 [#6] SMP DEBUG_PAGEALLOC PTI [1631742.131726] CPU: 1 PID: 13394 Comm: btrfs Tainted: G B D W 5.0.0-rc8-btrfs-next-45 #1 [1631742.132265] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [1631742.133399] RIP: 0010:read_node_slot+0x122/0x130 [btrfs] (...) [1631742.135061] RSP: 0018:ffffb530021ebaa0 EFLAGS: 00010246 [1631742.135615] RAX: ffff93ac8912e000 RBX: 000000000000009d RCX: 0000000000000002 [1631742.136173] RDX: 000000000000009d RSI: ffff93ac564b0d08 RDI: ffff93ad5b48c000 [1631742.136759] RBP: ffffb530021ebb7d R08: 0000000000000001 R09: ffffb530021ebb7d [1631742.137324] R10: ffffb530021eba70 R11: 0000000000000000 R12: ffff93ac87d0a708 [1631742.137900] R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000001 [1631742.138455] FS: 00007f4cdb1528c0(0000) GS:ffff93ad76a80000(0000) knlGS:0000000000000000 [1631742.139010] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [1631742.139568] CR2: 00007f5acb3d0420 CR3: 000000012be3e006 CR4: 00000000003606e0 [1631742.140131] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [1631742.140719] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [1631742.141272] Call Trace: [1631742.141826] ? do_raw_spin_unlock+0x49/0xc0 [1631742.142390] tree_advance+0x173/0x1d0 [btrfs] [1631742.142948] btrfs_compare_trees+0x268/0x690 [btrfs] [1631742.143533] ? process_extent+0x1070/0x1070 [btrfs] [1631742.144088] btrfs_ioctl_send+0x1037/0x1270 [btrfs] [1631742.144645] _btrfs_ioctl_send+0x80/0x110 [btrfs] [1631742.145161] ? trace_sched_stick_numa+0xe0/0xe0 [1631742.145685] btrfs_ioctl+0x13fe/0x3120 [btrfs] [1631742.146179] ? account_entity_enqueue+0xd3/0x100 [1631742.146662] ? reweight_entity+0x154/0x1a0 [1631742.147135] ? update_curr+0x20/0x2a0 [1631742.147593] ? check_preempt_wakeup+0x103/0x250 [1631742.148053] ? do_vfs_ioctl+0xa2/0x6f0 [1631742.148510] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [1631742.148942] do_vfs_ioctl+0xa2/0x6f0 [1631742.149361] ? __fget+0x113/0x200 [1631742.149767] ksys_ioctl+0x70/0x80 [1631742.150159] __x64_sys_ioctl+0x16/0x20 [1631742.150543] do_syscall_64+0x60/0x1b0 [1631742.150931] entry_SYSCALL_64_after_hwframe+0x49/0xbe [1631742.151326] RIP: 0033:0x7f4cd9f5add7 (...) [1631742.152509] RSP: 002b:00007ffe91017708 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [1631742.152892] RAX: ffffffffffffffda RBX: 0000000000000105 RCX: 00007f4cd9f5add7 [1631742.153268] RDX: 00007ffe91017790 RSI: 0000000040489426 RDI: 0000000000000007 [1631742.153633] RBP: 0000000000000007 R08: 00007f4cd9e79700 R09: 00007f4cd9e79700 [1631742.153999] R10: 00007f4cd9e799d0 R11: 0000000000000202 R12: 0000000000000003 [1631742.154365] R13: 0000555dfae53020 R14: 0000000000000000 R15: 0000000000000001 (...) [1631742.156696] ---[ end trace 5dac9f96dcc3fd6b ]--- That BUG_ON happens because while send is using a node, that node is COWed by a concurrent deduplication, gets freed and gets reused as a leaf (because a transaction commit happened in between), so when it attempts to read a slot from the extent buffer, at ctree.c:read_node_slot(), the extent buffer contents were wiped out and it now matches a leaf (which can even belong to some other tree now), hitting the BUG_ON(level == 0). Fix this concurrency issue by not allowing send and deduplication to run in parallel if both operate on the same readonly trees, returning EAGAIN to user space and logging an exlicit warning in dmesg/syslog. [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=be6821f82c3cc36e026f5afd10249988852b35ea [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=6f2f0b394b54e2b159ef969a0b5274e9bbf82ff2 [3] https://lore.kernel.org/linux-btrfs/CAL3q7H7iqSEEyFaEtpRZw3cp613y+4k2Q8b4W7mweR3tZA05bQ@mail.gmail.com/ CC: stable@vger.kernel.org # 4.4+ Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-04-22 23:43:42 +08:00
out:
spin_lock(&root_dst->root_item_lock);
root_dst->dedupe_in_progress--;
spin_unlock(&root_dst->root_item_lock);
return ret;
}
static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
struct inode *inode,
u64 endoff,
const u64 destoff,
const u64 olen,
int no_time_update)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
int ret;
inode_inc_iversion(inode);
if (!no_time_update)
inode->i_mtime = inode->i_ctime = current_time(inode);
/*
* We round up to the block size at eof when determining which
* extents to clone above, but shouldn't round up the file size.
*/
if (endoff > destoff + olen)
endoff = destoff + olen;
if (endoff > inode->i_size)
btrfs_i_size_write(BTRFS_I(inode), endoff);
ret = btrfs_update_inode(trans, root, inode);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
}
ret = btrfs_end_transaction(trans);
out:
return ret;
}
Btrfs: fix file corruption and data loss after cloning inline extents Currently the clone ioctl allows to clone an inline extent from one file to another that already has other (non-inlined) extents. This is a problem because btrfs is not designed to deal with files having inline and regular extents, if a file has an inline extent then it must be the only extent in the file and must start at file offset 0. Having a file with an inline extent followed by regular extents results in EIO errors when doing reads or writes against the first 4K of the file. Also, the clone ioctl allows one to lose data if the source file consists of a single inline extent, with a size of N bytes, and the destination file consists of a single inline extent with a size of M bytes, where we have M > N. In this case the clone operation removes the inline extent from the destination file and then copies the inline extent from the source file into the destination file - we lose the M - N bytes from the destination file, a read operation will get the value 0x00 for any bytes in the the range [N, M] (the destination inode's i_size remained as M, that's why we can read past N bytes). So fix this by not allowing such destructive operations to happen and return errno EOPNOTSUPP to user space. Currently the fstest btrfs/035 tests the data loss case but it totally ignores this - i.e. expects the operation to succeed and does not check the we got data loss. The following test case for fstests exercises all these cases that result in file corruption and data loss: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter # real QA test starts here _need_to_be_root _supported_fs btrfs _supported_os Linux _require_scratch _require_cloner _require_btrfs_fs_feature "no_holes" _require_btrfs_mkfs_feature "no-holes" rm -f $seqres.full test_cloning_inline_extents() { local mkfs_opts=$1 local mount_opts=$2 _scratch_mkfs $mkfs_opts >>$seqres.full 2>&1 _scratch_mount $mount_opts # File bar, the source for all the following clone operations, consists # of a single inline extent (50 bytes). $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \ | _filter_xfs_io # Test cloning into a file with an extent (non-inlined) where the # destination offset overlaps that extent. It should not be possible to # clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo data after clone operation:" # All bytes should have the value 0xaa (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo $XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io # Test cloning the inline extent against a file which has a hole in its # first 4K followed by a non-inlined extent. It should not be possible # as well to clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo2 data after clone operation:" # All bytes should have the value 0x00 (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo2 $XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io # Test cloning the inline extent against a file which has a size of zero # but has a prealloc extent. It should not be possible as well to clone # the inline extent from file bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "First 50 bytes of foo3 after clone operation:" # Should not be able to read any bytes, file has 0 bytes i_size (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo3 $XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size not greater than the size of # bar's inline extent (40 < 50). # It should be possible to do the extent cloning from bar to this file. $XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4 # Doing IO against any range in the first 4K of the file should work. echo "File foo4 data after clone operation:" # Must match file bar's content. od -t x1 $SCRATCH_MNT/foo4 $XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size greater than the size of bar's # inline extent (60 > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5 # Reading the file should not fail. echo "File foo5 data after clone operation:" # Must have a size of 60 bytes, with all bytes having a value of 0x03 # (the clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo5 # Test cloning the inline extent against a file which has no extents but # has a size greater than bar's inline extent (16K > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6 # Reading the file should not fail. echo "File foo6 data after clone operation:" # Must have a size of 16K, with all bytes having a value of 0x00 (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo6 # Test cloning the inline extent against a file which has no extents but # has a size not greater than bar's inline extent (30 < 50). # It should be possible to clone the inline extent from file bar into # this file. $XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7 # Reading the file should not fail. echo "File foo7 data after clone operation:" # Must have a size of 50 bytes, with all bytes having a value of 0xbb. od -t x1 $SCRATCH_MNT/foo7 # Test cloning the inline extent against a file which has a size not # greater than the size of bar's inline extent (20 < 50) but has # a prealloc extent that goes beyond the file's size. It should not be # possible to clone the inline extent from bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" \ -c "pwrite -S 0x88 0 20" \ $SCRATCH_MNT/foo8 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8 echo "File foo8 data after clone operation:" # Must have a size of 20 bytes, with all bytes having a value of 0x88 # (the clone operation did not modify our file). od -t x1 $SCRATCH_MNT/foo8 _scratch_unmount } echo -e "\nTesting without compression and without the no-holes feature...\n" test_cloning_inline_extents echo -e "\nTesting with compression and without the no-holes feature...\n" test_cloning_inline_extents "" "-o compress" echo -e "\nTesting without compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "" echo -e "\nTesting with compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "-o compress" status=0 exit Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-13 22:15:00 +08:00
/*
* Make sure we do not end up inserting an inline extent into a file that has
* already other (non-inline) extents. If a file has an inline extent it can
* not have any other extents and the (single) inline extent must start at the
* file offset 0. Failing to respect these rules will lead to file corruption,
* resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
*
* We can have extents that have been already written to disk or we can have
* dirty ranges still in delalloc, in which case the extent maps and items are
* created only when we run delalloc, and the delalloc ranges might fall outside
* the range we are currently locking in the inode's io tree. So we check the
* inode's i_size because of that (i_size updates are done while holding the
* i_mutex, which we are holding here).
* We also check to see if the inode has a size not greater than "datal" but has
* extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
* protected against such concurrent fallocate calls by the i_mutex).
*
* If the file has no extents but a size greater than datal, do not allow the
* copy because we would need turn the inline extent into a non-inline one (even
* with NO_HOLES enabled). If we find our destination inode only has one inline
* extent, just overwrite it with the source inline extent if its size is less
* than the source extent's size, or we could copy the source inline extent's
* data into the destination inode's inline extent if the later is greater then
* the former.
*/
static int clone_copy_inline_extent(struct inode *dst,
Btrfs: fix file corruption and data loss after cloning inline extents Currently the clone ioctl allows to clone an inline extent from one file to another that already has other (non-inlined) extents. This is a problem because btrfs is not designed to deal with files having inline and regular extents, if a file has an inline extent then it must be the only extent in the file and must start at file offset 0. Having a file with an inline extent followed by regular extents results in EIO errors when doing reads or writes against the first 4K of the file. Also, the clone ioctl allows one to lose data if the source file consists of a single inline extent, with a size of N bytes, and the destination file consists of a single inline extent with a size of M bytes, where we have M > N. In this case the clone operation removes the inline extent from the destination file and then copies the inline extent from the source file into the destination file - we lose the M - N bytes from the destination file, a read operation will get the value 0x00 for any bytes in the the range [N, M] (the destination inode's i_size remained as M, that's why we can read past N bytes). So fix this by not allowing such destructive operations to happen and return errno EOPNOTSUPP to user space. Currently the fstest btrfs/035 tests the data loss case but it totally ignores this - i.e. expects the operation to succeed and does not check the we got data loss. The following test case for fstests exercises all these cases that result in file corruption and data loss: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter # real QA test starts here _need_to_be_root _supported_fs btrfs _supported_os Linux _require_scratch _require_cloner _require_btrfs_fs_feature "no_holes" _require_btrfs_mkfs_feature "no-holes" rm -f $seqres.full test_cloning_inline_extents() { local mkfs_opts=$1 local mount_opts=$2 _scratch_mkfs $mkfs_opts >>$seqres.full 2>&1 _scratch_mount $mount_opts # File bar, the source for all the following clone operations, consists # of a single inline extent (50 bytes). $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \ | _filter_xfs_io # Test cloning into a file with an extent (non-inlined) where the # destination offset overlaps that extent. It should not be possible to # clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo data after clone operation:" # All bytes should have the value 0xaa (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo $XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io # Test cloning the inline extent against a file which has a hole in its # first 4K followed by a non-inlined extent. It should not be possible # as well to clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo2 data after clone operation:" # All bytes should have the value 0x00 (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo2 $XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io # Test cloning the inline extent against a file which has a size of zero # but has a prealloc extent. It should not be possible as well to clone # the inline extent from file bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "First 50 bytes of foo3 after clone operation:" # Should not be able to read any bytes, file has 0 bytes i_size (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo3 $XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size not greater than the size of # bar's inline extent (40 < 50). # It should be possible to do the extent cloning from bar to this file. $XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4 # Doing IO against any range in the first 4K of the file should work. echo "File foo4 data after clone operation:" # Must match file bar's content. od -t x1 $SCRATCH_MNT/foo4 $XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size greater than the size of bar's # inline extent (60 > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5 # Reading the file should not fail. echo "File foo5 data after clone operation:" # Must have a size of 60 bytes, with all bytes having a value of 0x03 # (the clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo5 # Test cloning the inline extent against a file which has no extents but # has a size greater than bar's inline extent (16K > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6 # Reading the file should not fail. echo "File foo6 data after clone operation:" # Must have a size of 16K, with all bytes having a value of 0x00 (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo6 # Test cloning the inline extent against a file which has no extents but # has a size not greater than bar's inline extent (30 < 50). # It should be possible to clone the inline extent from file bar into # this file. $XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7 # Reading the file should not fail. echo "File foo7 data after clone operation:" # Must have a size of 50 bytes, with all bytes having a value of 0xbb. od -t x1 $SCRATCH_MNT/foo7 # Test cloning the inline extent against a file which has a size not # greater than the size of bar's inline extent (20 < 50) but has # a prealloc extent that goes beyond the file's size. It should not be # possible to clone the inline extent from bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" \ -c "pwrite -S 0x88 0 20" \ $SCRATCH_MNT/foo8 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8 echo "File foo8 data after clone operation:" # Must have a size of 20 bytes, with all bytes having a value of 0x88 # (the clone operation did not modify our file). od -t x1 $SCRATCH_MNT/foo8 _scratch_unmount } echo -e "\nTesting without compression and without the no-holes feature...\n" test_cloning_inline_extents echo -e "\nTesting with compression and without the no-holes feature...\n" test_cloning_inline_extents "" "-o compress" echo -e "\nTesting without compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "" echo -e "\nTesting with compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "-o compress" status=0 exit Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-13 22:15:00 +08:00
struct btrfs_trans_handle *trans,
struct btrfs_path *path,
struct btrfs_key *new_key,
const u64 drop_start,
const u64 datal,
const u64 skip,
const u64 size,
char *inline_data)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
Btrfs: fix file corruption and data loss after cloning inline extents Currently the clone ioctl allows to clone an inline extent from one file to another that already has other (non-inlined) extents. This is a problem because btrfs is not designed to deal with files having inline and regular extents, if a file has an inline extent then it must be the only extent in the file and must start at file offset 0. Having a file with an inline extent followed by regular extents results in EIO errors when doing reads or writes against the first 4K of the file. Also, the clone ioctl allows one to lose data if the source file consists of a single inline extent, with a size of N bytes, and the destination file consists of a single inline extent with a size of M bytes, where we have M > N. In this case the clone operation removes the inline extent from the destination file and then copies the inline extent from the source file into the destination file - we lose the M - N bytes from the destination file, a read operation will get the value 0x00 for any bytes in the the range [N, M] (the destination inode's i_size remained as M, that's why we can read past N bytes). So fix this by not allowing such destructive operations to happen and return errno EOPNOTSUPP to user space. Currently the fstest btrfs/035 tests the data loss case but it totally ignores this - i.e. expects the operation to succeed and does not check the we got data loss. The following test case for fstests exercises all these cases that result in file corruption and data loss: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter # real QA test starts here _need_to_be_root _supported_fs btrfs _supported_os Linux _require_scratch _require_cloner _require_btrfs_fs_feature "no_holes" _require_btrfs_mkfs_feature "no-holes" rm -f $seqres.full test_cloning_inline_extents() { local mkfs_opts=$1 local mount_opts=$2 _scratch_mkfs $mkfs_opts >>$seqres.full 2>&1 _scratch_mount $mount_opts # File bar, the source for all the following clone operations, consists # of a single inline extent (50 bytes). $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \ | _filter_xfs_io # Test cloning into a file with an extent (non-inlined) where the # destination offset overlaps that extent. It should not be possible to # clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo data after clone operation:" # All bytes should have the value 0xaa (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo $XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io # Test cloning the inline extent against a file which has a hole in its # first 4K followed by a non-inlined extent. It should not be possible # as well to clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo2 data after clone operation:" # All bytes should have the value 0x00 (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo2 $XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io # Test cloning the inline extent against a file which has a size of zero # but has a prealloc extent. It should not be possible as well to clone # the inline extent from file bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "First 50 bytes of foo3 after clone operation:" # Should not be able to read any bytes, file has 0 bytes i_size (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo3 $XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size not greater than the size of # bar's inline extent (40 < 50). # It should be possible to do the extent cloning from bar to this file. $XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4 # Doing IO against any range in the first 4K of the file should work. echo "File foo4 data after clone operation:" # Must match file bar's content. od -t x1 $SCRATCH_MNT/foo4 $XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size greater than the size of bar's # inline extent (60 > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5 # Reading the file should not fail. echo "File foo5 data after clone operation:" # Must have a size of 60 bytes, with all bytes having a value of 0x03 # (the clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo5 # Test cloning the inline extent against a file which has no extents but # has a size greater than bar's inline extent (16K > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6 # Reading the file should not fail. echo "File foo6 data after clone operation:" # Must have a size of 16K, with all bytes having a value of 0x00 (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo6 # Test cloning the inline extent against a file which has no extents but # has a size not greater than bar's inline extent (30 < 50). # It should be possible to clone the inline extent from file bar into # this file. $XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7 # Reading the file should not fail. echo "File foo7 data after clone operation:" # Must have a size of 50 bytes, with all bytes having a value of 0xbb. od -t x1 $SCRATCH_MNT/foo7 # Test cloning the inline extent against a file which has a size not # greater than the size of bar's inline extent (20 < 50) but has # a prealloc extent that goes beyond the file's size. It should not be # possible to clone the inline extent from bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" \ -c "pwrite -S 0x88 0 20" \ $SCRATCH_MNT/foo8 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8 echo "File foo8 data after clone operation:" # Must have a size of 20 bytes, with all bytes having a value of 0x88 # (the clone operation did not modify our file). od -t x1 $SCRATCH_MNT/foo8 _scratch_unmount } echo -e "\nTesting without compression and without the no-holes feature...\n" test_cloning_inline_extents echo -e "\nTesting with compression and without the no-holes feature...\n" test_cloning_inline_extents "" "-o compress" echo -e "\nTesting without compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "" echo -e "\nTesting with compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "-o compress" status=0 exit Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-13 22:15:00 +08:00
struct btrfs_root *root = BTRFS_I(dst)->root;
const u64 aligned_end = ALIGN(new_key->offset + datal,
fs_info->sectorsize);
Btrfs: fix file corruption and data loss after cloning inline extents Currently the clone ioctl allows to clone an inline extent from one file to another that already has other (non-inlined) extents. This is a problem because btrfs is not designed to deal with files having inline and regular extents, if a file has an inline extent then it must be the only extent in the file and must start at file offset 0. Having a file with an inline extent followed by regular extents results in EIO errors when doing reads or writes against the first 4K of the file. Also, the clone ioctl allows one to lose data if the source file consists of a single inline extent, with a size of N bytes, and the destination file consists of a single inline extent with a size of M bytes, where we have M > N. In this case the clone operation removes the inline extent from the destination file and then copies the inline extent from the source file into the destination file - we lose the M - N bytes from the destination file, a read operation will get the value 0x00 for any bytes in the the range [N, M] (the destination inode's i_size remained as M, that's why we can read past N bytes). So fix this by not allowing such destructive operations to happen and return errno EOPNOTSUPP to user space. Currently the fstest btrfs/035 tests the data loss case but it totally ignores this - i.e. expects the operation to succeed and does not check the we got data loss. The following test case for fstests exercises all these cases that result in file corruption and data loss: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter # real QA test starts here _need_to_be_root _supported_fs btrfs _supported_os Linux _require_scratch _require_cloner _require_btrfs_fs_feature "no_holes" _require_btrfs_mkfs_feature "no-holes" rm -f $seqres.full test_cloning_inline_extents() { local mkfs_opts=$1 local mount_opts=$2 _scratch_mkfs $mkfs_opts >>$seqres.full 2>&1 _scratch_mount $mount_opts # File bar, the source for all the following clone operations, consists # of a single inline extent (50 bytes). $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \ | _filter_xfs_io # Test cloning into a file with an extent (non-inlined) where the # destination offset overlaps that extent. It should not be possible to # clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo data after clone operation:" # All bytes should have the value 0xaa (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo $XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io # Test cloning the inline extent against a file which has a hole in its # first 4K followed by a non-inlined extent. It should not be possible # as well to clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo2 data after clone operation:" # All bytes should have the value 0x00 (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo2 $XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io # Test cloning the inline extent against a file which has a size of zero # but has a prealloc extent. It should not be possible as well to clone # the inline extent from file bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "First 50 bytes of foo3 after clone operation:" # Should not be able to read any bytes, file has 0 bytes i_size (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo3 $XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size not greater than the size of # bar's inline extent (40 < 50). # It should be possible to do the extent cloning from bar to this file. $XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4 # Doing IO against any range in the first 4K of the file should work. echo "File foo4 data after clone operation:" # Must match file bar's content. od -t x1 $SCRATCH_MNT/foo4 $XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size greater than the size of bar's # inline extent (60 > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5 # Reading the file should not fail. echo "File foo5 data after clone operation:" # Must have a size of 60 bytes, with all bytes having a value of 0x03 # (the clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo5 # Test cloning the inline extent against a file which has no extents but # has a size greater than bar's inline extent (16K > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6 # Reading the file should not fail. echo "File foo6 data after clone operation:" # Must have a size of 16K, with all bytes having a value of 0x00 (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo6 # Test cloning the inline extent against a file which has no extents but # has a size not greater than bar's inline extent (30 < 50). # It should be possible to clone the inline extent from file bar into # this file. $XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7 # Reading the file should not fail. echo "File foo7 data after clone operation:" # Must have a size of 50 bytes, with all bytes having a value of 0xbb. od -t x1 $SCRATCH_MNT/foo7 # Test cloning the inline extent against a file which has a size not # greater than the size of bar's inline extent (20 < 50) but has # a prealloc extent that goes beyond the file's size. It should not be # possible to clone the inline extent from bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" \ -c "pwrite -S 0x88 0 20" \ $SCRATCH_MNT/foo8 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8 echo "File foo8 data after clone operation:" # Must have a size of 20 bytes, with all bytes having a value of 0x88 # (the clone operation did not modify our file). od -t x1 $SCRATCH_MNT/foo8 _scratch_unmount } echo -e "\nTesting without compression and without the no-holes feature...\n" test_cloning_inline_extents echo -e "\nTesting with compression and without the no-holes feature...\n" test_cloning_inline_extents "" "-o compress" echo -e "\nTesting without compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "" echo -e "\nTesting with compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "-o compress" status=0 exit Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-13 22:15:00 +08:00
int ret;
struct btrfs_key key;
if (new_key->offset > 0)
return -EOPNOTSUPP;
key.objectid = btrfs_ino(BTRFS_I(dst));
Btrfs: fix file corruption and data loss after cloning inline extents Currently the clone ioctl allows to clone an inline extent from one file to another that already has other (non-inlined) extents. This is a problem because btrfs is not designed to deal with files having inline and regular extents, if a file has an inline extent then it must be the only extent in the file and must start at file offset 0. Having a file with an inline extent followed by regular extents results in EIO errors when doing reads or writes against the first 4K of the file. Also, the clone ioctl allows one to lose data if the source file consists of a single inline extent, with a size of N bytes, and the destination file consists of a single inline extent with a size of M bytes, where we have M > N. In this case the clone operation removes the inline extent from the destination file and then copies the inline extent from the source file into the destination file - we lose the M - N bytes from the destination file, a read operation will get the value 0x00 for any bytes in the the range [N, M] (the destination inode's i_size remained as M, that's why we can read past N bytes). So fix this by not allowing such destructive operations to happen and return errno EOPNOTSUPP to user space. Currently the fstest btrfs/035 tests the data loss case but it totally ignores this - i.e. expects the operation to succeed and does not check the we got data loss. The following test case for fstests exercises all these cases that result in file corruption and data loss: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter # real QA test starts here _need_to_be_root _supported_fs btrfs _supported_os Linux _require_scratch _require_cloner _require_btrfs_fs_feature "no_holes" _require_btrfs_mkfs_feature "no-holes" rm -f $seqres.full test_cloning_inline_extents() { local mkfs_opts=$1 local mount_opts=$2 _scratch_mkfs $mkfs_opts >>$seqres.full 2>&1 _scratch_mount $mount_opts # File bar, the source for all the following clone operations, consists # of a single inline extent (50 bytes). $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \ | _filter_xfs_io # Test cloning into a file with an extent (non-inlined) where the # destination offset overlaps that extent. It should not be possible to # clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo data after clone operation:" # All bytes should have the value 0xaa (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo $XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io # Test cloning the inline extent against a file which has a hole in its # first 4K followed by a non-inlined extent. It should not be possible # as well to clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo2 data after clone operation:" # All bytes should have the value 0x00 (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo2 $XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io # Test cloning the inline extent against a file which has a size of zero # but has a prealloc extent. It should not be possible as well to clone # the inline extent from file bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "First 50 bytes of foo3 after clone operation:" # Should not be able to read any bytes, file has 0 bytes i_size (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo3 $XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size not greater than the size of # bar's inline extent (40 < 50). # It should be possible to do the extent cloning from bar to this file. $XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4 # Doing IO against any range in the first 4K of the file should work. echo "File foo4 data after clone operation:" # Must match file bar's content. od -t x1 $SCRATCH_MNT/foo4 $XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size greater than the size of bar's # inline extent (60 > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5 # Reading the file should not fail. echo "File foo5 data after clone operation:" # Must have a size of 60 bytes, with all bytes having a value of 0x03 # (the clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo5 # Test cloning the inline extent against a file which has no extents but # has a size greater than bar's inline extent (16K > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6 # Reading the file should not fail. echo "File foo6 data after clone operation:" # Must have a size of 16K, with all bytes having a value of 0x00 (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo6 # Test cloning the inline extent against a file which has no extents but # has a size not greater than bar's inline extent (30 < 50). # It should be possible to clone the inline extent from file bar into # this file. $XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7 # Reading the file should not fail. echo "File foo7 data after clone operation:" # Must have a size of 50 bytes, with all bytes having a value of 0xbb. od -t x1 $SCRATCH_MNT/foo7 # Test cloning the inline extent against a file which has a size not # greater than the size of bar's inline extent (20 < 50) but has # a prealloc extent that goes beyond the file's size. It should not be # possible to clone the inline extent from bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" \ -c "pwrite -S 0x88 0 20" \ $SCRATCH_MNT/foo8 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8 echo "File foo8 data after clone operation:" # Must have a size of 20 bytes, with all bytes having a value of 0x88 # (the clone operation did not modify our file). od -t x1 $SCRATCH_MNT/foo8 _scratch_unmount } echo -e "\nTesting without compression and without the no-holes feature...\n" test_cloning_inline_extents echo -e "\nTesting with compression and without the no-holes feature...\n" test_cloning_inline_extents "" "-o compress" echo -e "\nTesting without compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "" echo -e "\nTesting with compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "-o compress" status=0 exit Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-13 22:15:00 +08:00
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
return ret;
} else if (ret > 0) {
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
return ret;
else if (ret > 0)
goto copy_inline_extent;
}
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
Btrfs: fix file corruption and data loss after cloning inline extents Currently the clone ioctl allows to clone an inline extent from one file to another that already has other (non-inlined) extents. This is a problem because btrfs is not designed to deal with files having inline and regular extents, if a file has an inline extent then it must be the only extent in the file and must start at file offset 0. Having a file with an inline extent followed by regular extents results in EIO errors when doing reads or writes against the first 4K of the file. Also, the clone ioctl allows one to lose data if the source file consists of a single inline extent, with a size of N bytes, and the destination file consists of a single inline extent with a size of M bytes, where we have M > N. In this case the clone operation removes the inline extent from the destination file and then copies the inline extent from the source file into the destination file - we lose the M - N bytes from the destination file, a read operation will get the value 0x00 for any bytes in the the range [N, M] (the destination inode's i_size remained as M, that's why we can read past N bytes). So fix this by not allowing such destructive operations to happen and return errno EOPNOTSUPP to user space. Currently the fstest btrfs/035 tests the data loss case but it totally ignores this - i.e. expects the operation to succeed and does not check the we got data loss. The following test case for fstests exercises all these cases that result in file corruption and data loss: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter # real QA test starts here _need_to_be_root _supported_fs btrfs _supported_os Linux _require_scratch _require_cloner _require_btrfs_fs_feature "no_holes" _require_btrfs_mkfs_feature "no-holes" rm -f $seqres.full test_cloning_inline_extents() { local mkfs_opts=$1 local mount_opts=$2 _scratch_mkfs $mkfs_opts >>$seqres.full 2>&1 _scratch_mount $mount_opts # File bar, the source for all the following clone operations, consists # of a single inline extent (50 bytes). $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \ | _filter_xfs_io # Test cloning into a file with an extent (non-inlined) where the # destination offset overlaps that extent. It should not be possible to # clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo data after clone operation:" # All bytes should have the value 0xaa (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo $XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io # Test cloning the inline extent against a file which has a hole in its # first 4K followed by a non-inlined extent. It should not be possible # as well to clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo2 data after clone operation:" # All bytes should have the value 0x00 (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo2 $XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io # Test cloning the inline extent against a file which has a size of zero # but has a prealloc extent. It should not be possible as well to clone # the inline extent from file bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "First 50 bytes of foo3 after clone operation:" # Should not be able to read any bytes, file has 0 bytes i_size (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo3 $XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size not greater than the size of # bar's inline extent (40 < 50). # It should be possible to do the extent cloning from bar to this file. $XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4 # Doing IO against any range in the first 4K of the file should work. echo "File foo4 data after clone operation:" # Must match file bar's content. od -t x1 $SCRATCH_MNT/foo4 $XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size greater than the size of bar's # inline extent (60 > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5 # Reading the file should not fail. echo "File foo5 data after clone operation:" # Must have a size of 60 bytes, with all bytes having a value of 0x03 # (the clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo5 # Test cloning the inline extent against a file which has no extents but # has a size greater than bar's inline extent (16K > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6 # Reading the file should not fail. echo "File foo6 data after clone operation:" # Must have a size of 16K, with all bytes having a value of 0x00 (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo6 # Test cloning the inline extent against a file which has no extents but # has a size not greater than bar's inline extent (30 < 50). # It should be possible to clone the inline extent from file bar into # this file. $XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7 # Reading the file should not fail. echo "File foo7 data after clone operation:" # Must have a size of 50 bytes, with all bytes having a value of 0xbb. od -t x1 $SCRATCH_MNT/foo7 # Test cloning the inline extent against a file which has a size not # greater than the size of bar's inline extent (20 < 50) but has # a prealloc extent that goes beyond the file's size. It should not be # possible to clone the inline extent from bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" \ -c "pwrite -S 0x88 0 20" \ $SCRATCH_MNT/foo8 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8 echo "File foo8 data after clone operation:" # Must have a size of 20 bytes, with all bytes having a value of 0x88 # (the clone operation did not modify our file). od -t x1 $SCRATCH_MNT/foo8 _scratch_unmount } echo -e "\nTesting without compression and without the no-holes feature...\n" test_cloning_inline_extents echo -e "\nTesting with compression and without the no-holes feature...\n" test_cloning_inline_extents "" "-o compress" echo -e "\nTesting without compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "" echo -e "\nTesting with compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "-o compress" status=0 exit Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-13 22:15:00 +08:00
key.type == BTRFS_EXTENT_DATA_KEY) {
ASSERT(key.offset > 0);
return -EOPNOTSUPP;
}
} else if (i_size_read(dst) <= datal) {
struct btrfs_file_extent_item *ei;
u64 ext_len;
/*
* If the file size is <= datal, make sure there are no other
* extents following (can happen do to an fallocate call with
* the flag FALLOC_FL_KEEP_SIZE).
*/
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_file_extent_item);
/*
* If it's an inline extent, it can not have other extents
* following it.
*/
if (btrfs_file_extent_type(path->nodes[0], ei) ==
BTRFS_FILE_EXTENT_INLINE)
goto copy_inline_extent;
ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
if (ext_len > aligned_end)
return -EOPNOTSUPP;
ret = btrfs_next_item(root, path);
if (ret < 0) {
return ret;
} else if (ret == 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key,
path->slots[0]);
if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
Btrfs: fix file corruption and data loss after cloning inline extents Currently the clone ioctl allows to clone an inline extent from one file to another that already has other (non-inlined) extents. This is a problem because btrfs is not designed to deal with files having inline and regular extents, if a file has an inline extent then it must be the only extent in the file and must start at file offset 0. Having a file with an inline extent followed by regular extents results in EIO errors when doing reads or writes against the first 4K of the file. Also, the clone ioctl allows one to lose data if the source file consists of a single inline extent, with a size of N bytes, and the destination file consists of a single inline extent with a size of M bytes, where we have M > N. In this case the clone operation removes the inline extent from the destination file and then copies the inline extent from the source file into the destination file - we lose the M - N bytes from the destination file, a read operation will get the value 0x00 for any bytes in the the range [N, M] (the destination inode's i_size remained as M, that's why we can read past N bytes). So fix this by not allowing such destructive operations to happen and return errno EOPNOTSUPP to user space. Currently the fstest btrfs/035 tests the data loss case but it totally ignores this - i.e. expects the operation to succeed and does not check the we got data loss. The following test case for fstests exercises all these cases that result in file corruption and data loss: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter # real QA test starts here _need_to_be_root _supported_fs btrfs _supported_os Linux _require_scratch _require_cloner _require_btrfs_fs_feature "no_holes" _require_btrfs_mkfs_feature "no-holes" rm -f $seqres.full test_cloning_inline_extents() { local mkfs_opts=$1 local mount_opts=$2 _scratch_mkfs $mkfs_opts >>$seqres.full 2>&1 _scratch_mount $mount_opts # File bar, the source for all the following clone operations, consists # of a single inline extent (50 bytes). $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \ | _filter_xfs_io # Test cloning into a file with an extent (non-inlined) where the # destination offset overlaps that extent. It should not be possible to # clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo data after clone operation:" # All bytes should have the value 0xaa (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo $XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io # Test cloning the inline extent against a file which has a hole in its # first 4K followed by a non-inlined extent. It should not be possible # as well to clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo2 data after clone operation:" # All bytes should have the value 0x00 (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo2 $XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io # Test cloning the inline extent against a file which has a size of zero # but has a prealloc extent. It should not be possible as well to clone # the inline extent from file bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "First 50 bytes of foo3 after clone operation:" # Should not be able to read any bytes, file has 0 bytes i_size (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo3 $XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size not greater than the size of # bar's inline extent (40 < 50). # It should be possible to do the extent cloning from bar to this file. $XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4 # Doing IO against any range in the first 4K of the file should work. echo "File foo4 data after clone operation:" # Must match file bar's content. od -t x1 $SCRATCH_MNT/foo4 $XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size greater than the size of bar's # inline extent (60 > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5 # Reading the file should not fail. echo "File foo5 data after clone operation:" # Must have a size of 60 bytes, with all bytes having a value of 0x03 # (the clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo5 # Test cloning the inline extent against a file which has no extents but # has a size greater than bar's inline extent (16K > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6 # Reading the file should not fail. echo "File foo6 data after clone operation:" # Must have a size of 16K, with all bytes having a value of 0x00 (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo6 # Test cloning the inline extent against a file which has no extents but # has a size not greater than bar's inline extent (30 < 50). # It should be possible to clone the inline extent from file bar into # this file. $XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7 # Reading the file should not fail. echo "File foo7 data after clone operation:" # Must have a size of 50 bytes, with all bytes having a value of 0xbb. od -t x1 $SCRATCH_MNT/foo7 # Test cloning the inline extent against a file which has a size not # greater than the size of bar's inline extent (20 < 50) but has # a prealloc extent that goes beyond the file's size. It should not be # possible to clone the inline extent from bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" \ -c "pwrite -S 0x88 0 20" \ $SCRATCH_MNT/foo8 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8 echo "File foo8 data after clone operation:" # Must have a size of 20 bytes, with all bytes having a value of 0x88 # (the clone operation did not modify our file). od -t x1 $SCRATCH_MNT/foo8 _scratch_unmount } echo -e "\nTesting without compression and without the no-holes feature...\n" test_cloning_inline_extents echo -e "\nTesting with compression and without the no-holes feature...\n" test_cloning_inline_extents "" "-o compress" echo -e "\nTesting without compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "" echo -e "\nTesting with compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "-o compress" status=0 exit Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-13 22:15:00 +08:00
key.type == BTRFS_EXTENT_DATA_KEY)
return -EOPNOTSUPP;
}
}
copy_inline_extent:
/*
* We have no extent items, or we have an extent at offset 0 which may
* or may not be inlined. All these cases are dealt the same way.
*/
if (i_size_read(dst) > datal) {
/*
* If the destination inode has an inline extent...
* This would require copying the data from the source inline
* extent into the beginning of the destination's inline extent.
* But this is really complex, both extents can be compressed
* or just one of them, which would require decompressing and
* re-compressing data (which could increase the new compressed
* size, not allowing the compressed data to fit anymore in an
* inline extent).
* So just don't support this case for now (it should be rare,
* we are not really saving space when cloning inline extents).
*/
return -EOPNOTSUPP;
}
btrfs_release_path(path);
ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
if (ret)
return ret;
ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
if (ret)
return ret;
if (skip) {
const u32 start = btrfs_file_extent_calc_inline_size(0);
memmove(inline_data + start, inline_data + start + skip, datal);
}
write_extent_buffer(path->nodes[0], inline_data,
btrfs_item_ptr_offset(path->nodes[0],
path->slots[0]),
size);
inode_add_bytes(dst, datal);
Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents When cloning extents (or deduplicating) we create a transaction with a space reservation that considers we will drop or update a single file extent item of the destination inode (that we modify a single leaf). That is fine for the vast majority of scenarios, however it might happen that we need to drop many file extent items, and adjust at most two file extent items, in the destination root, which can span multiple leafs. This will lead to either the call to btrfs_drop_extents() to fail with ENOSPC or the subsequent calls to btrfs_insert_empty_item() or btrfs_update_inode() (called through clone_finish_inode_update()) to fail with ENOSPC. Such failure results in a transaction abort, leaving the filesystem in a read-only mode. In order to fix this we need to follow the same approach as the hole punching code, where we create a local reservation with 1 unit and keep ending and starting transactions, after balancing the btree inode, when __btrfs_drop_extents() returns ENOSPC. So fix this by making the extent cloning call calls the recently added btrfs_punch_hole_range() helper, which is what does the mentioned work for hole punching, and make sure whenever we drop extent items in a transaction, we also add a replacing file extent item, to avoid corruption (a hole) if after ending a transaction and before starting a new one, the old transaction gets committed and a power failure happens before we finish cloning. A test case for fstests follows soon. Reported-by: David Goodwin <david@codepoets.co.uk> Link: https://lore.kernel.org/linux-btrfs/a4a4cf31-9cf4-e52c-1f86-c62d336c9cd1@codepoets.co.uk/ Reported-by: Sam Tygier <sam@tygier.co.uk> Link: https://lore.kernel.org/linux-btrfs/82aace9f-a1e3-1f0b-055f-3ea75f7a41a0@tygier.co.uk/ Fixes: b6f3409b2197e8f ("Btrfs: reserve sufficient space for ioctl clone") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-05 18:09:50 +08:00
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
Btrfs: fix file corruption and data loss after cloning inline extents Currently the clone ioctl allows to clone an inline extent from one file to another that already has other (non-inlined) extents. This is a problem because btrfs is not designed to deal with files having inline and regular extents, if a file has an inline extent then it must be the only extent in the file and must start at file offset 0. Having a file with an inline extent followed by regular extents results in EIO errors when doing reads or writes against the first 4K of the file. Also, the clone ioctl allows one to lose data if the source file consists of a single inline extent, with a size of N bytes, and the destination file consists of a single inline extent with a size of M bytes, where we have M > N. In this case the clone operation removes the inline extent from the destination file and then copies the inline extent from the source file into the destination file - we lose the M - N bytes from the destination file, a read operation will get the value 0x00 for any bytes in the the range [N, M] (the destination inode's i_size remained as M, that's why we can read past N bytes). So fix this by not allowing such destructive operations to happen and return errno EOPNOTSUPP to user space. Currently the fstest btrfs/035 tests the data loss case but it totally ignores this - i.e. expects the operation to succeed and does not check the we got data loss. The following test case for fstests exercises all these cases that result in file corruption and data loss: seq=`basename $0` seqres=$RESULT_DIR/$seq echo "QA output created by $seq" tmp=/tmp/$$ status=1 # failure is the default! trap "_cleanup; exit \$status" 0 1 2 3 15 _cleanup() { rm -f $tmp.* } # get standard environment, filters and checks . ./common/rc . ./common/filter # real QA test starts here _need_to_be_root _supported_fs btrfs _supported_os Linux _require_scratch _require_cloner _require_btrfs_fs_feature "no_holes" _require_btrfs_mkfs_feature "no-holes" rm -f $seqres.full test_cloning_inline_extents() { local mkfs_opts=$1 local mount_opts=$2 _scratch_mkfs $mkfs_opts >>$seqres.full 2>&1 _scratch_mount $mount_opts # File bar, the source for all the following clone operations, consists # of a single inline extent (50 bytes). $XFS_IO_PROG -f -c "pwrite -S 0xbb 0 50" $SCRATCH_MNT/bar \ | _filter_xfs_io # Test cloning into a file with an extent (non-inlined) where the # destination offset overlaps that extent. It should not be possible to # clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 16K" $SCRATCH_MNT/foo \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo data after clone operation:" # All bytes should have the value 0xaa (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo $XFS_IO_PROG -c "pwrite -S 0xcc 0 100" $SCRATCH_MNT/foo | _filter_xfs_io # Test cloning the inline extent against a file which has a hole in its # first 4K followed by a non-inlined extent. It should not be possible # as well to clone the inline extent from file bar into this file. $XFS_IO_PROG -f -c "pwrite -S 0xdd 4K 12K" $SCRATCH_MNT/foo2 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo2 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "File foo2 data after clone operation:" # All bytes should have the value 0x00 (clone operation failed and did # not modify our file). od -t x1 $SCRATCH_MNT/foo2 $XFS_IO_PROG -c "pwrite -S 0xee 0 90" $SCRATCH_MNT/foo2 | _filter_xfs_io # Test cloning the inline extent against a file which has a size of zero # but has a prealloc extent. It should not be possible as well to clone # the inline extent from file bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" $SCRATCH_MNT/foo3 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo3 # Doing IO against any range in the first 4K of the file should work. # Due to a past clone ioctl bug which allowed cloning the inline extent, # these operations resulted in EIO errors. echo "First 50 bytes of foo3 after clone operation:" # Should not be able to read any bytes, file has 0 bytes i_size (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo3 $XFS_IO_PROG -c "pwrite -S 0xff 0 90" $SCRATCH_MNT/foo3 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size not greater than the size of # bar's inline extent (40 < 50). # It should be possible to do the extent cloning from bar to this file. $XFS_IO_PROG -f -c "pwrite -S 0x01 0 40" $SCRATCH_MNT/foo4 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo4 # Doing IO against any range in the first 4K of the file should work. echo "File foo4 data after clone operation:" # Must match file bar's content. od -t x1 $SCRATCH_MNT/foo4 $XFS_IO_PROG -c "pwrite -S 0x02 0 90" $SCRATCH_MNT/foo4 | _filter_xfs_io # Test cloning the inline extent against a file which consists of a # single inline extent that has a size greater than the size of bar's # inline extent (60 > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "pwrite -S 0x03 0 60" $SCRATCH_MNT/foo5 \ | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo5 # Reading the file should not fail. echo "File foo5 data after clone operation:" # Must have a size of 60 bytes, with all bytes having a value of 0x03 # (the clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo5 # Test cloning the inline extent against a file which has no extents but # has a size greater than bar's inline extent (16K > 50). # It should not be possible to clone the inline extent from file bar # into this file. $XFS_IO_PROG -f -c "truncate 16K" $SCRATCH_MNT/foo6 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo6 # Reading the file should not fail. echo "File foo6 data after clone operation:" # Must have a size of 16K, with all bytes having a value of 0x00 (the # clone operation failed and did not modify our file). od -t x1 $SCRATCH_MNT/foo6 # Test cloning the inline extent against a file which has no extents but # has a size not greater than bar's inline extent (30 < 50). # It should be possible to clone the inline extent from file bar into # this file. $XFS_IO_PROG -f -c "truncate 30" $SCRATCH_MNT/foo7 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo7 # Reading the file should not fail. echo "File foo7 data after clone operation:" # Must have a size of 50 bytes, with all bytes having a value of 0xbb. od -t x1 $SCRATCH_MNT/foo7 # Test cloning the inline extent against a file which has a size not # greater than the size of bar's inline extent (20 < 50) but has # a prealloc extent that goes beyond the file's size. It should not be # possible to clone the inline extent from bar into this file. $XFS_IO_PROG -f -c "falloc -k 0 1M" \ -c "pwrite -S 0x88 0 20" \ $SCRATCH_MNT/foo8 | _filter_xfs_io $CLONER_PROG -s 0 -d 0 -l 0 $SCRATCH_MNT/bar $SCRATCH_MNT/foo8 echo "File foo8 data after clone operation:" # Must have a size of 20 bytes, with all bytes having a value of 0x88 # (the clone operation did not modify our file). od -t x1 $SCRATCH_MNT/foo8 _scratch_unmount } echo -e "\nTesting without compression and without the no-holes feature...\n" test_cloning_inline_extents echo -e "\nTesting with compression and without the no-holes feature...\n" test_cloning_inline_extents "" "-o compress" echo -e "\nTesting without compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "" echo -e "\nTesting with compression and with the no-holes feature...\n" test_cloning_inline_extents "-O no-holes" "-o compress" status=0 exit Cc: stable@vger.kernel.org Signed-off-by: Filipe Manana <fdmanana@suse.com>
2015-10-13 22:15:00 +08:00
return 0;
}
/**
* btrfs_clone() - clone a range from inode file to another
*
* @src: Inode to clone from
* @inode: Inode to clone to
* @off: Offset within source to start clone from
* @olen: Original length, passed by user, of range to clone
* @olen_aligned: Block-aligned value of olen
* @destoff: Offset within @inode to start clone
* @no_time_update: Whether to update mtime/ctime on the target inode
*/
static int btrfs_clone(struct inode *src, struct inode *inode,
const u64 off, const u64 olen, const u64 olen_aligned,
const u64 destoff, int no_time_update)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path = NULL;
struct extent_buffer *leaf;
struct btrfs_trans_handle *trans;
char *buf = NULL;
struct btrfs_key key;
u32 nritems;
int slot;
int ret;
const u64 len = olen_aligned;
u64 last_dest_end = destoff;
ret = -ENOMEM;
treewide: use kv[mz]alloc* rather than opencoded variants There are many code paths opencoding kvmalloc. Let's use the helper instead. The main difference to kvmalloc is that those users are usually not considering all the aspects of the memory allocator. E.g. allocation requests <= 32kB (with 4kB pages) are basically never failing and invoke OOM killer to satisfy the allocation. This sounds too disruptive for something that has a reasonable fallback - the vmalloc. On the other hand those requests might fallback to vmalloc even when the memory allocator would succeed after several more reclaim/compaction attempts previously. There is no guarantee something like that happens though. This patch converts many of those places to kv[mz]alloc* helpers because they are more conservative. Link: http://lkml.kernel.org/r/20170306103327.2766-2-mhocko@kernel.org Signed-off-by: Michal Hocko <mhocko@suse.com> Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com> # Xen bits Acked-by: Kees Cook <keescook@chromium.org> Acked-by: Vlastimil Babka <vbabka@suse.cz> Acked-by: Andreas Dilger <andreas.dilger@intel.com> # Lustre Acked-by: Christian Borntraeger <borntraeger@de.ibm.com> # KVM/s390 Acked-by: Dan Williams <dan.j.williams@intel.com> # nvdim Acked-by: David Sterba <dsterba@suse.com> # btrfs Acked-by: Ilya Dryomov <idryomov@gmail.com> # Ceph Acked-by: Tariq Toukan <tariqt@mellanox.com> # mlx4 Acked-by: Leon Romanovsky <leonro@mellanox.com> # mlx5 Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Herbert Xu <herbert@gondor.apana.org.au> Cc: Anton Vorontsov <anton@enomsg.org> Cc: Colin Cross <ccross@android.com> Cc: Tony Luck <tony.luck@intel.com> Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Kent Overstreet <kent.overstreet@gmail.com> Cc: Santosh Raspatur <santosh@chelsio.com> Cc: Hariprasad S <hariprasad@chelsio.com> Cc: Yishai Hadas <yishaih@mellanox.com> Cc: Oleg Drokin <oleg.drokin@intel.com> Cc: "Yan, Zheng" <zyan@redhat.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Alexei Starovoitov <ast@kernel.org> Cc: Eric Dumazet <eric.dumazet@gmail.com> Cc: David Miller <davem@davemloft.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-05-09 06:57:27 +08:00
buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
if (!buf)
return ret;
path = btrfs_alloc_path();
if (!path) {
kvfree(buf);
return ret;
}
path->reada = READA_FORWARD;
/* clone data */
key.objectid = btrfs_ino(BTRFS_I(src));
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = off;
while (1) {
u64 next_key_min_offset = key.offset + 1;
struct btrfs_file_extent_item *extent;
int type;
u32 size;
struct btrfs_key new_key;
u64 disko = 0, diskl = 0;
u64 datao = 0, datal = 0;
u8 comp;
u64 drop_start;
Btrfs: fix range cloning when same inode used as source and destination While searching for extents to clone we might find one where we only use a part of it coming from its tail. If our destination inode is the same the source inode, we end up removing the tail part of the extent item and insert after a new one that point to the same extent with an adjusted key file offset and data offset. After this we search for the next extent item in the fs/subvol tree with a key that has an offset incremented by one. But this second search leaves us at the new extent item we inserted previously, and since that extent item has a non-zero data offset, it it can make us call btrfs_drop_extents with an empty range (start == end) which causes the following warning: [23978.537119] WARNING: CPU: 6 PID: 16251 at fs/btrfs/file.c:550 btrfs_drop_extent_cache+0x43/0x385 [btrfs]() (...) [23978.557266] Call Trace: [23978.557978] [<ffffffff81425fd9>] dump_stack+0x4c/0x65 [23978.559191] [<ffffffff81045390>] warn_slowpath_common+0xa1/0xbb [23978.560699] [<ffffffffa047f0ea>] ? btrfs_drop_extent_cache+0x43/0x385 [btrfs] [23978.562389] [<ffffffff8104544d>] warn_slowpath_null+0x1a/0x1c [23978.563613] [<ffffffffa047f0ea>] btrfs_drop_extent_cache+0x43/0x385 [btrfs] [23978.565103] [<ffffffff810e3a18>] ? time_hardirqs_off+0x15/0x28 [23978.566294] [<ffffffff81079ff8>] ? trace_hardirqs_off+0xd/0xf [23978.567438] [<ffffffffa047f73d>] __btrfs_drop_extents+0x6b/0x9e1 [btrfs] [23978.568702] [<ffffffff8107c03f>] ? trace_hardirqs_on+0xd/0xf [23978.569763] [<ffffffff811441c0>] ? ____cache_alloc+0x69/0x2eb [23978.570817] [<ffffffff81142269>] ? virt_to_head_page+0x9/0x36 [23978.571872] [<ffffffff81143c15>] ? cache_alloc_debugcheck_after.isra.42+0x16c/0x1cb [23978.573466] [<ffffffff811420d5>] ? kmemleak_alloc_recursive.constprop.52+0x16/0x18 [23978.574962] [<ffffffffa0480d07>] btrfs_drop_extents+0x66/0x7f [btrfs] [23978.576179] [<ffffffffa049aa35>] btrfs_clone+0x516/0xaf5 [btrfs] [23978.577311] [<ffffffffa04983dc>] ? lock_extent_range+0x7b/0xcd [btrfs] [23978.578520] [<ffffffffa049b2a2>] btrfs_ioctl_clone+0x28e/0x39f [btrfs] [23978.580282] [<ffffffffa049d9ae>] btrfs_ioctl+0xb51/0x219a [btrfs] (...) [23978.591887] ---[ end trace 988ec2a653d03ed3 ]--- Then we attempt to insert a new extent item with a key that already exists, which makes btrfs_insert_empty_item return -EEXIST resulting in abortion of the current transaction: [23978.594355] WARNING: CPU: 6 PID: 16251 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]() (...) [23978.622589] Call Trace: [23978.623181] [<ffffffff81425fd9>] dump_stack+0x4c/0x65 [23978.624359] [<ffffffff81045390>] warn_slowpath_common+0xa1/0xbb [23978.625573] [<ffffffffa044ab6c>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs] [23978.626971] [<ffffffff810453f0>] warn_slowpath_fmt+0x46/0x48 [23978.628003] [<ffffffff8108a6c8>] ? vprintk_default+0x1d/0x1f [23978.629138] [<ffffffffa044ab6c>] __btrfs_abort_transaction+0x52/0x114 [btrfs] [23978.630528] [<ffffffffa049ad1b>] btrfs_clone+0x7fc/0xaf5 [btrfs] [23978.631635] [<ffffffffa04983dc>] ? lock_extent_range+0x7b/0xcd [btrfs] [23978.632886] [<ffffffffa049b2a2>] btrfs_ioctl_clone+0x28e/0x39f [btrfs] [23978.634119] [<ffffffffa049d9ae>] btrfs_ioctl+0xb51/0x219a [btrfs] (...) [23978.647714] ---[ end trace 988ec2a653d03ed4 ]--- This is wrong because we should not process the extent item that we just inserted previously, and instead process the extent item that follows it in the tree For example for the test case I wrote for fstests: bs=$((64 * 1024)) mkfs.btrfs -f -l $bs -O ^no-holes /dev/sdc mount /dev/sdc /mnt xfs_io -f -c "pwrite -S 0xaa $(($bs * 2)) $(($bs * 2))" /mnt/foo $CLONER_PROG -s $((3 * $bs)) -d $((267 * $bs)) -l 0 /mnt/foo /mnt/foo $CLONER_PROG -s $((217 * $bs)) -d $((95 * $bs)) -l 0 /mnt/foo /mnt/foo The second clone call fails with -EEXIST, because when we process the first extent item (offset 262144), we drop part of it (counting from the end) and then insert a new extent item with a key greater then the key we found. The next time we search the tree we search for a key with offset 262144 + 1, which leaves us at the new extent item we have just inserted but we think it refers to an extent that we need to clone. Fix this by ensuring the next search key uses an offset corresponding to the offset of the key we found previously plus the data length of the corresponding extent item. This ensures we skip new extent items that we inserted and works for the case of implicit holes too (NO_HOLES feature). A test case for fstests follows soon. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-03-31 21:56:46 +08:00
/*
* note the key will change type as we walk through the
* tree.
*/
path->leave_spinning = 1;
ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
0, 0);
if (ret < 0)
goto out;
/*
* First search, if no extent item that starts at offset off was
* found but the previous item is an extent item, it's possible
* it might overlap our target range, therefore process it.
*/
if (key.offset == off && ret > 0 && path->slots[0] > 0) {
btrfs_item_key_to_cpu(path->nodes[0], &key,
path->slots[0] - 1);
if (key.type == BTRFS_EXTENT_DATA_KEY)
path->slots[0]--;
}
nritems = btrfs_header_nritems(path->nodes[0]);
process_slot:
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
if (ret < 0)
goto out;
if (ret > 0)
break;
nritems = btrfs_header_nritems(path->nodes[0]);
}
leaf = path->nodes[0];
slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.type > BTRFS_EXTENT_DATA_KEY ||
key.objectid != btrfs_ino(BTRFS_I(src)))
break;
ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
extent = btrfs_item_ptr(leaf, slot,
struct btrfs_file_extent_item);
comp = btrfs_file_extent_compression(leaf, extent);
type = btrfs_file_extent_type(leaf, extent);
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
disko = btrfs_file_extent_disk_bytenr(leaf, extent);
diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
datao = btrfs_file_extent_offset(leaf, extent);
datal = btrfs_file_extent_num_bytes(leaf, extent);
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
/* Take upper bound, may be compressed */
datal = btrfs_file_extent_ram_bytes(leaf, extent);
}
/*
* The first search might have left us at an extent item that
* ends before our target range's start, can happen if we have
* holes and NO_HOLES feature enabled.
*/
if (key.offset + datal <= off) {
path->slots[0]++;
goto process_slot;
} else if (key.offset >= off + len) {
break;
}
next_key_min_offset = key.offset + datal;
size = btrfs_item_size_nr(leaf, slot);
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
size);
btrfs_release_path(path);
path->leave_spinning = 0;
memcpy(&new_key, &key, sizeof(new_key));
new_key.objectid = btrfs_ino(BTRFS_I(inode));
if (off <= key.offset)
new_key.offset = key.offset + destoff - off;
else
new_key.offset = destoff;
/*
* Deal with a hole that doesn't have an extent item that
* represents it (NO_HOLES feature enabled).
* This hole is either in the middle of the cloning range or at
* the beginning (fully overlaps it or partially overlaps it).
*/
if (new_key.offset != last_dest_end)
drop_start = last_dest_end;
else
drop_start = new_key.offset;
if (type == BTRFS_FILE_EXTENT_REG ||
type == BTRFS_FILE_EXTENT_PREALLOC) {
struct btrfs_clone_extent_info clone_info;
/*
* a | --- range to clone ---| b
* | ------------- extent ------------- |
*/
/* Subtract range b */
if (key.offset + datal > off + len)
datal = off + len - key.offset;
/* Subtract range a */
if (off > key.offset) {
datao += off - key.offset;
datal -= off - key.offset;
}
clone_info.disk_offset = disko;
clone_info.disk_len = diskl;
clone_info.data_offset = datao;
clone_info.data_len = datal;
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
clone_info.item_size = size;
ret = btrfs_punch_hole_range(inode, path,
Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents When cloning extents (or deduplicating) we create a transaction with a space reservation that considers we will drop or update a single file extent item of the destination inode (that we modify a single leaf). That is fine for the vast majority of scenarios, however it might happen that we need to drop many file extent items, and adjust at most two file extent items, in the destination root, which can span multiple leafs. This will lead to either the call to btrfs_drop_extents() to fail with ENOSPC or the subsequent calls to btrfs_insert_empty_item() or btrfs_update_inode() (called through clone_finish_inode_update()) to fail with ENOSPC. Such failure results in a transaction abort, leaving the filesystem in a read-only mode. In order to fix this we need to follow the same approach as the hole punching code, where we create a local reservation with 1 unit and keep ending and starting transactions, after balancing the btree inode, when __btrfs_drop_extents() returns ENOSPC. So fix this by making the extent cloning call calls the recently added btrfs_punch_hole_range() helper, which is what does the mentioned work for hole punching, and make sure whenever we drop extent items in a transaction, we also add a replacing file extent item, to avoid corruption (a hole) if after ending a transaction and before starting a new one, the old transaction gets committed and a power failure happens before we finish cloning. A test case for fstests follows soon. Reported-by: David Goodwin <david@codepoets.co.uk> Link: https://lore.kernel.org/linux-btrfs/a4a4cf31-9cf4-e52c-1f86-c62d336c9cd1@codepoets.co.uk/ Reported-by: Sam Tygier <sam@tygier.co.uk> Link: https://lore.kernel.org/linux-btrfs/82aace9f-a1e3-1f0b-055f-3ea75f7a41a0@tygier.co.uk/ Fixes: b6f3409b2197e8f ("Btrfs: reserve sufficient space for ioctl clone") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-05 18:09:50 +08:00
drop_start,
new_key.offset + datal - 1,
&clone_info, &trans);
if (ret)
goto out;
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
u64 skip = 0;
u64 trim = 0;
if (off > key.offset) {
skip = off - key.offset;
new_key.offset += skip;
}
if (key.offset + datal > off + len)
trim = key.offset + datal - (off + len);
Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents When cloning extents (or deduplicating) we create a transaction with a space reservation that considers we will drop or update a single file extent item of the destination inode (that we modify a single leaf). That is fine for the vast majority of scenarios, however it might happen that we need to drop many file extent items, and adjust at most two file extent items, in the destination root, which can span multiple leafs. This will lead to either the call to btrfs_drop_extents() to fail with ENOSPC or the subsequent calls to btrfs_insert_empty_item() or btrfs_update_inode() (called through clone_finish_inode_update()) to fail with ENOSPC. Such failure results in a transaction abort, leaving the filesystem in a read-only mode. In order to fix this we need to follow the same approach as the hole punching code, where we create a local reservation with 1 unit and keep ending and starting transactions, after balancing the btree inode, when __btrfs_drop_extents() returns ENOSPC. So fix this by making the extent cloning call calls the recently added btrfs_punch_hole_range() helper, which is what does the mentioned work for hole punching, and make sure whenever we drop extent items in a transaction, we also add a replacing file extent item, to avoid corruption (a hole) if after ending a transaction and before starting a new one, the old transaction gets committed and a power failure happens before we finish cloning. A test case for fstests follows soon. Reported-by: David Goodwin <david@codepoets.co.uk> Link: https://lore.kernel.org/linux-btrfs/a4a4cf31-9cf4-e52c-1f86-c62d336c9cd1@codepoets.co.uk/ Reported-by: Sam Tygier <sam@tygier.co.uk> Link: https://lore.kernel.org/linux-btrfs/82aace9f-a1e3-1f0b-055f-3ea75f7a41a0@tygier.co.uk/ Fixes: b6f3409b2197e8f ("Btrfs: reserve sufficient space for ioctl clone") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-05 18:09:50 +08:00
if (comp && (skip || trim)) {
ret = -EINVAL;
goto out;
}
size -= skip + trim;
datal -= skip + trim;
/*
* If our extent is inline, we know we will drop or
* adjust at most 1 extent item in the destination root.
*
* 1 - adjusting old extent (we may have to split it)
* 1 - add new extent
* 1 - inode update
*/
trans = btrfs_start_transaction(root, 3);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}
ret = clone_copy_inline_extent(inode, trans, path,
&new_key, drop_start,
datal, skip, size, buf);
if (ret) {
if (ret != -EOPNOTSUPP)
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
}
}
btrfs_release_path(path);
last_dest_end = ALIGN(new_key.offset + datal,
fs_info->sectorsize);
ret = clone_finish_inode_update(trans, inode, last_dest_end,
destoff, olen, no_time_update);
if (ret)
goto out;
if (new_key.offset + datal >= destoff + len)
break;
btrfs_release_path(path);
Btrfs: fix range cloning when same inode used as source and destination While searching for extents to clone we might find one where we only use a part of it coming from its tail. If our destination inode is the same the source inode, we end up removing the tail part of the extent item and insert after a new one that point to the same extent with an adjusted key file offset and data offset. After this we search for the next extent item in the fs/subvol tree with a key that has an offset incremented by one. But this second search leaves us at the new extent item we inserted previously, and since that extent item has a non-zero data offset, it it can make us call btrfs_drop_extents with an empty range (start == end) which causes the following warning: [23978.537119] WARNING: CPU: 6 PID: 16251 at fs/btrfs/file.c:550 btrfs_drop_extent_cache+0x43/0x385 [btrfs]() (...) [23978.557266] Call Trace: [23978.557978] [<ffffffff81425fd9>] dump_stack+0x4c/0x65 [23978.559191] [<ffffffff81045390>] warn_slowpath_common+0xa1/0xbb [23978.560699] [<ffffffffa047f0ea>] ? btrfs_drop_extent_cache+0x43/0x385 [btrfs] [23978.562389] [<ffffffff8104544d>] warn_slowpath_null+0x1a/0x1c [23978.563613] [<ffffffffa047f0ea>] btrfs_drop_extent_cache+0x43/0x385 [btrfs] [23978.565103] [<ffffffff810e3a18>] ? time_hardirqs_off+0x15/0x28 [23978.566294] [<ffffffff81079ff8>] ? trace_hardirqs_off+0xd/0xf [23978.567438] [<ffffffffa047f73d>] __btrfs_drop_extents+0x6b/0x9e1 [btrfs] [23978.568702] [<ffffffff8107c03f>] ? trace_hardirqs_on+0xd/0xf [23978.569763] [<ffffffff811441c0>] ? ____cache_alloc+0x69/0x2eb [23978.570817] [<ffffffff81142269>] ? virt_to_head_page+0x9/0x36 [23978.571872] [<ffffffff81143c15>] ? cache_alloc_debugcheck_after.isra.42+0x16c/0x1cb [23978.573466] [<ffffffff811420d5>] ? kmemleak_alloc_recursive.constprop.52+0x16/0x18 [23978.574962] [<ffffffffa0480d07>] btrfs_drop_extents+0x66/0x7f [btrfs] [23978.576179] [<ffffffffa049aa35>] btrfs_clone+0x516/0xaf5 [btrfs] [23978.577311] [<ffffffffa04983dc>] ? lock_extent_range+0x7b/0xcd [btrfs] [23978.578520] [<ffffffffa049b2a2>] btrfs_ioctl_clone+0x28e/0x39f [btrfs] [23978.580282] [<ffffffffa049d9ae>] btrfs_ioctl+0xb51/0x219a [btrfs] (...) [23978.591887] ---[ end trace 988ec2a653d03ed3 ]--- Then we attempt to insert a new extent item with a key that already exists, which makes btrfs_insert_empty_item return -EEXIST resulting in abortion of the current transaction: [23978.594355] WARNING: CPU: 6 PID: 16251 at fs/btrfs/super.c:260 __btrfs_abort_transaction+0x52/0x114 [btrfs]() (...) [23978.622589] Call Trace: [23978.623181] [<ffffffff81425fd9>] dump_stack+0x4c/0x65 [23978.624359] [<ffffffff81045390>] warn_slowpath_common+0xa1/0xbb [23978.625573] [<ffffffffa044ab6c>] ? __btrfs_abort_transaction+0x52/0x114 [btrfs] [23978.626971] [<ffffffff810453f0>] warn_slowpath_fmt+0x46/0x48 [23978.628003] [<ffffffff8108a6c8>] ? vprintk_default+0x1d/0x1f [23978.629138] [<ffffffffa044ab6c>] __btrfs_abort_transaction+0x52/0x114 [btrfs] [23978.630528] [<ffffffffa049ad1b>] btrfs_clone+0x7fc/0xaf5 [btrfs] [23978.631635] [<ffffffffa04983dc>] ? lock_extent_range+0x7b/0xcd [btrfs] [23978.632886] [<ffffffffa049b2a2>] btrfs_ioctl_clone+0x28e/0x39f [btrfs] [23978.634119] [<ffffffffa049d9ae>] btrfs_ioctl+0xb51/0x219a [btrfs] (...) [23978.647714] ---[ end trace 988ec2a653d03ed4 ]--- This is wrong because we should not process the extent item that we just inserted previously, and instead process the extent item that follows it in the tree For example for the test case I wrote for fstests: bs=$((64 * 1024)) mkfs.btrfs -f -l $bs -O ^no-holes /dev/sdc mount /dev/sdc /mnt xfs_io -f -c "pwrite -S 0xaa $(($bs * 2)) $(($bs * 2))" /mnt/foo $CLONER_PROG -s $((3 * $bs)) -d $((267 * $bs)) -l 0 /mnt/foo /mnt/foo $CLONER_PROG -s $((217 * $bs)) -d $((95 * $bs)) -l 0 /mnt/foo /mnt/foo The second clone call fails with -EEXIST, because when we process the first extent item (offset 262144), we drop part of it (counting from the end) and then insert a new extent item with a key greater then the key we found. The next time we search the tree we search for a key with offset 262144 + 1, which leaves us at the new extent item we have just inserted but we think it refers to an extent that we need to clone. Fix this by ensuring the next search key uses an offset corresponding to the offset of the key we found previously plus the data length of the corresponding extent item. This ensures we skip new extent items that we inserted and works for the case of implicit holes too (NO_HOLES feature). A test case for fstests follows soon. Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Chris Mason <clm@fb.com>
2015-03-31 21:56:46 +08:00
key.offset = next_key_min_offset;
if (fatal_signal_pending(current)) {
ret = -EINTR;
goto out;
}
}
ret = 0;
if (last_dest_end < destoff + len) {
/*
Btrfs: fix hole extent items with a zero size after range cloning Normally when cloning a file range if we find an implicit hole at the end of the range we assume it is because the NO_HOLES feature is enabled. However that is not always the case. One well known case [1] is when we have a power failure after mixing buffered and direct IO writes against the same file. In such cases we need to punch a hole in the destination file, and if the NO_HOLES feature is not enabled, we need to insert explicit file extent items to represent the hole. After commit 690a5dbfc51315 ("Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents"), we started to insert file extent items representing the hole with an item size of 0, which is invalid and should be 53 bytes (the size of a btrfs_file_extent_item structure), resulting in all sorts of corruptions and invalid memory accesses. This is detected by the tree checker when we attempt to write a leaf to disk. The problem can be sporadically triggered by test case generic/561 from fstests. That test case does not exercise power failure and creates a new filesystem when it starts, so it does not use a filesystem created by any previous test that tests power failure. However the test does both buffered and direct IO writes (through fsstress) and it's precisely that which is creating the implicit holes in files. That happens even before the commit mentioned earlier. I need to investigate why we get those implicit holes to check if there is a real problem or not. For now this change fixes the regression of introducing file extent items with an item size of 0 bytes. Fix the issue by calling btrfs_punch_hole_range() without passing a btrfs_clone_extent_info structure, which ensures file extent items are inserted to represent the hole with a correct item size. We were passing a btrfs_clone_extent_info with a value of 0 for its 'item_size' field, which was causing the insertion of file extent items with an item size of 0. [1] https://www.spinics.net/lists/linux-btrfs/msg75350.html Reported-by: David Sterba <dsterba@suse.com> Fixes: 690a5dbfc51315 ("Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents") Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-12-06 00:58:41 +08:00
* We have an implicit hole that fully or partially overlaps our
* cloning range at its end. This means that we either have the
* NO_HOLES feature enabled or the implicit hole happened due to
* mixing buffered and direct IO writes against this file.
*/
btrfs_release_path(path);
Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents When cloning extents (or deduplicating) we create a transaction with a space reservation that considers we will drop or update a single file extent item of the destination inode (that we modify a single leaf). That is fine for the vast majority of scenarios, however it might happen that we need to drop many file extent items, and adjust at most two file extent items, in the destination root, which can span multiple leafs. This will lead to either the call to btrfs_drop_extents() to fail with ENOSPC or the subsequent calls to btrfs_insert_empty_item() or btrfs_update_inode() (called through clone_finish_inode_update()) to fail with ENOSPC. Such failure results in a transaction abort, leaving the filesystem in a read-only mode. In order to fix this we need to follow the same approach as the hole punching code, where we create a local reservation with 1 unit and keep ending and starting transactions, after balancing the btree inode, when __btrfs_drop_extents() returns ENOSPC. So fix this by making the extent cloning call calls the recently added btrfs_punch_hole_range() helper, which is what does the mentioned work for hole punching, and make sure whenever we drop extent items in a transaction, we also add a replacing file extent item, to avoid corruption (a hole) if after ending a transaction and before starting a new one, the old transaction gets committed and a power failure happens before we finish cloning. A test case for fstests follows soon. Reported-by: David Goodwin <david@codepoets.co.uk> Link: https://lore.kernel.org/linux-btrfs/a4a4cf31-9cf4-e52c-1f86-c62d336c9cd1@codepoets.co.uk/ Reported-by: Sam Tygier <sam@tygier.co.uk> Link: https://lore.kernel.org/linux-btrfs/82aace9f-a1e3-1f0b-055f-3ea75f7a41a0@tygier.co.uk/ Fixes: b6f3409b2197e8f ("Btrfs: reserve sufficient space for ioctl clone") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-05 18:09:50 +08:00
path->leave_spinning = 0;
Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents When cloning extents (or deduplicating) we create a transaction with a space reservation that considers we will drop or update a single file extent item of the destination inode (that we modify a single leaf). That is fine for the vast majority of scenarios, however it might happen that we need to drop many file extent items, and adjust at most two file extent items, in the destination root, which can span multiple leafs. This will lead to either the call to btrfs_drop_extents() to fail with ENOSPC or the subsequent calls to btrfs_insert_empty_item() or btrfs_update_inode() (called through clone_finish_inode_update()) to fail with ENOSPC. Such failure results in a transaction abort, leaving the filesystem in a read-only mode. In order to fix this we need to follow the same approach as the hole punching code, where we create a local reservation with 1 unit and keep ending and starting transactions, after balancing the btree inode, when __btrfs_drop_extents() returns ENOSPC. So fix this by making the extent cloning call calls the recently added btrfs_punch_hole_range() helper, which is what does the mentioned work for hole punching, and make sure whenever we drop extent items in a transaction, we also add a replacing file extent item, to avoid corruption (a hole) if after ending a transaction and before starting a new one, the old transaction gets committed and a power failure happens before we finish cloning. A test case for fstests follows soon. Reported-by: David Goodwin <david@codepoets.co.uk> Link: https://lore.kernel.org/linux-btrfs/a4a4cf31-9cf4-e52c-1f86-c62d336c9cd1@codepoets.co.uk/ Reported-by: Sam Tygier <sam@tygier.co.uk> Link: https://lore.kernel.org/linux-btrfs/82aace9f-a1e3-1f0b-055f-3ea75f7a41a0@tygier.co.uk/ Fixes: b6f3409b2197e8f ("Btrfs: reserve sufficient space for ioctl clone") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-05 18:09:50 +08:00
ret = btrfs_punch_hole_range(inode, path,
last_dest_end, destoff + len - 1,
Btrfs: fix hole extent items with a zero size after range cloning Normally when cloning a file range if we find an implicit hole at the end of the range we assume it is because the NO_HOLES feature is enabled. However that is not always the case. One well known case [1] is when we have a power failure after mixing buffered and direct IO writes against the same file. In such cases we need to punch a hole in the destination file, and if the NO_HOLES feature is not enabled, we need to insert explicit file extent items to represent the hole. After commit 690a5dbfc51315 ("Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents"), we started to insert file extent items representing the hole with an item size of 0, which is invalid and should be 53 bytes (the size of a btrfs_file_extent_item structure), resulting in all sorts of corruptions and invalid memory accesses. This is detected by the tree checker when we attempt to write a leaf to disk. The problem can be sporadically triggered by test case generic/561 from fstests. That test case does not exercise power failure and creates a new filesystem when it starts, so it does not use a filesystem created by any previous test that tests power failure. However the test does both buffered and direct IO writes (through fsstress) and it's precisely that which is creating the implicit holes in files. That happens even before the commit mentioned earlier. I need to investigate why we get those implicit holes to check if there is a real problem or not. For now this change fixes the regression of introducing file extent items with an item size of 0 bytes. Fix the issue by calling btrfs_punch_hole_range() without passing a btrfs_clone_extent_info structure, which ensures file extent items are inserted to represent the hole with a correct item size. We were passing a btrfs_clone_extent_info with a value of 0 for its 'item_size' field, which was causing the insertion of file extent items with an item size of 0. [1] https://www.spinics.net/lists/linux-btrfs/msg75350.html Reported-by: David Sterba <dsterba@suse.com> Fixes: 690a5dbfc51315 ("Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents") Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: Josef Bacik <josef@toxicpanda.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-12-06 00:58:41 +08:00
NULL, &trans);
Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents When cloning extents (or deduplicating) we create a transaction with a space reservation that considers we will drop or update a single file extent item of the destination inode (that we modify a single leaf). That is fine for the vast majority of scenarios, however it might happen that we need to drop many file extent items, and adjust at most two file extent items, in the destination root, which can span multiple leafs. This will lead to either the call to btrfs_drop_extents() to fail with ENOSPC or the subsequent calls to btrfs_insert_empty_item() or btrfs_update_inode() (called through clone_finish_inode_update()) to fail with ENOSPC. Such failure results in a transaction abort, leaving the filesystem in a read-only mode. In order to fix this we need to follow the same approach as the hole punching code, where we create a local reservation with 1 unit and keep ending and starting transactions, after balancing the btree inode, when __btrfs_drop_extents() returns ENOSPC. So fix this by making the extent cloning call calls the recently added btrfs_punch_hole_range() helper, which is what does the mentioned work for hole punching, and make sure whenever we drop extent items in a transaction, we also add a replacing file extent item, to avoid corruption (a hole) if after ending a transaction and before starting a new one, the old transaction gets committed and a power failure happens before we finish cloning. A test case for fstests follows soon. Reported-by: David Goodwin <david@codepoets.co.uk> Link: https://lore.kernel.org/linux-btrfs/a4a4cf31-9cf4-e52c-1f86-c62d336c9cd1@codepoets.co.uk/ Reported-by: Sam Tygier <sam@tygier.co.uk> Link: https://lore.kernel.org/linux-btrfs/82aace9f-a1e3-1f0b-055f-3ea75f7a41a0@tygier.co.uk/ Fixes: b6f3409b2197e8f ("Btrfs: reserve sufficient space for ioctl clone") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-05 18:09:50 +08:00
if (ret)
goto out;
Btrfs: fix ENOSPC errors, leading to transaction aborts, when cloning extents When cloning extents (or deduplicating) we create a transaction with a space reservation that considers we will drop or update a single file extent item of the destination inode (that we modify a single leaf). That is fine for the vast majority of scenarios, however it might happen that we need to drop many file extent items, and adjust at most two file extent items, in the destination root, which can span multiple leafs. This will lead to either the call to btrfs_drop_extents() to fail with ENOSPC or the subsequent calls to btrfs_insert_empty_item() or btrfs_update_inode() (called through clone_finish_inode_update()) to fail with ENOSPC. Such failure results in a transaction abort, leaving the filesystem in a read-only mode. In order to fix this we need to follow the same approach as the hole punching code, where we create a local reservation with 1 unit and keep ending and starting transactions, after balancing the btree inode, when __btrfs_drop_extents() returns ENOSPC. So fix this by making the extent cloning call calls the recently added btrfs_punch_hole_range() helper, which is what does the mentioned work for hole punching, and make sure whenever we drop extent items in a transaction, we also add a replacing file extent item, to avoid corruption (a hole) if after ending a transaction and before starting a new one, the old transaction gets committed and a power failure happens before we finish cloning. A test case for fstests follows soon. Reported-by: David Goodwin <david@codepoets.co.uk> Link: https://lore.kernel.org/linux-btrfs/a4a4cf31-9cf4-e52c-1f86-c62d336c9cd1@codepoets.co.uk/ Reported-by: Sam Tygier <sam@tygier.co.uk> Link: https://lore.kernel.org/linux-btrfs/82aace9f-a1e3-1f0b-055f-3ea75f7a41a0@tygier.co.uk/ Fixes: b6f3409b2197e8f ("Btrfs: reserve sufficient space for ioctl clone") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-07-05 18:09:50 +08:00
ret = clone_finish_inode_update(trans, inode, destoff + len,
destoff, olen, no_time_update);
}
out:
btrfs_free_path(path);
kvfree(buf);
return ret;
}
static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
u64 off, u64 olen, u64 destoff)
{
struct inode *inode = file_inode(file);
struct inode *src = file_inode(file_src);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
int ret;
u64 len = olen;
u64 bs = fs_info->sb->s_blocksize;
/*
* TODO:
* - split compressed inline extents. annoying: we need to
* decompress into destination's address_space (the file offset
* may change, so source mapping won't do), then recompress (or
* otherwise reinsert) a subrange.
*
* - split destination inode's inline extents. The inline extents can
* be either compressed or non-compressed.
*/
Btrfs: fix data corruption due to cloning of eof block We currently allow cloning a range from a file which includes the last block of the file even if the file's size is not aligned to the block size. This is fine and useful when the destination file has the same size, but when it does not and the range ends somewhere in the middle of the destination file, it leads to corruption because the bytes between the EOF and the end of the block have undefined data (when there is support for discard/trimming they have a value of 0x00). Example: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ export foo_size=$((256 * 1024 + 100)) $ xfs_io -f -c "pwrite -S 0x3c 0 $foo_size" /mnt/foo $ xfs_io -f -c "pwrite -S 0xb5 0 1M" /mnt/bar $ xfs_io -c "reflink /mnt/foo 0 512K $foo_size" /mnt/bar $ od -A d -t x1 /mnt/bar 0000000 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 * 0524288 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c * 0786528 3c 3c 3c 3c 00 00 00 00 00 00 00 00 00 00 00 00 0786544 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0790528 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 * 1048576 The bytes in the range from 786532 (512Kb + 256Kb + 100 bytes) to 790527 (512Kb + 256Kb + 4Kb - 1) got corrupted, having now a value of 0x00 instead of 0xb5. This is similar to the problem we had for deduplication that got recently fixed by commit de02b9f6bb65 ("Btrfs: fix data corruption when deduplicating between different files"). Fix this by not allowing such operations to be performed and return the errno -EINVAL to user space. This is what XFS is doing as well at the VFS level. This change however now makes us return -EINVAL instead of -EOPNOTSUPP for cases where the source range maps to an inline extent and the destination range's end is smaller then the destination file's size, since the detection of inline extents is done during the actual process of dropping file extent items (at __btrfs_drop_extents()). Returning the -EINVAL error is done early on and solely based on the input parameters (offsets and length) and destination file's size. This makes us consistent with XFS and anyone else supporting cloning since this case is now checked at a higher level in the VFS and is where the -EINVAL will be returned from starting with kernel 4.20 (the VFS changed was introduced in 4.20-rc1 by commit 07d19dc9fbe9 ("vfs: avoid problematic remapping requests into partial EOF block"). So this change is more geared towards stable kernels, as it's unlikely the new VFS checks get removed intentionally. A test case for fstests follows soon, as well as an update to filter existing tests that expect -EOPNOTSUPP to accept -EINVAL as well. CC: <stable@vger.kernel.org> # 4.4+ Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-11-05 19:14:17 +08:00
/*
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
* VFS's generic_remap_file_range_prep() protects us from cloning the
* eof block into the middle of a file, which would result in corruption
* if the file size is not blocksize aligned. So we don't need to check
* for that case here.
Btrfs: fix data corruption due to cloning of eof block We currently allow cloning a range from a file which includes the last block of the file even if the file's size is not aligned to the block size. This is fine and useful when the destination file has the same size, but when it does not and the range ends somewhere in the middle of the destination file, it leads to corruption because the bytes between the EOF and the end of the block have undefined data (when there is support for discard/trimming they have a value of 0x00). Example: $ mkfs.btrfs -f /dev/sdb $ mount /dev/sdb /mnt $ export foo_size=$((256 * 1024 + 100)) $ xfs_io -f -c "pwrite -S 0x3c 0 $foo_size" /mnt/foo $ xfs_io -f -c "pwrite -S 0xb5 0 1M" /mnt/bar $ xfs_io -c "reflink /mnt/foo 0 512K $foo_size" /mnt/bar $ od -A d -t x1 /mnt/bar 0000000 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 * 0524288 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c 3c * 0786528 3c 3c 3c 3c 00 00 00 00 00 00 00 00 00 00 00 00 0786544 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0790528 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 b5 * 1048576 The bytes in the range from 786532 (512Kb + 256Kb + 100 bytes) to 790527 (512Kb + 256Kb + 4Kb - 1) got corrupted, having now a value of 0x00 instead of 0xb5. This is similar to the problem we had for deduplication that got recently fixed by commit de02b9f6bb65 ("Btrfs: fix data corruption when deduplicating between different files"). Fix this by not allowing such operations to be performed and return the errno -EINVAL to user space. This is what XFS is doing as well at the VFS level. This change however now makes us return -EINVAL instead of -EOPNOTSUPP for cases where the source range maps to an inline extent and the destination range's end is smaller then the destination file's size, since the detection of inline extents is done during the actual process of dropping file extent items (at __btrfs_drop_extents()). Returning the -EINVAL error is done early on and solely based on the input parameters (offsets and length) and destination file's size. This makes us consistent with XFS and anyone else supporting cloning since this case is now checked at a higher level in the VFS and is where the -EINVAL will be returned from starting with kernel 4.20 (the VFS changed was introduced in 4.20-rc1 by commit 07d19dc9fbe9 ("vfs: avoid problematic remapping requests into partial EOF block"). So this change is more geared towards stable kernels, as it's unlikely the new VFS checks get removed intentionally. A test case for fstests follows soon, as well as an update to filter existing tests that expect -EOPNOTSUPP to accept -EINVAL as well. CC: <stable@vger.kernel.org> # 4.4+ Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-11-05 19:14:17 +08:00
*/
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
if (off + len == src->i_size)
len = ALIGN(src->i_size, bs) - off;
if (destoff > inode->i_size) {
Btrfs: fix race between cloning range ending at eof and writeback The recent rework that makes btrfs' remap_file_range operation use the generic helper generic_remap_file_range_prep() introduced a race between writeback and cloning a range that covers the eof extent of the source file into a destination offset that is greater then the same file's size. This happens because we now wait for writeback to complete before doing the truncation of the eof block, while previously we did the truncation and then waited for writeback to complete. This leads to a race between writeback of the truncated block and cloning the file extents in the source range, because we copy each file extent item we find in the fs root into a buffer, then release the path and then increment the reference count for the extent referred in that file extent item we copied, which can no longer exist if writeback of the truncated eof block completes after we copied the file extent item into the buffer and before we incremented the reference count. This is illustrated by the following diagram: CPU 1 CPU 2 btrfs_clone_files() btrfs_cont_expand() btrfs_truncate_block() --> zeroes part of the page containg eof, marking it for delalloc btrfs_clone() --> finds extent item covering eof, points to extent at bytenr X --> copies it into a local buffer --> releases path writeback starts btrfs_finish_ordered_io() insert_reserved_file_extent() __btrfs_drop_extents() --> creates delayed reference to drop the extent at bytenr X --> starts transaction --> creates delayed reference to increment extent at bytenr X <delayed references are run, due to a transaction commit for example, and the transaction is aborted with -EIO because we attempt to increment reference count for the extent at bytenr X after we freed it> When this race is hit the running transaction ends up getting aborted with an -EIO error and a trace like the following is produced: [ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1 [ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202 [ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001 [ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000 [ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001 [ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000 [ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548 [ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000 [ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0 [ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4382.564887] Call Trace: [ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs] [ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs] [ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs] [ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs] [ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs] [ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs] [ 4382.568112] ? _raw_spin_unlock+0x24/0x30 [ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs] [ 4382.569006] create_subvol+0x3c8/0x830 [btrfs] [ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60 [ 4382.570822] ? __sb_start_write+0xd4/0x1c0 [ 4382.571262] ? mnt_want_write_file+0x24/0x50 [ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs] [ 4382.572155] ? _copy_from_user+0x66/0x90 [ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs] [ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs] [ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570 [ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0 [ 4382.574379] ? _raw_spin_unlock+0x24/0x30 [ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0 [ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0 [ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [ 4382.576020] do_vfs_ioctl+0xa2/0x6f0 [ 4382.576405] ksys_ioctl+0x70/0x80 [ 4382.576776] __x64_sys_ioctl+0x16/0x20 [ 4382.577137] do_syscall_64+0x60/0x1b0 [ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe (...) [ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7 [ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003 [ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044 [ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010 [ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050 [ 4382.580792] irq event stamp: 0 [ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null) [ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null) [ 4382.582413] ---[ end trace d3c188e3e9367382 ]--- [ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure [ 4382.624295] BTRFS info (device sdc): forced readonly Fix this by waiting for writeback to complete after truncating the eof block. Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 19:42:54 +08:00
const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
ret = btrfs_cont_expand(inode, inode->i_size, destoff);
if (ret)
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
return ret;
Btrfs: fix race between cloning range ending at eof and writeback The recent rework that makes btrfs' remap_file_range operation use the generic helper generic_remap_file_range_prep() introduced a race between writeback and cloning a range that covers the eof extent of the source file into a destination offset that is greater then the same file's size. This happens because we now wait for writeback to complete before doing the truncation of the eof block, while previously we did the truncation and then waited for writeback to complete. This leads to a race between writeback of the truncated block and cloning the file extents in the source range, because we copy each file extent item we find in the fs root into a buffer, then release the path and then increment the reference count for the extent referred in that file extent item we copied, which can no longer exist if writeback of the truncated eof block completes after we copied the file extent item into the buffer and before we incremented the reference count. This is illustrated by the following diagram: CPU 1 CPU 2 btrfs_clone_files() btrfs_cont_expand() btrfs_truncate_block() --> zeroes part of the page containg eof, marking it for delalloc btrfs_clone() --> finds extent item covering eof, points to extent at bytenr X --> copies it into a local buffer --> releases path writeback starts btrfs_finish_ordered_io() insert_reserved_file_extent() __btrfs_drop_extents() --> creates delayed reference to drop the extent at bytenr X --> starts transaction --> creates delayed reference to increment extent at bytenr X <delayed references are run, due to a transaction commit for example, and the transaction is aborted with -EIO because we attempt to increment reference count for the extent at bytenr X after we freed it> When this race is hit the running transaction ends up getting aborted with an -EIO error and a trace like the following is produced: [ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1 [ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202 [ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001 [ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000 [ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001 [ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000 [ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548 [ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000 [ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0 [ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4382.564887] Call Trace: [ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs] [ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs] [ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs] [ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs] [ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs] [ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs] [ 4382.568112] ? _raw_spin_unlock+0x24/0x30 [ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs] [ 4382.569006] create_subvol+0x3c8/0x830 [btrfs] [ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60 [ 4382.570822] ? __sb_start_write+0xd4/0x1c0 [ 4382.571262] ? mnt_want_write_file+0x24/0x50 [ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs] [ 4382.572155] ? _copy_from_user+0x66/0x90 [ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs] [ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs] [ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570 [ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0 [ 4382.574379] ? _raw_spin_unlock+0x24/0x30 [ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0 [ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0 [ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [ 4382.576020] do_vfs_ioctl+0xa2/0x6f0 [ 4382.576405] ksys_ioctl+0x70/0x80 [ 4382.576776] __x64_sys_ioctl+0x16/0x20 [ 4382.577137] do_syscall_64+0x60/0x1b0 [ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe (...) [ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7 [ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003 [ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044 [ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010 [ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050 [ 4382.580792] irq event stamp: 0 [ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null) [ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null) [ 4382.582413] ---[ end trace d3c188e3e9367382 ]--- [ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure [ 4382.624295] BTRFS info (device sdc): forced readonly Fix this by waiting for writeback to complete after truncating the eof block. Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 19:42:54 +08:00
/*
* We may have truncated the last block if the inode's size is
* not sector size aligned, so we need to wait for writeback to
* complete before proceeding further, otherwise we can race
* with cloning and attempt to increment a reference to an
* extent that no longer exists (writeback completed right after
* we found the previous extent covering eof and before we
* attempted to increment its reference count).
*/
ret = btrfs_wait_ordered_range(inode, wb_start,
destoff - wb_start);
if (ret)
return ret;
}
Btrfs: ensure readers see new data after a clone operation We were cleaning the clone target file range from the page cache before we did replace the file extent items in the fs tree. This was racy, as right after cleaning the relevant range from the page cache and before replacing the file extent items, a read against that range could be performed by another task and populate again the page cache with stale data (stale after the cloning finishes). This would result in reads after the clone operation successfully finishes to get old data (and potentially for a very long time). Therefore evict the pages after replacing the file extent items, so that subsequent reads will always get the new data. Similarly, we were prone to races while cloning the file extent items because we weren't locking the target range and wait for any existing ordered extents against that range to complete. It was possible that after cloning the extent items, a write operation that was performed before the clone operation and overlaps the same range, would end up undoing all or part of the work the clone operation did (a worker task running inode.c:btrfs_finish_ordered_io). Therefore lock the target range in the io tree, wait for all pending ordered extents against that range to finish and then safely perform the cloning. The issue of reading stale data after the clone operation is easy to reproduce by running the following C program in a loop until it exits with return value 1. #include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <errno.h> #include <pthread.h> #include <fcntl.h> #include <assert.h> #include <asm/types.h> #include <linux/ioctl.h> #include <sys/stat.h> #include <sys/types.h> #include <sys/ioctl.h> #define SRC_FILE "/mnt/sdd/foo" #define DST_FILE "/mnt/sdd/bar" #define FILE_SIZE (16 * 1024) #define PATTERN_SRC 'X' #define PATTERN_DST 'Y' struct btrfs_ioctl_clone_range_args { __s64 src_fd; __u64 src_offset, src_length; __u64 dest_offset; }; #define BTRFS_IOCTL_MAGIC 0x94 #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ struct btrfs_ioctl_clone_range_args) static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; static int clone_done = 0; static int reader_ready = 0; static int stale_data = 0; static void *reader_loop(void *arg) { char buf[4096], want_buf[4096]; memset(want_buf, PATTERN_SRC, 4096); pthread_mutex_lock(&mutex); reader_ready = 1; pthread_mutex_unlock(&mutex); while (1) { int done, fd, ret; fd = open(DST_FILE, O_RDONLY); assert(fd != -1); pthread_mutex_lock(&mutex); done = clone_done; pthread_mutex_unlock(&mutex); ret = read(fd, buf, 4096); assert(ret == 4096); close(fd); if (done) { ret = memcmp(buf, want_buf, 4096); if (ret == 0) { printf("Found new content\n"); } else { printf("Found old content\n"); pthread_mutex_lock(&mutex); stale_data = 1; pthread_mutex_unlock(&mutex); } break; } } return NULL; } int main(int argc, char *argv[]) { pthread_t reader; int ret, i, fd; struct btrfs_ioctl_clone_range_args clone_args; int fd1, fd2; ret = remove(SRC_FILE); if (ret == -1 && errno != ENOENT) { fprintf(stderr, "Error deleting src file: %s\n", strerror(errno)); return 1; } ret = remove(DST_FILE); if (ret == -1 && errno != ENOENT) { fprintf(stderr, "Error deleting dst file: %s\n", strerror(errno)); return 1; } fd = open(SRC_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU); assert(fd != -1); for (i = 0; i < FILE_SIZE; i++) { char c = PATTERN_SRC; ret = write(fd, &c, 1); assert(ret == 1); } close(fd); fd = open(DST_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU); assert(fd != -1); for (i = 0; i < FILE_SIZE; i++) { char c = PATTERN_DST; ret = write(fd, &c, 1); assert(ret == 1); } close(fd); sync(); ret = pthread_create(&reader, NULL, reader_loop, NULL); assert(ret == 0); while (1) { int r; pthread_mutex_lock(&mutex); r = reader_ready; pthread_mutex_unlock(&mutex); if (r) break; } fd1 = open(SRC_FILE, O_RDONLY); if (fd1 < 0) { fprintf(stderr, "Error open src file: %s\n", strerror(errno)); return 1; } fd2 = open(DST_FILE, O_RDWR); if (fd2 < 0) { fprintf(stderr, "Error open dst file: %s\n", strerror(errno)); return 1; } clone_args.src_fd = fd1; clone_args.src_offset = 0; clone_args.src_length = 4096; clone_args.dest_offset = 0; ret = ioctl(fd2, BTRFS_IOC_CLONE_RANGE, &clone_args); assert(ret == 0); close(fd1); close(fd2); pthread_mutex_lock(&mutex); clone_done = 1; pthread_mutex_unlock(&mutex); ret = pthread_join(reader, NULL); assert(ret == 0); pthread_mutex_lock(&mutex); ret = stale_data ? 1 : 0; pthread_mutex_unlock(&mutex); return ret; } Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-05-23 12:03:34 +08:00
/*
Btrfs: fix race between reflink/dedupe and relocation The recent rework that makes btrfs' remap_file_range operation use the generic helper generic_remap_file_range_prep() introduced a race between relocation and reflinking (for both cloning and deduplication) the file extents between the source and destination inodes. This happens because we no longer lock the source range anymore, and we do not lock it anymore because we wait for direct IO writes and writeback to complete early on the code path right after locking the inodes, which guarantees no other file operations interfere with the reflinking. However there is one exception which is relocation, since it replaces the byte number of file extents items in the fs tree after locking the range the file extent items represent. This is a problem because after finding each file extent to clone in the fs tree, the reflink process copies the file extent item into a local buffer, releases the search path, inserts new file extent items in the destination range and then increments the reference count for the extent mentioned in the file extent item that it previously copied to the buffer. If right after copying the file extent item into the buffer and releasing the path the relocation process updates the file extent item to point to the new extent, the reflink process ends up creating a delayed reference to increment the reference count of the old extent, for which the relocation process already created a delayed reference to drop it. This results in failure to run delayed references because we will attempt to increment the count of a reference that was already dropped. This is illustrated by the following diagram: CPU 1 CPU 2 relocation is running btrfs_clone_files() btrfs_clone() --> finds extent item in source range point to extent at bytenr X --> copies it into a local buffer --> releases path replace_file_extents() --> successfully locks the range represented by the file extent item --> replaces disk_bytenr field in the file extent item with some other value Y --> creates delayed reference to increment reference count for extent at bytenr Y --> creates delayed reference to drop the extent at bytenr X --> starts transaction --> creates delayed reference to increment extent at bytenr X <delayed references are run, due to a transaction commit for example, and the transaction is aborted with -EIO because we attempt to increment reference count for the extent at bytenr X after we freed it> When this race is hit the running transaction ends up getting aborted with an -EIO error and a trace like the following is produced: [ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1 [ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202 [ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001 [ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000 [ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001 [ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000 [ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548 [ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000 [ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0 [ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4382.564887] Call Trace: [ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs] [ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs] [ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs] [ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs] [ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs] [ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs] [ 4382.568112] ? _raw_spin_unlock+0x24/0x30 [ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs] [ 4382.569006] create_subvol+0x3c8/0x830 [btrfs] [ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60 [ 4382.570822] ? __sb_start_write+0xd4/0x1c0 [ 4382.571262] ? mnt_want_write_file+0x24/0x50 [ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs] [ 4382.572155] ? _copy_from_user+0x66/0x90 [ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs] [ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs] [ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570 [ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0 [ 4382.574379] ? _raw_spin_unlock+0x24/0x30 [ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0 [ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0 [ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [ 4382.576020] do_vfs_ioctl+0xa2/0x6f0 [ 4382.576405] ksys_ioctl+0x70/0x80 [ 4382.576776] __x64_sys_ioctl+0x16/0x20 [ 4382.577137] do_syscall_64+0x60/0x1b0 [ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe (...) [ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7 [ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003 [ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044 [ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010 [ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050 [ 4382.580792] irq event stamp: 0 [ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null) [ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null) [ 4382.582413] ---[ end trace d3c188e3e9367382 ]--- [ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure [ 4382.624295] BTRFS info (device sdc): forced readonly Fix this by locking the source range before searching for the file extent items in the fs tree, since the relocation process will try to lock the range a file extent item represents before updating it with the new extent location. Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 19:43:07 +08:00
* Lock destination range to serialize with concurrent readpages() and
* source range to serialize with relocation.
Btrfs: ensure readers see new data after a clone operation We were cleaning the clone target file range from the page cache before we did replace the file extent items in the fs tree. This was racy, as right after cleaning the relevant range from the page cache and before replacing the file extent items, a read against that range could be performed by another task and populate again the page cache with stale data (stale after the cloning finishes). This would result in reads after the clone operation successfully finishes to get old data (and potentially for a very long time). Therefore evict the pages after replacing the file extent items, so that subsequent reads will always get the new data. Similarly, we were prone to races while cloning the file extent items because we weren't locking the target range and wait for any existing ordered extents against that range to complete. It was possible that after cloning the extent items, a write operation that was performed before the clone operation and overlaps the same range, would end up undoing all or part of the work the clone operation did (a worker task running inode.c:btrfs_finish_ordered_io). Therefore lock the target range in the io tree, wait for all pending ordered extents against that range to finish and then safely perform the cloning. The issue of reading stale data after the clone operation is easy to reproduce by running the following C program in a loop until it exits with return value 1. #include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <errno.h> #include <pthread.h> #include <fcntl.h> #include <assert.h> #include <asm/types.h> #include <linux/ioctl.h> #include <sys/stat.h> #include <sys/types.h> #include <sys/ioctl.h> #define SRC_FILE "/mnt/sdd/foo" #define DST_FILE "/mnt/sdd/bar" #define FILE_SIZE (16 * 1024) #define PATTERN_SRC 'X' #define PATTERN_DST 'Y' struct btrfs_ioctl_clone_range_args { __s64 src_fd; __u64 src_offset, src_length; __u64 dest_offset; }; #define BTRFS_IOCTL_MAGIC 0x94 #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ struct btrfs_ioctl_clone_range_args) static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; static int clone_done = 0; static int reader_ready = 0; static int stale_data = 0; static void *reader_loop(void *arg) { char buf[4096], want_buf[4096]; memset(want_buf, PATTERN_SRC, 4096); pthread_mutex_lock(&mutex); reader_ready = 1; pthread_mutex_unlock(&mutex); while (1) { int done, fd, ret; fd = open(DST_FILE, O_RDONLY); assert(fd != -1); pthread_mutex_lock(&mutex); done = clone_done; pthread_mutex_unlock(&mutex); ret = read(fd, buf, 4096); assert(ret == 4096); close(fd); if (done) { ret = memcmp(buf, want_buf, 4096); if (ret == 0) { printf("Found new content\n"); } else { printf("Found old content\n"); pthread_mutex_lock(&mutex); stale_data = 1; pthread_mutex_unlock(&mutex); } break; } } return NULL; } int main(int argc, char *argv[]) { pthread_t reader; int ret, i, fd; struct btrfs_ioctl_clone_range_args clone_args; int fd1, fd2; ret = remove(SRC_FILE); if (ret == -1 && errno != ENOENT) { fprintf(stderr, "Error deleting src file: %s\n", strerror(errno)); return 1; } ret = remove(DST_FILE); if (ret == -1 && errno != ENOENT) { fprintf(stderr, "Error deleting dst file: %s\n", strerror(errno)); return 1; } fd = open(SRC_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU); assert(fd != -1); for (i = 0; i < FILE_SIZE; i++) { char c = PATTERN_SRC; ret = write(fd, &c, 1); assert(ret == 1); } close(fd); fd = open(DST_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU); assert(fd != -1); for (i = 0; i < FILE_SIZE; i++) { char c = PATTERN_DST; ret = write(fd, &c, 1); assert(ret == 1); } close(fd); sync(); ret = pthread_create(&reader, NULL, reader_loop, NULL); assert(ret == 0); while (1) { int r; pthread_mutex_lock(&mutex); r = reader_ready; pthread_mutex_unlock(&mutex); if (r) break; } fd1 = open(SRC_FILE, O_RDONLY); if (fd1 < 0) { fprintf(stderr, "Error open src file: %s\n", strerror(errno)); return 1; } fd2 = open(DST_FILE, O_RDWR); if (fd2 < 0) { fprintf(stderr, "Error open dst file: %s\n", strerror(errno)); return 1; } clone_args.src_fd = fd1; clone_args.src_offset = 0; clone_args.src_length = 4096; clone_args.dest_offset = 0; ret = ioctl(fd2, BTRFS_IOC_CLONE_RANGE, &clone_args); assert(ret == 0); close(fd1); close(fd2); pthread_mutex_lock(&mutex); clone_done = 1; pthread_mutex_unlock(&mutex); ret = pthread_join(reader, NULL); assert(ret == 0); pthread_mutex_lock(&mutex); ret = stale_data ? 1 : 0; pthread_mutex_unlock(&mutex); return ret; } Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-05-23 12:03:34 +08:00
*/
Btrfs: fix race between reflink/dedupe and relocation The recent rework that makes btrfs' remap_file_range operation use the generic helper generic_remap_file_range_prep() introduced a race between relocation and reflinking (for both cloning and deduplication) the file extents between the source and destination inodes. This happens because we no longer lock the source range anymore, and we do not lock it anymore because we wait for direct IO writes and writeback to complete early on the code path right after locking the inodes, which guarantees no other file operations interfere with the reflinking. However there is one exception which is relocation, since it replaces the byte number of file extents items in the fs tree after locking the range the file extent items represent. This is a problem because after finding each file extent to clone in the fs tree, the reflink process copies the file extent item into a local buffer, releases the search path, inserts new file extent items in the destination range and then increments the reference count for the extent mentioned in the file extent item that it previously copied to the buffer. If right after copying the file extent item into the buffer and releasing the path the relocation process updates the file extent item to point to the new extent, the reflink process ends up creating a delayed reference to increment the reference count of the old extent, for which the relocation process already created a delayed reference to drop it. This results in failure to run delayed references because we will attempt to increment the count of a reference that was already dropped. This is illustrated by the following diagram: CPU 1 CPU 2 relocation is running btrfs_clone_files() btrfs_clone() --> finds extent item in source range point to extent at bytenr X --> copies it into a local buffer --> releases path replace_file_extents() --> successfully locks the range represented by the file extent item --> replaces disk_bytenr field in the file extent item with some other value Y --> creates delayed reference to increment reference count for extent at bytenr Y --> creates delayed reference to drop the extent at bytenr X --> starts transaction --> creates delayed reference to increment extent at bytenr X <delayed references are run, due to a transaction commit for example, and the transaction is aborted with -EIO because we attempt to increment reference count for the extent at bytenr X after we freed it> When this race is hit the running transaction ends up getting aborted with an -EIO error and a trace like the following is produced: [ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1 [ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202 [ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001 [ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000 [ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001 [ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000 [ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548 [ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000 [ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0 [ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4382.564887] Call Trace: [ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs] [ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs] [ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs] [ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs] [ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs] [ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs] [ 4382.568112] ? _raw_spin_unlock+0x24/0x30 [ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs] [ 4382.569006] create_subvol+0x3c8/0x830 [btrfs] [ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60 [ 4382.570822] ? __sb_start_write+0xd4/0x1c0 [ 4382.571262] ? mnt_want_write_file+0x24/0x50 [ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs] [ 4382.572155] ? _copy_from_user+0x66/0x90 [ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs] [ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs] [ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570 [ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0 [ 4382.574379] ? _raw_spin_unlock+0x24/0x30 [ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0 [ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0 [ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [ 4382.576020] do_vfs_ioctl+0xa2/0x6f0 [ 4382.576405] ksys_ioctl+0x70/0x80 [ 4382.576776] __x64_sys_ioctl+0x16/0x20 [ 4382.577137] do_syscall_64+0x60/0x1b0 [ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe (...) [ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7 [ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003 [ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044 [ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010 [ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050 [ 4382.580792] irq event stamp: 0 [ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null) [ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null) [ 4382.582413] ---[ end trace d3c188e3e9367382 ]--- [ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure [ 4382.624295] BTRFS info (device sdc): forced readonly Fix this by locking the source range before searching for the file extent items in the fs tree, since the relocation process will try to lock the range a file extent item represents before updating it with the new extent location. Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 19:43:07 +08:00
btrfs_double_extent_lock(src, off, inode, destoff, len);
ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
Btrfs: fix race between reflink/dedupe and relocation The recent rework that makes btrfs' remap_file_range operation use the generic helper generic_remap_file_range_prep() introduced a race between relocation and reflinking (for both cloning and deduplication) the file extents between the source and destination inodes. This happens because we no longer lock the source range anymore, and we do not lock it anymore because we wait for direct IO writes and writeback to complete early on the code path right after locking the inodes, which guarantees no other file operations interfere with the reflinking. However there is one exception which is relocation, since it replaces the byte number of file extents items in the fs tree after locking the range the file extent items represent. This is a problem because after finding each file extent to clone in the fs tree, the reflink process copies the file extent item into a local buffer, releases the search path, inserts new file extent items in the destination range and then increments the reference count for the extent mentioned in the file extent item that it previously copied to the buffer. If right after copying the file extent item into the buffer and releasing the path the relocation process updates the file extent item to point to the new extent, the reflink process ends up creating a delayed reference to increment the reference count of the old extent, for which the relocation process already created a delayed reference to drop it. This results in failure to run delayed references because we will attempt to increment the count of a reference that was already dropped. This is illustrated by the following diagram: CPU 1 CPU 2 relocation is running btrfs_clone_files() btrfs_clone() --> finds extent item in source range point to extent at bytenr X --> copies it into a local buffer --> releases path replace_file_extents() --> successfully locks the range represented by the file extent item --> replaces disk_bytenr field in the file extent item with some other value Y --> creates delayed reference to increment reference count for extent at bytenr Y --> creates delayed reference to drop the extent at bytenr X --> starts transaction --> creates delayed reference to increment extent at bytenr X <delayed references are run, due to a transaction commit for example, and the transaction is aborted with -EIO because we attempt to increment reference count for the extent at bytenr X after we freed it> When this race is hit the running transaction ends up getting aborted with an -EIO error and a trace like the following is produced: [ 4382.553858] WARNING: CPU: 2 PID: 3648 at fs/btrfs/extent-tree.c:1552 lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556293] CPU: 2 PID: 3648 Comm: btrfs Tainted: G W 4.20.0-rc6-btrfs-next-41 #1 [ 4382.556294] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.2-0-gf9626ccb91-prebuilt.qemu-project.org 04/01/2014 [ 4382.556308] RIP: 0010:lookup_inline_extent_backref+0x4f4/0x650 [btrfs] (...) [ 4382.556310] RSP: 0018:ffffac784408f738 EFLAGS: 00010202 [ 4382.556311] RAX: 0000000000000001 RBX: ffff8980673c3a48 RCX: 0000000000000001 [ 4382.556312] RDX: 0000000000000008 RSI: 0000000000000000 RDI: 0000000000000000 [ 4382.556312] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000001 [ 4382.556313] R10: 0000000000000001 R11: ffff897f40000000 R12: 0000000000001000 [ 4382.556313] R13: 00000000c224f000 R14: ffff89805de9bd40 R15: ffff8980453f4548 [ 4382.556315] FS: 00007f5e759178c0(0000) GS:ffff89807b300000(0000) knlGS:0000000000000000 [ 4382.563130] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 4382.563562] CR2: 00007f2e9789fcbc CR3: 0000000120512001 CR4: 00000000003606e0 [ 4382.564005] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 4382.564451] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 4382.564887] Call Trace: [ 4382.565343] insert_inline_extent_backref+0x55/0xe0 [btrfs] [ 4382.565796] __btrfs_inc_extent_ref.isra.60+0x88/0x260 [btrfs] [ 4382.566249] ? __btrfs_run_delayed_refs+0x93/0x1650 [btrfs] [ 4382.566702] __btrfs_run_delayed_refs+0xa22/0x1650 [btrfs] [ 4382.567162] btrfs_run_delayed_refs+0x7e/0x1d0 [btrfs] [ 4382.567623] btrfs_commit_transaction+0x50/0x9c0 [btrfs] [ 4382.568112] ? _raw_spin_unlock+0x24/0x30 [ 4382.568557] ? block_rsv_release_bytes+0x14e/0x410 [btrfs] [ 4382.569006] create_subvol+0x3c8/0x830 [btrfs] [ 4382.569461] ? btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.569906] btrfs_mksubvol+0x317/0x600 [btrfs] [ 4382.570383] ? rcu_sync_lockdep_assert+0xe/0x60 [ 4382.570822] ? __sb_start_write+0xd4/0x1c0 [ 4382.571262] ? mnt_want_write_file+0x24/0x50 [ 4382.571712] btrfs_ioctl_snap_create_transid+0x117/0x1a0 [btrfs] [ 4382.572155] ? _copy_from_user+0x66/0x90 [ 4382.572602] btrfs_ioctl_snap_create+0x66/0x80 [btrfs] [ 4382.573052] btrfs_ioctl+0x7c1/0x30e0 [btrfs] [ 4382.573502] ? mem_cgroup_commit_charge+0x8b/0x570 [ 4382.573946] ? do_raw_spin_unlock+0x49/0xc0 [ 4382.574379] ? _raw_spin_unlock+0x24/0x30 [ 4382.574803] ? __handle_mm_fault+0xf29/0x12d0 [ 4382.575215] ? do_vfs_ioctl+0xa2/0x6f0 [ 4382.575622] ? btrfs_ioctl_get_supported_features+0x30/0x30 [btrfs] [ 4382.576020] do_vfs_ioctl+0xa2/0x6f0 [ 4382.576405] ksys_ioctl+0x70/0x80 [ 4382.576776] __x64_sys_ioctl+0x16/0x20 [ 4382.577137] do_syscall_64+0x60/0x1b0 [ 4382.577488] entry_SYSCALL_64_after_hwframe+0x49/0xbe (...) [ 4382.578837] RSP: 002b:00007ffe04bf64c8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010 [ 4382.579174] RAX: ffffffffffffffda RBX: 00005564136f3050 RCX: 00007f5e74724dd7 [ 4382.579505] RDX: 00007ffe04bf64d0 RSI: 000000005000940e RDI: 0000000000000003 [ 4382.579848] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000044 [ 4382.580164] R10: 0000000000000541 R11: 0000000000000202 R12: 00005564136f3010 [ 4382.580477] R13: 0000000000000003 R14: 00005564136f3035 R15: 00005564136f3050 [ 4382.580792] irq event stamp: 0 [ 4382.581106] hardirqs last enabled at (0): [<0000000000000000>] (null) [ 4382.581441] hardirqs last disabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.581772] softirqs last enabled at (0): [<ffffffff8d085842>] copy_process.part.32+0x6e2/0x2320 [ 4382.582095] softirqs last disabled at (0): [<0000000000000000>] (null) [ 4382.582413] ---[ end trace d3c188e3e9367382 ]--- [ 4382.623855] BTRFS: error (device sdc) in btrfs_run_delayed_refs:2981: errno=-5 IO failure [ 4382.624295] BTRFS info (device sdc): forced readonly Fix this by locking the source range before searching for the file extent items in the fs tree, since the relocation process will try to lock the range a file extent item represents before updating it with the new extent location. Fixes: 34a28e3d7753 ("Btrfs: use generic_remap_file_range_prep() for cloning and deduplication") Signed-off-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-01-08 19:43:07 +08:00
btrfs_double_extent_unlock(src, off, inode, destoff, len);
Btrfs: ensure readers see new data after a clone operation We were cleaning the clone target file range from the page cache before we did replace the file extent items in the fs tree. This was racy, as right after cleaning the relevant range from the page cache and before replacing the file extent items, a read against that range could be performed by another task and populate again the page cache with stale data (stale after the cloning finishes). This would result in reads after the clone operation successfully finishes to get old data (and potentially for a very long time). Therefore evict the pages after replacing the file extent items, so that subsequent reads will always get the new data. Similarly, we were prone to races while cloning the file extent items because we weren't locking the target range and wait for any existing ordered extents against that range to complete. It was possible that after cloning the extent items, a write operation that was performed before the clone operation and overlaps the same range, would end up undoing all or part of the work the clone operation did (a worker task running inode.c:btrfs_finish_ordered_io). Therefore lock the target range in the io tree, wait for all pending ordered extents against that range to finish and then safely perform the cloning. The issue of reading stale data after the clone operation is easy to reproduce by running the following C program in a loop until it exits with return value 1. #include <unistd.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <errno.h> #include <pthread.h> #include <fcntl.h> #include <assert.h> #include <asm/types.h> #include <linux/ioctl.h> #include <sys/stat.h> #include <sys/types.h> #include <sys/ioctl.h> #define SRC_FILE "/mnt/sdd/foo" #define DST_FILE "/mnt/sdd/bar" #define FILE_SIZE (16 * 1024) #define PATTERN_SRC 'X' #define PATTERN_DST 'Y' struct btrfs_ioctl_clone_range_args { __s64 src_fd; __u64 src_offset, src_length; __u64 dest_offset; }; #define BTRFS_IOCTL_MAGIC 0x94 #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ struct btrfs_ioctl_clone_range_args) static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; static int clone_done = 0; static int reader_ready = 0; static int stale_data = 0; static void *reader_loop(void *arg) { char buf[4096], want_buf[4096]; memset(want_buf, PATTERN_SRC, 4096); pthread_mutex_lock(&mutex); reader_ready = 1; pthread_mutex_unlock(&mutex); while (1) { int done, fd, ret; fd = open(DST_FILE, O_RDONLY); assert(fd != -1); pthread_mutex_lock(&mutex); done = clone_done; pthread_mutex_unlock(&mutex); ret = read(fd, buf, 4096); assert(ret == 4096); close(fd); if (done) { ret = memcmp(buf, want_buf, 4096); if (ret == 0) { printf("Found new content\n"); } else { printf("Found old content\n"); pthread_mutex_lock(&mutex); stale_data = 1; pthread_mutex_unlock(&mutex); } break; } } return NULL; } int main(int argc, char *argv[]) { pthread_t reader; int ret, i, fd; struct btrfs_ioctl_clone_range_args clone_args; int fd1, fd2; ret = remove(SRC_FILE); if (ret == -1 && errno != ENOENT) { fprintf(stderr, "Error deleting src file: %s\n", strerror(errno)); return 1; } ret = remove(DST_FILE); if (ret == -1 && errno != ENOENT) { fprintf(stderr, "Error deleting dst file: %s\n", strerror(errno)); return 1; } fd = open(SRC_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU); assert(fd != -1); for (i = 0; i < FILE_SIZE; i++) { char c = PATTERN_SRC; ret = write(fd, &c, 1); assert(ret == 1); } close(fd); fd = open(DST_FILE, O_CREAT | O_WRONLY | O_TRUNC, S_IRWXU); assert(fd != -1); for (i = 0; i < FILE_SIZE; i++) { char c = PATTERN_DST; ret = write(fd, &c, 1); assert(ret == 1); } close(fd); sync(); ret = pthread_create(&reader, NULL, reader_loop, NULL); assert(ret == 0); while (1) { int r; pthread_mutex_lock(&mutex); r = reader_ready; pthread_mutex_unlock(&mutex); if (r) break; } fd1 = open(SRC_FILE, O_RDONLY); if (fd1 < 0) { fprintf(stderr, "Error open src file: %s\n", strerror(errno)); return 1; } fd2 = open(DST_FILE, O_RDWR); if (fd2 < 0) { fprintf(stderr, "Error open dst file: %s\n", strerror(errno)); return 1; } clone_args.src_fd = fd1; clone_args.src_offset = 0; clone_args.src_length = 4096; clone_args.dest_offset = 0; ret = ioctl(fd2, BTRFS_IOC_CLONE_RANGE, &clone_args); assert(ret == 0); close(fd1); close(fd2); pthread_mutex_lock(&mutex); clone_done = 1; pthread_mutex_unlock(&mutex); ret = pthread_join(reader, NULL); assert(ret == 0); pthread_mutex_lock(&mutex); ret = stale_data ? 1 : 0; pthread_mutex_unlock(&mutex); return ret; } Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Signed-off-by: Chris Mason <clm@fb.com>
2014-05-23 12:03:34 +08:00
/*
* Truncate page cache pages so that future reads will see the cloned
* data immediately and not the previous data.
*/
truncate_inode_pages_range(&inode->i_data,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
round_down(destoff, PAGE_SIZE),
round_up(destoff + len, PAGE_SIZE) - 1);
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
return ret;
}
static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *len, unsigned int remap_flags)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
bool same_inode = inode_out == inode_in;
u64 wb_len;
int ret;
if (!(remap_flags & REMAP_FILE_DEDUP)) {
struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
if (btrfs_root_readonly(root_out))
return -EROFS;
if (file_in->f_path.mnt != file_out->f_path.mnt ||
inode_in->i_sb != inode_out->i_sb)
return -EXDEV;
}
/* don't make the dst file partly checksummed */
if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
(BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
return -EINVAL;
}
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
/*
* Now that the inodes are locked, we need to start writeback ourselves
* and can not rely on the writeback from the VFS's generic helper
* generic_remap_file_range_prep() because:
*
* 1) For compression we must call filemap_fdatawrite_range() range
* twice (btrfs_fdatawrite_range() does it for us), and the generic
* helper only calls it once;
*
* 2) filemap_fdatawrite_range(), called by the generic helper only
* waits for the writeback to complete, i.e. for IO to be done, and
* not for the ordered extents to complete. We need to wait for them
* to complete so that new file extent items are in the fs tree.
*/
if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
else
wb_len = ALIGN(*len, bs);
/*
* Since we don't lock ranges, wait for ongoing lockless dio writes (as
* any in progress could create its ordered extents after we wait for
* existing ordered extents below).
*/
inode_dio_wait(inode_in);
if (!same_inode)
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
inode_dio_wait(inode_out);
btrfs: Flush before reflinking any extent to prevent NOCOW write falling back to COW without data reservation [BUG] The following script can cause unexpected fsync failure: #!/bin/bash dev=/dev/test/test mnt=/mnt/btrfs mkfs.btrfs -f $dev -b 512M > /dev/null mount $dev $mnt -o nospace_cache # Prealloc one extent xfs_io -f -c "falloc 8k 64m" $mnt/file1 # Fill the remaining data space xfs_io -f -c "pwrite 0 -b 4k 512M" $mnt/padding sync # Write into the prealloc extent xfs_io -c "pwrite 1m 16m" $mnt/file1 # Reflink then fsync, fsync would fail due to ENOSPC xfs_io -c "reflink $mnt/file1 8k 0 4k" -c "fsync" $mnt/file1 umount $dev The fsync fails with ENOSPC, and the last page of the buffered write is lost. [CAUSE] This is caused by: - Btrfs' back reference only has extent level granularity So write into shared extent must be COWed even only part of the extent is shared. So for above script we have: - fallocate Create a preallocated extent where we can do NOCOW write. - fill all the remaining data and unallocated space - buffered write into preallocated space As we have not enough space available for data and the extent is not shared (yet) we fall into NOCOW mode. - reflink Now part of the large preallocated extent is shared, later write into that extent must be COWed. - fsync triggers writeback But now the extent is shared and therefore we must fallback into COW mode, which fails with ENOSPC since there's not enough space to allocate data extents. [WORKAROUND] The workaround is to ensure any buffered write in the related extents (not just the reflink source range) get flushed before reflink/dedupe, so that NOCOW writes succeed that happened before reflinking succeed. The workaround is expensive, we could do it better by only flushing NOCOW range, but that needs extra accounting for NOCOW range. For now, fix the possible data loss first. Reviewed-by: Filipe Manana <fdmanana@suse.com> Signed-off-by: Qu Wenruo <wqu@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2019-05-08 18:49:58 +08:00
/*
* Workaround to make sure NOCOW buffered write reach disk as NOCOW.
*
* Btrfs' back references do not have a block level granularity, they
* work at the whole extent level.
* NOCOW buffered write without data space reserved may not be able
* to fall back to CoW due to lack of data space, thus could cause
* data loss.
*
* Here we take a shortcut by flushing the whole inode, so that all
* nocow write should reach disk as nocow before we increase the
* reference of the extent. We could do better by only flushing NOCOW
* data, but that needs extra accounting.
*
* Also we don't need to check ASYNC_EXTENT, as async extent will be
* CoWed anyway, not affecting nocow part.
*/
ret = filemap_flush(inode_in->i_mapping);
if (ret < 0)
return ret;
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
wb_len);
if (ret < 0)
return ret;
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
wb_len);
if (ret < 0)
return ret;
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
len, remap_flags);
}
loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
{
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
struct inode *src_inode = file_inode(src_file);
struct inode *dst_inode = file_inode(dst_file);
bool same_inode = dst_inode == src_inode;
int ret;
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
if (same_inode)
inode_lock(src_inode);
else
lock_two_nondirectories(src_inode, dst_inode);
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
&len, remap_flags);
if (ret < 0 || len == 0)
goto out_unlock;
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
if (remap_flags & REMAP_FILE_DEDUP)
ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
else
ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
out_unlock:
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
if (same_inode)
inode_unlock(src_inode);
else
unlock_two_nondirectories(src_inode, dst_inode);
Btrfs: use generic_remap_file_range_prep() for cloning and deduplication Since cloning and deduplication are no longer Btrfs specific operations, we now have generic code to handle parameter validation, compare file ranges used for deduplication, clear capabilities when cloning, etc. This change makes Btrfs use it, eliminating a lot of code in Btrfs and also fixing a few bugs, such as: 1) When cloning, the destination file's capabilities were not dropped (the fstest generic/513 tests this); 2) We were not checking if the destination file is immutable; 3) Not checking if either the source or destination files are swap files (swap file support is coming soon for Btrfs); 4) System limits were not checked (resource limits and O_LARGEFILE). Note that the generic helper generic_remap_file_range_prep() does start and waits for writeback by calling filemap_write_and_wait_range(), however that is not enough for Btrfs for two reasons: 1) With compression, we need to start writeback twice in order to get the pages marked for writeback and ordered extents created; 2) filemap_write_and_wait_range() (and all its other variants) only waits for the IO to complete, but we need to wait for the ordered extents to finish, so that when we do the actual reflinking operations the file extent items are in the fs tree. This is also important due to the fact that the generic helper, for the deduplication case, compares the contents of the pages in the requested range, which might require reading extents from disk in the very unlikely case that pages get invalidated after writeback finishes (so the file extent items must be up to date in the fs tree). Since these reasons are specific to Btrfs we have to do it in the Btrfs code before calling generic_remap_file_range_prep(). This also results in a simpler way of dealing with existing delalloc in the source/target ranges, specially for the deduplication case where we used to lock all the pages first and then if we found any dealloc for the range, or ordered extent, we would unlock the pages trigger writeback and wait for ordered extents to complete, then lock all the pages again and check if deduplication can be done. So now we get a simpler approach: lock the inodes, then trigger writeback and then wait for ordered extents to complete. So make btrfs use generic_remap_file_range_prep() (XFS and OCFS2 use it) to eliminate duplicated code, fix a few bugs and benefit from future bug fixes done there - for example the recent clone and dedupe bugs involving reflinking a partial EOF block got a counterpart fix in the generic helper, since it affected all filesystems supporting these operations, so we no longer need special checks in Btrfs for them. Reviewed-by: Nikolay Borisov <nborisov@suse.com> Signed-off-by: Filipe Manana <fdmanana@suse.com> Reviewed-by: David Sterba <dsterba@suse.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-12-07 23:25:38 +08:00
return ret < 0 ? ret : len;
}
static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root *new_root;
struct btrfs_dir_item *di;
struct btrfs_trans_handle *trans;
struct btrfs_path *path;
struct btrfs_key location;
struct btrfs_disk_key disk_key;
u64 objectid = 0;
u64 dir_id;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
ret = mnt_want_write_file(file);
if (ret)
return ret;
if (copy_from_user(&objectid, argp, sizeof(objectid))) {
ret = -EFAULT;
goto out;
}
if (!objectid)
objectid = BTRFS_FS_TREE_OBJECTID;
location.objectid = objectid;
location.type = BTRFS_ROOT_ITEM_KEY;
location.offset = (u64)-1;
new_root = btrfs_read_fs_root_no_name(fs_info, &location);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
goto out;
}
if (!is_fstree(new_root->root_key.objectid)) {
ret = -ENOENT;
goto out;
}
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
path->leave_spinning = 1;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
btrfs_free_path(path);
ret = PTR_ERR(trans);
goto out;
}
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
dir_id, "default", 7, 1);
if (IS_ERR_OR_NULL(di)) {
btrfs_free_path(path);
btrfs_end_transaction(trans);
btrfs_err(fs_info,
"Umm, you don't have the default diritem, this isn't going to work");
ret = -ENOENT;
goto out;
}
btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_free_path(path);
btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
btrfs_end_transaction(trans);
out:
mnt_drop_write_file(file);
return ret;
}
static void get_block_group_info(struct list_head *groups_list,
struct btrfs_ioctl_space_info *space)
{
struct btrfs_block_group *block_group;
space->total_bytes = 0;
space->used_bytes = 0;
space->flags = 0;
list_for_each_entry(block_group, groups_list, list) {
space->flags = block_group->flags;
space->total_bytes += block_group->length;
space->used_bytes += block_group->used;
}
}
static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_space_args space_args;
struct btrfs_ioctl_space_info space;
struct btrfs_ioctl_space_info *dest;
struct btrfs_ioctl_space_info *dest_orig;
struct btrfs_ioctl_space_info __user *user_dest;
struct btrfs_space_info *info;
static const u64 types[] = {
BTRFS_BLOCK_GROUP_DATA,
BTRFS_BLOCK_GROUP_SYSTEM,
BTRFS_BLOCK_GROUP_METADATA,
BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA
};
int num_types = 4;
int alloc_size;
int ret = 0;
u64 slot_count = 0;
int i, c;
if (copy_from_user(&space_args,
(struct btrfs_ioctl_space_args __user *)arg,
sizeof(space_args)))
return -EFAULT;
for (i = 0; i < num_types; i++) {
struct btrfs_space_info *tmp;
info = NULL;
rcu_read_lock();
list_for_each_entry_rcu(tmp, &fs_info->space_info,
list) {
if (tmp->flags == types[i]) {
info = tmp;
break;
}
}
rcu_read_unlock();
if (!info)
continue;
down_read(&info->groups_sem);
for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
if (!list_empty(&info->block_groups[c]))
slot_count++;
}
up_read(&info->groups_sem);
}
/*
* Global block reserve, exported as a space_info
*/
slot_count++;
/* space_slots == 0 means they are asking for a count */
if (space_args.space_slots == 0) {
space_args.total_spaces = slot_count;
goto out;
}
slot_count = min_t(u64, space_args.space_slots, slot_count);
alloc_size = sizeof(*dest) * slot_count;
/* we generally have at most 6 or so space infos, one for each raid
* level. So, a whole page should be more than enough for everyone
*/
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
if (alloc_size > PAGE_SIZE)
return -ENOMEM;
space_args.total_spaces = 0;
dest = kmalloc(alloc_size, GFP_KERNEL);
if (!dest)
return -ENOMEM;
dest_orig = dest;
/* now we have a buffer to copy into */
for (i = 0; i < num_types; i++) {
struct btrfs_space_info *tmp;
if (!slot_count)
break;
info = NULL;
rcu_read_lock();
list_for_each_entry_rcu(tmp, &fs_info->space_info,
list) {
if (tmp->flags == types[i]) {
info = tmp;
break;
}
}
rcu_read_unlock();
if (!info)
continue;
down_read(&info->groups_sem);
for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
if (!list_empty(&info->block_groups[c])) {
get_block_group_info(&info->block_groups[c],
&space);
memcpy(dest, &space, sizeof(space));
dest++;
space_args.total_spaces++;
slot_count--;
}
if (!slot_count)
break;
}
up_read(&info->groups_sem);
}
/*
* Add global block reserve
*/
if (slot_count) {
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
spin_lock(&block_rsv->lock);
space.total_bytes = block_rsv->size;
space.used_bytes = block_rsv->size - block_rsv->reserved;
spin_unlock(&block_rsv->lock);
space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
memcpy(dest, &space, sizeof(space));
space_args.total_spaces++;
}
user_dest = (struct btrfs_ioctl_space_info __user *)
(arg + sizeof(struct btrfs_ioctl_space_args));
if (copy_to_user(user_dest, dest_orig, alloc_size))
ret = -EFAULT;
kfree(dest_orig);
out:
if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
ret = -EFAULT;
return ret;
}
static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
void __user *argp)
Btrfs: add START_SYNC, WAIT_SYNC ioctls START_SYNC will start a sync/commit, but not wait for it to complete. Any modification started after the ioctl returns is guaranteed not to be included in the commit. If a non-NULL pointer is passed, the transaction id will be returned to userspace. WAIT_SYNC will wait for any in-progress commit to complete. If a transaction id is specified, the ioctl will block and then return (success) when the specified transaction has committed. If it has already committed when we call the ioctl, it returns immediately. If the specified transaction doesn't exist, it returns EINVAL. If no transaction id is specified, WAIT_SYNC will wait for the currently committing transaction to finish it's commit to disk. If there is no currently committing transaction, it returns success. These ioctls are useful for applications which want to impose an ordering on when fs modifications reach disk, but do not want to wait for the full (slow) commit process to do so. Picky callers can take the transid returned by START_SYNC and feed it to WAIT_SYNC, and be certain to wait only as long as necessary for the transaction _they_ started to reach disk. Sloppy callers can START_SYNC and WAIT_SYNC without a transid, and provided they didn't wait too long between the calls, they will get the same result. However, if a second commit starts before they call WAIT_SYNC, they may end up waiting longer for it to commit as well. Even so, a START_SYNC+WAIT_SYNC still guarantees that any operation completed before the START_SYNC reaches disk. Signed-off-by: Sage Weil <sage@newdream.net> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-30 03:41:32 +08:00
{
struct btrfs_trans_handle *trans;
u64 transid;
int ret;
Btrfs: add START_SYNC, WAIT_SYNC ioctls START_SYNC will start a sync/commit, but not wait for it to complete. Any modification started after the ioctl returns is guaranteed not to be included in the commit. If a non-NULL pointer is passed, the transaction id will be returned to userspace. WAIT_SYNC will wait for any in-progress commit to complete. If a transaction id is specified, the ioctl will block and then return (success) when the specified transaction has committed. If it has already committed when we call the ioctl, it returns immediately. If the specified transaction doesn't exist, it returns EINVAL. If no transaction id is specified, WAIT_SYNC will wait for the currently committing transaction to finish it's commit to disk. If there is no currently committing transaction, it returns success. These ioctls are useful for applications which want to impose an ordering on when fs modifications reach disk, but do not want to wait for the full (slow) commit process to do so. Picky callers can take the transid returned by START_SYNC and feed it to WAIT_SYNC, and be certain to wait only as long as necessary for the transaction _they_ started to reach disk. Sloppy callers can START_SYNC and WAIT_SYNC without a transid, and provided they didn't wait too long between the calls, they will get the same result. However, if a second commit starts before they call WAIT_SYNC, they may end up waiting longer for it to commit as well. Even so, a START_SYNC+WAIT_SYNC still guarantees that any operation completed before the START_SYNC reaches disk. Signed-off-by: Sage Weil <sage@newdream.net> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-30 03:41:32 +08:00
trans = btrfs_attach_transaction_barrier(root);
if (IS_ERR(trans)) {
if (PTR_ERR(trans) != -ENOENT)
return PTR_ERR(trans);
/* No running transaction, don't bother */
transid = root->fs_info->last_trans_committed;
goto out;
}
Btrfs: add START_SYNC, WAIT_SYNC ioctls START_SYNC will start a sync/commit, but not wait for it to complete. Any modification started after the ioctl returns is guaranteed not to be included in the commit. If a non-NULL pointer is passed, the transaction id will be returned to userspace. WAIT_SYNC will wait for any in-progress commit to complete. If a transaction id is specified, the ioctl will block and then return (success) when the specified transaction has committed. If it has already committed when we call the ioctl, it returns immediately. If the specified transaction doesn't exist, it returns EINVAL. If no transaction id is specified, WAIT_SYNC will wait for the currently committing transaction to finish it's commit to disk. If there is no currently committing transaction, it returns success. These ioctls are useful for applications which want to impose an ordering on when fs modifications reach disk, but do not want to wait for the full (slow) commit process to do so. Picky callers can take the transid returned by START_SYNC and feed it to WAIT_SYNC, and be certain to wait only as long as necessary for the transaction _they_ started to reach disk. Sloppy callers can START_SYNC and WAIT_SYNC without a transid, and provided they didn't wait too long between the calls, they will get the same result. However, if a second commit starts before they call WAIT_SYNC, they may end up waiting longer for it to commit as well. Even so, a START_SYNC+WAIT_SYNC still guarantees that any operation completed before the START_SYNC reaches disk. Signed-off-by: Sage Weil <sage@newdream.net> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-30 03:41:32 +08:00
transid = trans->transid;
ret = btrfs_commit_transaction_async(trans, 0);
if (ret) {
btrfs_end_transaction(trans);
return ret;
}
out:
Btrfs: add START_SYNC, WAIT_SYNC ioctls START_SYNC will start a sync/commit, but not wait for it to complete. Any modification started after the ioctl returns is guaranteed not to be included in the commit. If a non-NULL pointer is passed, the transaction id will be returned to userspace. WAIT_SYNC will wait for any in-progress commit to complete. If a transaction id is specified, the ioctl will block and then return (success) when the specified transaction has committed. If it has already committed when we call the ioctl, it returns immediately. If the specified transaction doesn't exist, it returns EINVAL. If no transaction id is specified, WAIT_SYNC will wait for the currently committing transaction to finish it's commit to disk. If there is no currently committing transaction, it returns success. These ioctls are useful for applications which want to impose an ordering on when fs modifications reach disk, but do not want to wait for the full (slow) commit process to do so. Picky callers can take the transid returned by START_SYNC and feed it to WAIT_SYNC, and be certain to wait only as long as necessary for the transaction _they_ started to reach disk. Sloppy callers can START_SYNC and WAIT_SYNC without a transid, and provided they didn't wait too long between the calls, they will get the same result. However, if a second commit starts before they call WAIT_SYNC, they may end up waiting longer for it to commit as well. Even so, a START_SYNC+WAIT_SYNC still guarantees that any operation completed before the START_SYNC reaches disk. Signed-off-by: Sage Weil <sage@newdream.net> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-30 03:41:32 +08:00
if (argp)
if (copy_to_user(argp, &transid, sizeof(transid)))
return -EFAULT;
return 0;
}
static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
void __user *argp)
Btrfs: add START_SYNC, WAIT_SYNC ioctls START_SYNC will start a sync/commit, but not wait for it to complete. Any modification started after the ioctl returns is guaranteed not to be included in the commit. If a non-NULL pointer is passed, the transaction id will be returned to userspace. WAIT_SYNC will wait for any in-progress commit to complete. If a transaction id is specified, the ioctl will block and then return (success) when the specified transaction has committed. If it has already committed when we call the ioctl, it returns immediately. If the specified transaction doesn't exist, it returns EINVAL. If no transaction id is specified, WAIT_SYNC will wait for the currently committing transaction to finish it's commit to disk. If there is no currently committing transaction, it returns success. These ioctls are useful for applications which want to impose an ordering on when fs modifications reach disk, but do not want to wait for the full (slow) commit process to do so. Picky callers can take the transid returned by START_SYNC and feed it to WAIT_SYNC, and be certain to wait only as long as necessary for the transaction _they_ started to reach disk. Sloppy callers can START_SYNC and WAIT_SYNC without a transid, and provided they didn't wait too long between the calls, they will get the same result. However, if a second commit starts before they call WAIT_SYNC, they may end up waiting longer for it to commit as well. Even so, a START_SYNC+WAIT_SYNC still guarantees that any operation completed before the START_SYNC reaches disk. Signed-off-by: Sage Weil <sage@newdream.net> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-30 03:41:32 +08:00
{
u64 transid;
if (argp) {
if (copy_from_user(&transid, argp, sizeof(transid)))
return -EFAULT;
} else {
transid = 0; /* current trans */
}
return btrfs_wait_for_commit(fs_info, transid);
Btrfs: add START_SYNC, WAIT_SYNC ioctls START_SYNC will start a sync/commit, but not wait for it to complete. Any modification started after the ioctl returns is guaranteed not to be included in the commit. If a non-NULL pointer is passed, the transaction id will be returned to userspace. WAIT_SYNC will wait for any in-progress commit to complete. If a transaction id is specified, the ioctl will block and then return (success) when the specified transaction has committed. If it has already committed when we call the ioctl, it returns immediately. If the specified transaction doesn't exist, it returns EINVAL. If no transaction id is specified, WAIT_SYNC will wait for the currently committing transaction to finish it's commit to disk. If there is no currently committing transaction, it returns success. These ioctls are useful for applications which want to impose an ordering on when fs modifications reach disk, but do not want to wait for the full (slow) commit process to do so. Picky callers can take the transid returned by START_SYNC and feed it to WAIT_SYNC, and be certain to wait only as long as necessary for the transaction _they_ started to reach disk. Sloppy callers can START_SYNC and WAIT_SYNC without a transid, and provided they didn't wait too long between the calls, they will get the same result. However, if a second commit starts before they call WAIT_SYNC, they may end up waiting longer for it to commit as well. Even so, a START_SYNC+WAIT_SYNC still guarantees that any operation completed before the START_SYNC reaches disk. Signed-off-by: Sage Weil <sage@newdream.net> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-30 03:41:32 +08:00
}
static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
{
struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
struct btrfs_ioctl_scrub_args *sa;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
sa = memdup_user(arg, sizeof(*sa));
if (IS_ERR(sa))
return PTR_ERR(sa);
if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
ret = mnt_want_write_file(file);
if (ret)
goto out;
}
ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
&sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
0);
/*
* Copy scrub args to user space even if btrfs_scrub_dev() returned an
* error. This is important as it allows user space to know how much
* progress scrub has done. For example, if scrub is canceled we get
* -ECANCELED from btrfs_scrub_dev() and return that error back to user
* space. Later user space can inspect the progress from the structure
* btrfs_ioctl_scrub_args and resume scrub from where it left off
* previously (btrfs-progs does this).
* If we fail to copy the btrfs_ioctl_scrub_args structure to user space
* then return -EFAULT to signal the structure was not copied or it may
* be corrupt and unreliable due to a partial copy.
*/
if (copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
if (!(sa->flags & BTRFS_SCRUB_READONLY))
mnt_drop_write_file(file);
out:
kfree(sa);
return ret;
}
static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
return btrfs_scrub_cancel(fs_info);
}
static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_scrub_args *sa;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
sa = memdup_user(arg, sizeof(*sa));
if (IS_ERR(sa))
return PTR_ERR(sa);
ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
kfree(sa);
return ret;
}
static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_get_dev_stats *sa;
int ret;
sa = memdup_user(arg, sizeof(*sa));
if (IS_ERR(sa))
return PTR_ERR(sa);
if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
kfree(sa);
return -EPERM;
}
ret = btrfs_get_dev_stats(fs_info, sa);
if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
kfree(sa);
return ret;
}
static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_dev_replace_args *p;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
p = memdup_user(arg, sizeof(*p));
if (IS_ERR(p))
return PTR_ERR(p);
switch (p->cmd) {
case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
if (sb_rdonly(fs_info->sb)) {
ret = -EROFS;
goto out;
}
if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
} else {
ret = btrfs_dev_replace_by_ioctl(fs_info, p);
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
}
break;
case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
btrfs_dev_replace_status(fs_info, p);
ret = 0;
break;
case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
p->result = btrfs_dev_replace_cancel(fs_info);
ret = 0;
break;
default:
ret = -EINVAL;
break;
}
if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p)))
ret = -EFAULT;
out:
kfree(p);
return ret;
}
static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
{
int ret = 0;
int i;
u64 rel_ptr;
int size;
struct btrfs_ioctl_ino_path_args *ipa = NULL;
struct inode_fs_paths *ipath = NULL;
struct btrfs_path *path;
if (!capable(CAP_DAC_READ_SEARCH))
return -EPERM;
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
ipa = memdup_user(arg, sizeof(*ipa));
if (IS_ERR(ipa)) {
ret = PTR_ERR(ipa);
ipa = NULL;
goto out;
}
size = min_t(u32, ipa->size, 4096);
ipath = init_ipath(size, root, path);
if (IS_ERR(ipath)) {
ret = PTR_ERR(ipath);
ipath = NULL;
goto out;
}
ret = paths_from_inode(ipa->inum, ipath);
if (ret < 0)
goto out;
for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
rel_ptr = ipath->fspath->val[i] -
(u64)(unsigned long)ipath->fspath->val;
ipath->fspath->val[i] = rel_ptr;
}
ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
ipath->fspath, size);
if (ret) {
ret = -EFAULT;
goto out;
}
out:
btrfs_free_path(path);
free_ipath(ipath);
kfree(ipa);
return ret;
}
static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
{
struct btrfs_data_container *inodes = ctx;
const size_t c = 3 * sizeof(u64);
if (inodes->bytes_left >= c) {
inodes->bytes_left -= c;
inodes->val[inodes->elem_cnt] = inum;
inodes->val[inodes->elem_cnt + 1] = offset;
inodes->val[inodes->elem_cnt + 2] = root;
inodes->elem_cnt += 3;
} else {
inodes->bytes_missing += c - inodes->bytes_left;
inodes->bytes_left = 0;
inodes->elem_missed += 3;
}
return 0;
}
static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2 Now that check_extent_in_eb()'s extent offset filter can be turned off, we need a way to do it from userspace. Add a 'flags' field to the btrfs_logical_ino_args structure to disable extent offset filtering, taking the place of one of the existing reserved[] fields. Previous versions of LOGICAL_INO neglected to check whether any of the reserved fields have non-zero values. Assigning meaning to those fields now may change the behavior of existing programs that left these fields uninitialized. The lack of a zero check also means that new programs have no way to know whether the kernel is honoring the flags field. To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can use the same argument layout as LOGICAL_INO, but shorten the reserved[] array by one element and turn it into the 'flags' field. The V2 ioctl explicitly checks that reserved fields and unsupported flag bits are zero so that userspace can negotiate future feature bits as they are defined. Since the memory layouts of the two ioctls' arguments are compatible, there is no need for a separate function for logical_to_ino_v2 (contrast with tree_search_v2 vs tree_search where the layout and code are quite different). A version parameter and an 'if' statement will suffice. Now that we have a flags field in logical_ino_args, add a flag BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want, and pass it down the stack to iterate_inodes_from_logical. Motivation and background, copied from the patchset cover letter: Suppose we have a file with one extent: root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a root@tester:~# sync Split the extent by overwriting it in the middle: root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a We should now have 3 extent refs to 2 extents, with one block unreachable. The extent tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 2 [...] item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53 extent refs 2 gen 29 flags DATA extent data backref root 5 objectid 261 offset 0 count 2 [...] item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53 extent refs 1 gen 30 flags DATA extent data backref root 5 objectid 261 offset 8192 count 1 [...] and the ref tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 5 [...] item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 0 nr 8192 ram 73728 extent compression(none) item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53 extent data disk byte 1103175680 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression(none) item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 12288 nr 61440 ram 73728 extent compression(none) [...] There are two references to the same extent with different, non-overlapping byte offsets: [------------------72K extent at 1103101952----------------------] [--8K----------------|--4K unreachable----|--60K-----------------] ^ ^ | | [--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--] | v [-----4K extent-----] at 1103175680 We want to find all of the references to extent bytenr 1103101952. Without the patch (and without running btrfs-debug-tree), we have to do it with 18 LOGICAL_INO calls: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO inode 261 offset 0 root 5 root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode inode 261 offset 0 root 5 inode 261 offset 4096 root 5 <- same extent ref as offset 0 (offset 8192 returns empty set, not reachable) inode 261 offset 12288 root 5 inode 261 offset 16384 root 5 \ inode 261 offset 20480 root 5 | inode 261 offset 24576 root 5 | inode 261 offset 28672 root 5 | inode 261 offset 32768 root 5 | inode 261 offset 36864 root 5 \ inode 261 offset 40960 root 5 > all the same extent ref as offset 12288. inode 261 offset 45056 root 5 / More processing required in userspace inode 261 offset 49152 root 5 | to figure out these are all duplicates. inode 261 offset 53248 root 5 | inode 261 offset 57344 root 5 | inode 261 offset 61440 root 5 | inode 261 offset 65536 root 5 | inode 261 offset 69632 root 5 / In the worst case the extents are 128MB long, and we have to do 32768 iterations of the loop to find one 4K extent ref. With the patch, we just use one call to map all refs to the extent at once: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO_V2 inode 261 offset 0 root 5 inode 261 offset 12288 root 5 The TREE_SEARCH ioctl allows userspace to retrieve the offset and extent bytenr fields easily once the root, inode and offset are known. This is sufficient information to build a complete map of the extent and all of its references. Userspace can use this information to make better choices to dedup or defrag. Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org> Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> [ copy background and motivation from cover letter ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-23 01:58:46 +08:00
void __user *arg, int version)
{
int ret = 0;
int size;
struct btrfs_ioctl_logical_ino_args *loi;
struct btrfs_data_container *inodes = NULL;
struct btrfs_path *path = NULL;
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2 Now that check_extent_in_eb()'s extent offset filter can be turned off, we need a way to do it from userspace. Add a 'flags' field to the btrfs_logical_ino_args structure to disable extent offset filtering, taking the place of one of the existing reserved[] fields. Previous versions of LOGICAL_INO neglected to check whether any of the reserved fields have non-zero values. Assigning meaning to those fields now may change the behavior of existing programs that left these fields uninitialized. The lack of a zero check also means that new programs have no way to know whether the kernel is honoring the flags field. To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can use the same argument layout as LOGICAL_INO, but shorten the reserved[] array by one element and turn it into the 'flags' field. The V2 ioctl explicitly checks that reserved fields and unsupported flag bits are zero so that userspace can negotiate future feature bits as they are defined. Since the memory layouts of the two ioctls' arguments are compatible, there is no need for a separate function for logical_to_ino_v2 (contrast with tree_search_v2 vs tree_search where the layout and code are quite different). A version parameter and an 'if' statement will suffice. Now that we have a flags field in logical_ino_args, add a flag BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want, and pass it down the stack to iterate_inodes_from_logical. Motivation and background, copied from the patchset cover letter: Suppose we have a file with one extent: root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a root@tester:~# sync Split the extent by overwriting it in the middle: root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a We should now have 3 extent refs to 2 extents, with one block unreachable. The extent tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 2 [...] item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53 extent refs 2 gen 29 flags DATA extent data backref root 5 objectid 261 offset 0 count 2 [...] item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53 extent refs 1 gen 30 flags DATA extent data backref root 5 objectid 261 offset 8192 count 1 [...] and the ref tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 5 [...] item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 0 nr 8192 ram 73728 extent compression(none) item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53 extent data disk byte 1103175680 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression(none) item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 12288 nr 61440 ram 73728 extent compression(none) [...] There are two references to the same extent with different, non-overlapping byte offsets: [------------------72K extent at 1103101952----------------------] [--8K----------------|--4K unreachable----|--60K-----------------] ^ ^ | | [--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--] | v [-----4K extent-----] at 1103175680 We want to find all of the references to extent bytenr 1103101952. Without the patch (and without running btrfs-debug-tree), we have to do it with 18 LOGICAL_INO calls: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO inode 261 offset 0 root 5 root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode inode 261 offset 0 root 5 inode 261 offset 4096 root 5 <- same extent ref as offset 0 (offset 8192 returns empty set, not reachable) inode 261 offset 12288 root 5 inode 261 offset 16384 root 5 \ inode 261 offset 20480 root 5 | inode 261 offset 24576 root 5 | inode 261 offset 28672 root 5 | inode 261 offset 32768 root 5 | inode 261 offset 36864 root 5 \ inode 261 offset 40960 root 5 > all the same extent ref as offset 12288. inode 261 offset 45056 root 5 / More processing required in userspace inode 261 offset 49152 root 5 | to figure out these are all duplicates. inode 261 offset 53248 root 5 | inode 261 offset 57344 root 5 | inode 261 offset 61440 root 5 | inode 261 offset 65536 root 5 | inode 261 offset 69632 root 5 / In the worst case the extents are 128MB long, and we have to do 32768 iterations of the loop to find one 4K extent ref. With the patch, we just use one call to map all refs to the extent at once: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO_V2 inode 261 offset 0 root 5 inode 261 offset 12288 root 5 The TREE_SEARCH ioctl allows userspace to retrieve the offset and extent bytenr fields easily once the root, inode and offset are known. This is sufficient information to build a complete map of the extent and all of its references. Userspace can use this information to make better choices to dedup or defrag. Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org> Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> [ copy background and motivation from cover letter ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-23 01:58:46 +08:00
bool ignore_offset;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
loi = memdup_user(arg, sizeof(*loi));
if (IS_ERR(loi))
return PTR_ERR(loi);
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2 Now that check_extent_in_eb()'s extent offset filter can be turned off, we need a way to do it from userspace. Add a 'flags' field to the btrfs_logical_ino_args structure to disable extent offset filtering, taking the place of one of the existing reserved[] fields. Previous versions of LOGICAL_INO neglected to check whether any of the reserved fields have non-zero values. Assigning meaning to those fields now may change the behavior of existing programs that left these fields uninitialized. The lack of a zero check also means that new programs have no way to know whether the kernel is honoring the flags field. To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can use the same argument layout as LOGICAL_INO, but shorten the reserved[] array by one element and turn it into the 'flags' field. The V2 ioctl explicitly checks that reserved fields and unsupported flag bits are zero so that userspace can negotiate future feature bits as they are defined. Since the memory layouts of the two ioctls' arguments are compatible, there is no need for a separate function for logical_to_ino_v2 (contrast with tree_search_v2 vs tree_search where the layout and code are quite different). A version parameter and an 'if' statement will suffice. Now that we have a flags field in logical_ino_args, add a flag BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want, and pass it down the stack to iterate_inodes_from_logical. Motivation and background, copied from the patchset cover letter: Suppose we have a file with one extent: root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a root@tester:~# sync Split the extent by overwriting it in the middle: root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a We should now have 3 extent refs to 2 extents, with one block unreachable. The extent tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 2 [...] item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53 extent refs 2 gen 29 flags DATA extent data backref root 5 objectid 261 offset 0 count 2 [...] item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53 extent refs 1 gen 30 flags DATA extent data backref root 5 objectid 261 offset 8192 count 1 [...] and the ref tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 5 [...] item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 0 nr 8192 ram 73728 extent compression(none) item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53 extent data disk byte 1103175680 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression(none) item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 12288 nr 61440 ram 73728 extent compression(none) [...] There are two references to the same extent with different, non-overlapping byte offsets: [------------------72K extent at 1103101952----------------------] [--8K----------------|--4K unreachable----|--60K-----------------] ^ ^ | | [--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--] | v [-----4K extent-----] at 1103175680 We want to find all of the references to extent bytenr 1103101952. Without the patch (and without running btrfs-debug-tree), we have to do it with 18 LOGICAL_INO calls: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO inode 261 offset 0 root 5 root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode inode 261 offset 0 root 5 inode 261 offset 4096 root 5 <- same extent ref as offset 0 (offset 8192 returns empty set, not reachable) inode 261 offset 12288 root 5 inode 261 offset 16384 root 5 \ inode 261 offset 20480 root 5 | inode 261 offset 24576 root 5 | inode 261 offset 28672 root 5 | inode 261 offset 32768 root 5 | inode 261 offset 36864 root 5 \ inode 261 offset 40960 root 5 > all the same extent ref as offset 12288. inode 261 offset 45056 root 5 / More processing required in userspace inode 261 offset 49152 root 5 | to figure out these are all duplicates. inode 261 offset 53248 root 5 | inode 261 offset 57344 root 5 | inode 261 offset 61440 root 5 | inode 261 offset 65536 root 5 | inode 261 offset 69632 root 5 / In the worst case the extents are 128MB long, and we have to do 32768 iterations of the loop to find one 4K extent ref. With the patch, we just use one call to map all refs to the extent at once: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO_V2 inode 261 offset 0 root 5 inode 261 offset 12288 root 5 The TREE_SEARCH ioctl allows userspace to retrieve the offset and extent bytenr fields easily once the root, inode and offset are known. This is sufficient information to build a complete map of the extent and all of its references. Userspace can use this information to make better choices to dedup or defrag. Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org> Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> [ copy background and motivation from cover letter ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-23 01:58:46 +08:00
if (version == 1) {
ignore_offset = false;
size = min_t(u32, loi->size, SZ_64K);
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2 Now that check_extent_in_eb()'s extent offset filter can be turned off, we need a way to do it from userspace. Add a 'flags' field to the btrfs_logical_ino_args structure to disable extent offset filtering, taking the place of one of the existing reserved[] fields. Previous versions of LOGICAL_INO neglected to check whether any of the reserved fields have non-zero values. Assigning meaning to those fields now may change the behavior of existing programs that left these fields uninitialized. The lack of a zero check also means that new programs have no way to know whether the kernel is honoring the flags field. To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can use the same argument layout as LOGICAL_INO, but shorten the reserved[] array by one element and turn it into the 'flags' field. The V2 ioctl explicitly checks that reserved fields and unsupported flag bits are zero so that userspace can negotiate future feature bits as they are defined. Since the memory layouts of the two ioctls' arguments are compatible, there is no need for a separate function for logical_to_ino_v2 (contrast with tree_search_v2 vs tree_search where the layout and code are quite different). A version parameter and an 'if' statement will suffice. Now that we have a flags field in logical_ino_args, add a flag BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want, and pass it down the stack to iterate_inodes_from_logical. Motivation and background, copied from the patchset cover letter: Suppose we have a file with one extent: root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a root@tester:~# sync Split the extent by overwriting it in the middle: root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a We should now have 3 extent refs to 2 extents, with one block unreachable. The extent tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 2 [...] item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53 extent refs 2 gen 29 flags DATA extent data backref root 5 objectid 261 offset 0 count 2 [...] item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53 extent refs 1 gen 30 flags DATA extent data backref root 5 objectid 261 offset 8192 count 1 [...] and the ref tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 5 [...] item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 0 nr 8192 ram 73728 extent compression(none) item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53 extent data disk byte 1103175680 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression(none) item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 12288 nr 61440 ram 73728 extent compression(none) [...] There are two references to the same extent with different, non-overlapping byte offsets: [------------------72K extent at 1103101952----------------------] [--8K----------------|--4K unreachable----|--60K-----------------] ^ ^ | | [--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--] | v [-----4K extent-----] at 1103175680 We want to find all of the references to extent bytenr 1103101952. Without the patch (and without running btrfs-debug-tree), we have to do it with 18 LOGICAL_INO calls: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO inode 261 offset 0 root 5 root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode inode 261 offset 0 root 5 inode 261 offset 4096 root 5 <- same extent ref as offset 0 (offset 8192 returns empty set, not reachable) inode 261 offset 12288 root 5 inode 261 offset 16384 root 5 \ inode 261 offset 20480 root 5 | inode 261 offset 24576 root 5 | inode 261 offset 28672 root 5 | inode 261 offset 32768 root 5 | inode 261 offset 36864 root 5 \ inode 261 offset 40960 root 5 > all the same extent ref as offset 12288. inode 261 offset 45056 root 5 / More processing required in userspace inode 261 offset 49152 root 5 | to figure out these are all duplicates. inode 261 offset 53248 root 5 | inode 261 offset 57344 root 5 | inode 261 offset 61440 root 5 | inode 261 offset 65536 root 5 | inode 261 offset 69632 root 5 / In the worst case the extents are 128MB long, and we have to do 32768 iterations of the loop to find one 4K extent ref. With the patch, we just use one call to map all refs to the extent at once: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO_V2 inode 261 offset 0 root 5 inode 261 offset 12288 root 5 The TREE_SEARCH ioctl allows userspace to retrieve the offset and extent bytenr fields easily once the root, inode and offset are known. This is sufficient information to build a complete map of the extent and all of its references. Userspace can use this information to make better choices to dedup or defrag. Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org> Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> [ copy background and motivation from cover letter ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-23 01:58:46 +08:00
} else {
/* All reserved bits must be 0 for now */
if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) {
ret = -EINVAL;
goto out_loi;
}
/* Only accept flags we have defined so far */
if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) {
ret = -EINVAL;
goto out_loi;
}
ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
size = min_t(u32, loi->size, SZ_16M);
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2 Now that check_extent_in_eb()'s extent offset filter can be turned off, we need a way to do it from userspace. Add a 'flags' field to the btrfs_logical_ino_args structure to disable extent offset filtering, taking the place of one of the existing reserved[] fields. Previous versions of LOGICAL_INO neglected to check whether any of the reserved fields have non-zero values. Assigning meaning to those fields now may change the behavior of existing programs that left these fields uninitialized. The lack of a zero check also means that new programs have no way to know whether the kernel is honoring the flags field. To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can use the same argument layout as LOGICAL_INO, but shorten the reserved[] array by one element and turn it into the 'flags' field. The V2 ioctl explicitly checks that reserved fields and unsupported flag bits are zero so that userspace can negotiate future feature bits as they are defined. Since the memory layouts of the two ioctls' arguments are compatible, there is no need for a separate function for logical_to_ino_v2 (contrast with tree_search_v2 vs tree_search where the layout and code are quite different). A version parameter and an 'if' statement will suffice. Now that we have a flags field in logical_ino_args, add a flag BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want, and pass it down the stack to iterate_inodes_from_logical. Motivation and background, copied from the patchset cover letter: Suppose we have a file with one extent: root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a root@tester:~# sync Split the extent by overwriting it in the middle: root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a We should now have 3 extent refs to 2 extents, with one block unreachable. The extent tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 2 [...] item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53 extent refs 2 gen 29 flags DATA extent data backref root 5 objectid 261 offset 0 count 2 [...] item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53 extent refs 1 gen 30 flags DATA extent data backref root 5 objectid 261 offset 8192 count 1 [...] and the ref tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 5 [...] item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 0 nr 8192 ram 73728 extent compression(none) item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53 extent data disk byte 1103175680 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression(none) item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 12288 nr 61440 ram 73728 extent compression(none) [...] There are two references to the same extent with different, non-overlapping byte offsets: [------------------72K extent at 1103101952----------------------] [--8K----------------|--4K unreachable----|--60K-----------------] ^ ^ | | [--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--] | v [-----4K extent-----] at 1103175680 We want to find all of the references to extent bytenr 1103101952. Without the patch (and without running btrfs-debug-tree), we have to do it with 18 LOGICAL_INO calls: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO inode 261 offset 0 root 5 root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode inode 261 offset 0 root 5 inode 261 offset 4096 root 5 <- same extent ref as offset 0 (offset 8192 returns empty set, not reachable) inode 261 offset 12288 root 5 inode 261 offset 16384 root 5 \ inode 261 offset 20480 root 5 | inode 261 offset 24576 root 5 | inode 261 offset 28672 root 5 | inode 261 offset 32768 root 5 | inode 261 offset 36864 root 5 \ inode 261 offset 40960 root 5 > all the same extent ref as offset 12288. inode 261 offset 45056 root 5 / More processing required in userspace inode 261 offset 49152 root 5 | to figure out these are all duplicates. inode 261 offset 53248 root 5 | inode 261 offset 57344 root 5 | inode 261 offset 61440 root 5 | inode 261 offset 65536 root 5 | inode 261 offset 69632 root 5 / In the worst case the extents are 128MB long, and we have to do 32768 iterations of the loop to find one 4K extent ref. With the patch, we just use one call to map all refs to the extent at once: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO_V2 inode 261 offset 0 root 5 inode 261 offset 12288 root 5 The TREE_SEARCH ioctl allows userspace to retrieve the offset and extent bytenr fields easily once the root, inode and offset are known. This is sufficient information to build a complete map of the extent and all of its references. Userspace can use this information to make better choices to dedup or defrag. Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org> Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> [ copy background and motivation from cover letter ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-23 01:58:46 +08:00
}
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
goto out;
}
inodes = init_data_container(size);
if (IS_ERR(inodes)) {
ret = PTR_ERR(inodes);
inodes = NULL;
goto out;
}
ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2 Now that check_extent_in_eb()'s extent offset filter can be turned off, we need a way to do it from userspace. Add a 'flags' field to the btrfs_logical_ino_args structure to disable extent offset filtering, taking the place of one of the existing reserved[] fields. Previous versions of LOGICAL_INO neglected to check whether any of the reserved fields have non-zero values. Assigning meaning to those fields now may change the behavior of existing programs that left these fields uninitialized. The lack of a zero check also means that new programs have no way to know whether the kernel is honoring the flags field. To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can use the same argument layout as LOGICAL_INO, but shorten the reserved[] array by one element and turn it into the 'flags' field. The V2 ioctl explicitly checks that reserved fields and unsupported flag bits are zero so that userspace can negotiate future feature bits as they are defined. Since the memory layouts of the two ioctls' arguments are compatible, there is no need for a separate function for logical_to_ino_v2 (contrast with tree_search_v2 vs tree_search where the layout and code are quite different). A version parameter and an 'if' statement will suffice. Now that we have a flags field in logical_ino_args, add a flag BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want, and pass it down the stack to iterate_inodes_from_logical. Motivation and background, copied from the patchset cover letter: Suppose we have a file with one extent: root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a root@tester:~# sync Split the extent by overwriting it in the middle: root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a We should now have 3 extent refs to 2 extents, with one block unreachable. The extent tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 2 [...] item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53 extent refs 2 gen 29 flags DATA extent data backref root 5 objectid 261 offset 0 count 2 [...] item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53 extent refs 1 gen 30 flags DATA extent data backref root 5 objectid 261 offset 8192 count 1 [...] and the ref tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 5 [...] item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 0 nr 8192 ram 73728 extent compression(none) item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53 extent data disk byte 1103175680 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression(none) item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 12288 nr 61440 ram 73728 extent compression(none) [...] There are two references to the same extent with different, non-overlapping byte offsets: [------------------72K extent at 1103101952----------------------] [--8K----------------|--4K unreachable----|--60K-----------------] ^ ^ | | [--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--] | v [-----4K extent-----] at 1103175680 We want to find all of the references to extent bytenr 1103101952. Without the patch (and without running btrfs-debug-tree), we have to do it with 18 LOGICAL_INO calls: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO inode 261 offset 0 root 5 root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode inode 261 offset 0 root 5 inode 261 offset 4096 root 5 <- same extent ref as offset 0 (offset 8192 returns empty set, not reachable) inode 261 offset 12288 root 5 inode 261 offset 16384 root 5 \ inode 261 offset 20480 root 5 | inode 261 offset 24576 root 5 | inode 261 offset 28672 root 5 | inode 261 offset 32768 root 5 | inode 261 offset 36864 root 5 \ inode 261 offset 40960 root 5 > all the same extent ref as offset 12288. inode 261 offset 45056 root 5 / More processing required in userspace inode 261 offset 49152 root 5 | to figure out these are all duplicates. inode 261 offset 53248 root 5 | inode 261 offset 57344 root 5 | inode 261 offset 61440 root 5 | inode 261 offset 65536 root 5 | inode 261 offset 69632 root 5 / In the worst case the extents are 128MB long, and we have to do 32768 iterations of the loop to find one 4K extent ref. With the patch, we just use one call to map all refs to the extent at once: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO_V2 inode 261 offset 0 root 5 inode 261 offset 12288 root 5 The TREE_SEARCH ioctl allows userspace to retrieve the offset and extent bytenr fields easily once the root, inode and offset are known. This is sufficient information to build a complete map of the extent and all of its references. Userspace can use this information to make better choices to dedup or defrag. Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org> Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> [ copy background and motivation from cover letter ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-23 01:58:46 +08:00
build_ino_list, inodes, ignore_offset);
if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
goto out;
ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes,
size);
if (ret)
ret = -EFAULT;
out:
btrfs_free_path(path);
kvfree(inodes);
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2 Now that check_extent_in_eb()'s extent offset filter can be turned off, we need a way to do it from userspace. Add a 'flags' field to the btrfs_logical_ino_args structure to disable extent offset filtering, taking the place of one of the existing reserved[] fields. Previous versions of LOGICAL_INO neglected to check whether any of the reserved fields have non-zero values. Assigning meaning to those fields now may change the behavior of existing programs that left these fields uninitialized. The lack of a zero check also means that new programs have no way to know whether the kernel is honoring the flags field. To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can use the same argument layout as LOGICAL_INO, but shorten the reserved[] array by one element and turn it into the 'flags' field. The V2 ioctl explicitly checks that reserved fields and unsupported flag bits are zero so that userspace can negotiate future feature bits as they are defined. Since the memory layouts of the two ioctls' arguments are compatible, there is no need for a separate function for logical_to_ino_v2 (contrast with tree_search_v2 vs tree_search where the layout and code are quite different). A version parameter and an 'if' statement will suffice. Now that we have a flags field in logical_ino_args, add a flag BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want, and pass it down the stack to iterate_inodes_from_logical. Motivation and background, copied from the patchset cover letter: Suppose we have a file with one extent: root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a root@tester:~# sync Split the extent by overwriting it in the middle: root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a We should now have 3 extent refs to 2 extents, with one block unreachable. The extent tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 2 [...] item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53 extent refs 2 gen 29 flags DATA extent data backref root 5 objectid 261 offset 0 count 2 [...] item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53 extent refs 1 gen 30 flags DATA extent data backref root 5 objectid 261 offset 8192 count 1 [...] and the ref tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 5 [...] item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 0 nr 8192 ram 73728 extent compression(none) item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53 extent data disk byte 1103175680 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression(none) item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 12288 nr 61440 ram 73728 extent compression(none) [...] There are two references to the same extent with different, non-overlapping byte offsets: [------------------72K extent at 1103101952----------------------] [--8K----------------|--4K unreachable----|--60K-----------------] ^ ^ | | [--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--] | v [-----4K extent-----] at 1103175680 We want to find all of the references to extent bytenr 1103101952. Without the patch (and without running btrfs-debug-tree), we have to do it with 18 LOGICAL_INO calls: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO inode 261 offset 0 root 5 root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode inode 261 offset 0 root 5 inode 261 offset 4096 root 5 <- same extent ref as offset 0 (offset 8192 returns empty set, not reachable) inode 261 offset 12288 root 5 inode 261 offset 16384 root 5 \ inode 261 offset 20480 root 5 | inode 261 offset 24576 root 5 | inode 261 offset 28672 root 5 | inode 261 offset 32768 root 5 | inode 261 offset 36864 root 5 \ inode 261 offset 40960 root 5 > all the same extent ref as offset 12288. inode 261 offset 45056 root 5 / More processing required in userspace inode 261 offset 49152 root 5 | to figure out these are all duplicates. inode 261 offset 53248 root 5 | inode 261 offset 57344 root 5 | inode 261 offset 61440 root 5 | inode 261 offset 65536 root 5 | inode 261 offset 69632 root 5 / In the worst case the extents are 128MB long, and we have to do 32768 iterations of the loop to find one 4K extent ref. With the patch, we just use one call to map all refs to the extent at once: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO_V2 inode 261 offset 0 root 5 inode 261 offset 12288 root 5 The TREE_SEARCH ioctl allows userspace to retrieve the offset and extent bytenr fields easily once the root, inode and offset are known. This is sufficient information to build a complete map of the extent and all of its references. Userspace can use this information to make better choices to dedup or defrag. Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org> Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> [ copy background and motivation from cover letter ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-23 01:58:46 +08:00
out_loi:
kfree(loi);
return ret;
}
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs)
{
struct btrfs_balance_control *bctl = fs_info->balance_ctl;
bargs->flags = bctl->flags;
if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags))
bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
if (atomic_read(&fs_info->balance_pause_req))
bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
if (atomic_read(&fs_info->balance_cancel_req))
bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
spin_lock(&fs_info->balance_lock);
memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
spin_unlock(&fs_info->balance_lock);
}
static long btrfs_ioctl_balance(struct file *file, void __user *arg)
{
struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_ioctl_balance_args *bargs;
struct btrfs_balance_control *bctl;
bool need_unlock; /* for mut. excl. ops lock */
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
ret = mnt_want_write_file(file);
if (ret)
return ret;
again:
if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) {
mutex_lock(&fs_info->balance_mutex);
need_unlock = true;
goto locked;
}
/*
* mut. excl. ops lock is locked. Three possibilities:
* (1) some other op is running
* (2) balance is running
* (3) balance is paused -- special case (think resume)
*/
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl) {
/* this is either (2) or (3) */
if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
mutex_unlock(&fs_info->balance_mutex);
btrfs: kill btrfs_fs_info::volume_mutex Mutual exclusion of device add/rm and balance was done by the volume mutex up to version 3.7. The commit 5ac00addc7ac091109 ("Btrfs: disallow mutually exclusive admin operations from user mode") added a bit that essentially tracked the same information. The status bit has an advantage over a mutex that it can be set without restrictions of function context, so it started to be used in the mount-time resuming of balance or device replace. But we don't really need to track the same information in two ways. 1) After the previous cleanups, the main ioctl handlers for add/del/resize copy the EXCL_OP bit next to the volume mutex, here it's clearly safe. 2) Resuming balance during mount or after rw remount will set only the EXCL_OP bit and the volume_mutex is held in the kernel thread that calls btrfs_balance. 3) Resuming device replace during mount or after rw remount is done after balance and is excluded by the EXCL_OP bit. It does not take the volume_mutex at all and completely relies on the EXCL_OP bit. 4) The resuming of balance and dev-replace cannot hapen at the same time as the ioctls cannot be started in parallel. Nevertheless, a crafted image could trigger that and a warning is printed. 5) Balance is normally excluded by EXCL_OP and also uses own mutex to protect against concurrent access to its status data. There's some trickery to maintain the right lock nesting in case we need to reexamine the status in btrfs_ioctl_balance. The volume_mutex is removed and the unlock/lock sequence is left in place as we might expect other waiters to proceed. 6) Similar to 5, the unlock/lock sequence is kept in btrfs_cancel_balance to allow waiters to continue. Reviewed-by: Anand Jain <anand.jain@oracle.com> Signed-off-by: David Sterba <dsterba@suse.com>
2018-03-21 07:20:05 +08:00
/*
* Lock released to allow other waiters to continue,
* we'll reexamine the status again.
*/
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl &&
!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
/* this is (3) */
need_unlock = false;
goto locked;
}
mutex_unlock(&fs_info->balance_mutex);
goto again;
} else {
/* this is (2) */
mutex_unlock(&fs_info->balance_mutex);
ret = -EINPROGRESS;
goto out;
}
} else {
/* this is (1) */
mutex_unlock(&fs_info->balance_mutex);
ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
goto out;
}
locked:
BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
if (arg) {
bargs = memdup_user(arg, sizeof(*bargs));
if (IS_ERR(bargs)) {
ret = PTR_ERR(bargs);
goto out_unlock;
}
if (bargs->flags & BTRFS_BALANCE_RESUME) {
if (!fs_info->balance_ctl) {
ret = -ENOTCONN;
goto out_bargs;
}
bctl = fs_info->balance_ctl;
spin_lock(&fs_info->balance_lock);
bctl->flags |= BTRFS_BALANCE_RESUME;
spin_unlock(&fs_info->balance_lock);
goto do_balance;
}
} else {
bargs = NULL;
}
if (fs_info->balance_ctl) {
ret = -EINPROGRESS;
goto out_bargs;
}
bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
if (!bctl) {
ret = -ENOMEM;
goto out_bargs;
}
if (arg) {
memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
bctl->flags = bargs->flags;
} else {
/* balance everything - no filters */
bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
}
if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
ret = -EINVAL;
goto out_bctl;
}
do_balance:
/*
* Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to
* btrfs_balance. bctl is freed in reset_balance_state, or, if
* restriper was paused all the way until unmount, in free_fs_info.
* The flag should be cleared after reset_balance_state.
*/
need_unlock = false;
ret = btrfs_balance(fs_info, bctl, bargs);
bctl = NULL;
if ((ret == 0 || ret == -ECANCELED) && arg) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
}
out_bctl:
kfree(bctl);
out_bargs:
kfree(bargs);
out_unlock:
mutex_unlock(&fs_info->balance_mutex);
if (need_unlock)
clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
out:
mnt_drop_write_file(file);
return ret;
}
static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
switch (cmd) {
case BTRFS_BALANCE_CTL_PAUSE:
return btrfs_pause_balance(fs_info);
case BTRFS_BALANCE_CTL_CANCEL:
return btrfs_cancel_balance(fs_info);
}
return -EINVAL;
}
static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_balance_args *bargs;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
mutex_lock(&fs_info->balance_mutex);
if (!fs_info->balance_ctl) {
ret = -ENOTCONN;
goto out;
}
bargs = kzalloc(sizeof(*bargs), GFP_KERNEL);
if (!bargs) {
ret = -ENOMEM;
goto out;
}
btrfs_update_ioctl_balance_args(fs_info, bargs);
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
kfree(bargs);
out:
mutex_unlock(&fs_info->balance_mutex);
return ret;
}
static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_quota_ctl_args *sa;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
ret = mnt_want_write_file(file);
if (ret)
return ret;
sa = memdup_user(arg, sizeof(*sa));
if (IS_ERR(sa)) {
ret = PTR_ERR(sa);
goto drop_write;
}
down_write(&fs_info->subvol_sem);
switch (sa->cmd) {
case BTRFS_QUOTA_CTL_ENABLE:
ret = btrfs_quota_enable(fs_info);
break;
case BTRFS_QUOTA_CTL_DISABLE:
ret = btrfs_quota_disable(fs_info);
break;
default:
ret = -EINVAL;
break;
}
kfree(sa);
up_write(&fs_info->subvol_sem);
drop_write:
mnt_drop_write_file(file);
return ret;
}
static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_qgroup_assign_args *sa;
struct btrfs_trans_handle *trans;
int ret;
int err;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
ret = mnt_want_write_file(file);
if (ret)
return ret;
sa = memdup_user(arg, sizeof(*sa));
if (IS_ERR(sa)) {
ret = PTR_ERR(sa);
goto drop_write;
}
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}
if (sa->assign) {
ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst);
} else {
ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst);
}
/* update qgroup status and info */
err = btrfs_run_qgroups(trans);
if (err < 0)
btrfs_handle_fs_error(fs_info, err,
"failed to update qgroup status and info");
err = btrfs_end_transaction(trans);
if (err && !ret)
ret = err;
out:
kfree(sa);
drop_write:
mnt_drop_write_file(file);
return ret;
}
static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_qgroup_create_args *sa;
struct btrfs_trans_handle *trans;
int ret;
int err;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
ret = mnt_want_write_file(file);
if (ret)
return ret;
sa = memdup_user(arg, sizeof(*sa));
if (IS_ERR(sa)) {
ret = PTR_ERR(sa);
goto drop_write;
}
if (!sa->qgroupid) {
ret = -EINVAL;
goto out;
}
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}
if (sa->create) {
ret = btrfs_create_qgroup(trans, sa->qgroupid);
} else {
ret = btrfs_remove_qgroup(trans, sa->qgroupid);
}
err = btrfs_end_transaction(trans);
if (err && !ret)
ret = err;
out:
kfree(sa);
drop_write:
mnt_drop_write_file(file);
return ret;
}
static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_ioctl_qgroup_limit_args *sa;
struct btrfs_trans_handle *trans;
int ret;
int err;
u64 qgroupid;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
ret = mnt_want_write_file(file);
if (ret)
return ret;
sa = memdup_user(arg, sizeof(*sa));
if (IS_ERR(sa)) {
ret = PTR_ERR(sa);
goto drop_write;
}
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out;
}
qgroupid = sa->qgroupid;
if (!qgroupid) {
/* take the current subvol as qgroup */
qgroupid = root->root_key.objectid;
}
ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim);
err = btrfs_end_transaction(trans);
if (err && !ret)
ret = err;
out:
kfree(sa);
drop_write:
mnt_drop_write_file(file);
return ret;
}
static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ioctl_quota_rescan_args *qsa;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
ret = mnt_want_write_file(file);
if (ret)
return ret;
qsa = memdup_user(arg, sizeof(*qsa));
if (IS_ERR(qsa)) {
ret = PTR_ERR(qsa);
goto drop_write;
}
if (qsa->flags) {
ret = -EINVAL;
goto out;
}
ret = btrfs_qgroup_rescan(fs_info);
out:
kfree(qsa);
drop_write:
mnt_drop_write_file(file);
return ret;
}
static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_ioctl_quota_rescan_args *qsa;
int ret = 0;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
qsa = kzalloc(sizeof(*qsa), GFP_KERNEL);
if (!qsa)
return -ENOMEM;
if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
qsa->flags = 1;
qsa->progress = fs_info->qgroup_rescan_progress.objectid;
}
if (copy_to_user(arg, qsa, sizeof(*qsa)))
ret = -EFAULT;
kfree(qsa);
return ret;
}
static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
void __user *arg)
{
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
return btrfs_qgroup_wait_for_completion(fs_info, true);
}
static long _btrfs_ioctl_set_received_subvol(struct file *file,
struct btrfs_ioctl_received_subvol_args *sa)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_root_item *root_item = &root->root_item;
struct btrfs_trans_handle *trans;
vfs: change inode times to use struct timespec64 struct timespec is not y2038 safe. Transition vfs to use y2038 safe struct timespec64 instead. The change was made with the help of the following cocinelle script. This catches about 80% of the changes. All the header file and logic changes are included in the first 5 rules. The rest are trivial substitutions. I avoid changing any of the function signatures or any other filesystem specific data structures to keep the patch simple for review. The script can be a little shorter by combining different cases. But, this version was sufficient for my usecase. virtual patch @ depends on patch @ identifier now; @@ - struct timespec + struct timespec64 current_time ( ... ) { - struct timespec now = current_kernel_time(); + struct timespec64 now = current_kernel_time64(); ... - return timespec_trunc( + return timespec64_trunc( ... ); } @ depends on patch @ identifier xtime; @@ struct \( iattr \| inode \| kstat \) { ... - struct timespec xtime; + struct timespec64 xtime; ... } @ depends on patch @ identifier t; @@ struct inode_operations { ... int (*update_time) (..., - struct timespec t, + struct timespec64 t, ...); ... } @ depends on patch @ identifier t; identifier fn_update_time =~ "update_time$"; @@ fn_update_time (..., - struct timespec *t, + struct timespec64 *t, ...) { ... } @ depends on patch @ identifier t; @@ lease_get_mtime( ... , - struct timespec *t + struct timespec64 *t ) { ... } @te depends on patch forall@ identifier ts; local idexpression struct inode *inode_node; identifier i_xtime =~ "^i_[acm]time$"; identifier ia_xtime =~ "^ia_[acm]time$"; identifier fn_update_time =~ "update_time$"; identifier fn; expression e, E3; local idexpression struct inode *node1; local idexpression struct inode *node2; local idexpression struct iattr *attr1; local idexpression struct iattr *attr2; local idexpression struct iattr attr; identifier i_xtime1 =~ "^i_[acm]time$"; identifier i_xtime2 =~ "^i_[acm]time$"; identifier ia_xtime1 =~ "^ia_[acm]time$"; identifier ia_xtime2 =~ "^ia_[acm]time$"; @@ ( ( - struct timespec ts; + struct timespec64 ts; | - struct timespec ts = current_time(inode_node); + struct timespec64 ts = current_time(inode_node); ) <+... when != ts ( - timespec_equal(&inode_node->i_xtime, &ts) + timespec64_equal(&inode_node->i_xtime, &ts) | - timespec_equal(&ts, &inode_node->i_xtime) + timespec64_equal(&ts, &inode_node->i_xtime) | - timespec_compare(&inode_node->i_xtime, &ts) + timespec64_compare(&inode_node->i_xtime, &ts) | - timespec_compare(&ts, &inode_node->i_xtime) + timespec64_compare(&ts, &inode_node->i_xtime) | ts = current_time(e) | fn_update_time(..., &ts,...) | inode_node->i_xtime = ts | node1->i_xtime = ts | ts = inode_node->i_xtime | <+... attr1->ia_xtime ...+> = ts | ts = attr1->ia_xtime | ts.tv_sec | ts.tv_nsec | btrfs_set_stack_timespec_sec(..., ts.tv_sec) | btrfs_set_stack_timespec_nsec(..., ts.tv_nsec) | - ts = timespec64_to_timespec( + ts = ... -) | - ts = ktime_to_timespec( + ts = ktime_to_timespec64( ...) | - ts = E3 + ts = timespec_to_timespec64(E3) | - ktime_get_real_ts(&ts) + ktime_get_real_ts64(&ts) | fn(..., - ts + timespec64_to_timespec(ts) ,...) ) ...+> ( <... when != ts - return ts; + return timespec64_to_timespec(ts); ...> ) | - timespec_equal(&node1->i_xtime1, &node2->i_xtime2) + timespec64_equal(&node1->i_xtime2, &node2->i_xtime2) | - timespec_equal(&node1->i_xtime1, &attr2->ia_xtime2) + timespec64_equal(&node1->i_xtime2, &attr2->ia_xtime2) | - timespec_compare(&node1->i_xtime1, &node2->i_xtime2) + timespec64_compare(&node1->i_xtime1, &node2->i_xtime2) | node1->i_xtime1 = - timespec_trunc(attr1->ia_xtime1, + timespec64_trunc(attr1->ia_xtime1, ...) | - attr1->ia_xtime1 = timespec_trunc(attr2->ia_xtime2, + attr1->ia_xtime1 = timespec64_trunc(attr2->ia_xtime2, ...) | - ktime_get_real_ts(&attr1->ia_xtime1) + ktime_get_real_ts64(&attr1->ia_xtime1) | - ktime_get_real_ts(&attr.ia_xtime1) + ktime_get_real_ts64(&attr.ia_xtime1) ) @ depends on patch @ struct inode *node; struct iattr *attr; identifier fn; identifier i_xtime =~ "^i_[acm]time$"; identifier ia_xtime =~ "^ia_[acm]time$"; expression e; @@ ( - fn(node->i_xtime); + fn(timespec64_to_timespec(node->i_xtime)); | fn(..., - node->i_xtime); + timespec64_to_timespec(node->i_xtime)); | - e = fn(attr->ia_xtime); + e = fn(timespec64_to_timespec(attr->ia_xtime)); ) @ depends on patch forall @ struct inode *node; struct iattr *attr; identifier i_xtime =~ "^i_[acm]time$"; identifier ia_xtime =~ "^ia_[acm]time$"; identifier fn; @@ { + struct timespec ts; <+... ( + ts = timespec64_to_timespec(node->i_xtime); fn (..., - &node->i_xtime, + &ts, ...); | + ts = timespec64_to_timespec(attr->ia_xtime); fn (..., - &attr->ia_xtime, + &ts, ...); ) ...+> } @ depends on patch forall @ struct inode *node; struct iattr *attr; struct kstat *stat; identifier ia_xtime =~ "^ia_[acm]time$"; identifier i_xtime =~ "^i_[acm]time$"; identifier xtime =~ "^[acm]time$"; identifier fn, ret; @@ { + struct timespec ts; <+... ( + ts = timespec64_to_timespec(node->i_xtime); ret = fn (..., - &node->i_xtime, + &ts, ...); | + ts = timespec64_to_timespec(node->i_xtime); ret = fn (..., - &node->i_xtime); + &ts); | + ts = timespec64_to_timespec(attr->ia_xtime); ret = fn (..., - &attr->ia_xtime, + &ts, ...); | + ts = timespec64_to_timespec(attr->ia_xtime); ret = fn (..., - &attr->ia_xtime); + &ts); | + ts = timespec64_to_timespec(stat->xtime); ret = fn (..., - &stat->xtime); + &ts); ) ...+> } @ depends on patch @ struct inode *node; struct inode *node2; identifier i_xtime1 =~ "^i_[acm]time$"; identifier i_xtime2 =~ "^i_[acm]time$"; identifier i_xtime3 =~ "^i_[acm]time$"; struct iattr *attrp; struct iattr *attrp2; struct iattr attr ; identifier ia_xtime1 =~ "^ia_[acm]time$"; identifier ia_xtime2 =~ "^ia_[acm]time$"; struct kstat *stat; struct kstat stat1; struct timespec64 ts; identifier xtime =~ "^[acmb]time$"; expression e; @@ ( ( node->i_xtime2 \| attrp->ia_xtime2 \| attr.ia_xtime2 \) = node->i_xtime1 ; | node->i_xtime2 = \( node2->i_xtime1 \| timespec64_trunc(...) \); | node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \); | node->i_xtime1 = node->i_xtime3 = \(ts \| current_time(...) \); | stat->xtime = node2->i_xtime1; | stat1.xtime = node2->i_xtime1; | ( node->i_xtime2 \| attrp->ia_xtime2 \) = attrp->ia_xtime1 ; | ( attrp->ia_xtime1 \| attr.ia_xtime1 \) = attrp2->ia_xtime2; | - e = node->i_xtime1; + e = timespec64_to_timespec( node->i_xtime1 ); | - e = attrp->ia_xtime1; + e = timespec64_to_timespec( attrp->ia_xtime1 ); | node->i_xtime1 = current_time(...); | node->i_xtime2 = node->i_xtime1 = node->i_xtime3 = - e; + timespec_to_timespec64(e); | node->i_xtime1 = node->i_xtime3 = - e; + timespec_to_timespec64(e); | - node->i_xtime1 = e; + node->i_xtime1 = timespec_to_timespec64(e); ) Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com> Cc: <anton@tuxera.com> Cc: <balbi@kernel.org> Cc: <bfields@fieldses.org> Cc: <darrick.wong@oracle.com> Cc: <dhowells@redhat.com> Cc: <dsterba@suse.com> Cc: <dwmw2@infradead.org> Cc: <hch@lst.de> Cc: <hirofumi@mail.parknet.co.jp> Cc: <hubcap@omnibond.com> Cc: <jack@suse.com> Cc: <jaegeuk@kernel.org> Cc: <jaharkes@cs.cmu.edu> Cc: <jslaby@suse.com> Cc: <keescook@chromium.org> Cc: <mark@fasheh.com> Cc: <miklos@szeredi.hu> Cc: <nico@linaro.org> Cc: <reiserfs-devel@vger.kernel.org> Cc: <richard@nod.at> Cc: <sage@redhat.com> Cc: <sfrench@samba.org> Cc: <swhiteho@redhat.com> Cc: <tj@kernel.org> Cc: <trond.myklebust@primarydata.com> Cc: <tytso@mit.edu> Cc: <viro@zeniv.linux.org.uk>
2018-05-09 10:36:02 +08:00
struct timespec64 ct = current_time(inode);
int ret = 0;
int received_uuid_changed;
if (!inode_owner_or_capable(inode))
return -EPERM;
ret = mnt_want_write_file(file);
if (ret < 0)
return ret;
down_write(&fs_info->subvol_sem);
if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
ret = -EINVAL;
goto out;
}
if (btrfs_root_readonly(root)) {
ret = -EROFS;
goto out;
}
/*
* 1 - root item
* 2 - uuid items (received uuid + subvol uuid)
*/
trans = btrfs_start_transaction(root, 3);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
goto out;
}
sa->rtransid = trans->transid;
sa->rtime.sec = ct.tv_sec;
sa->rtime.nsec = ct.tv_nsec;
received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
BTRFS_UUID_SIZE);
if (received_uuid_changed &&
!btrfs_is_empty_uuid(root_item->received_uuid)) {
ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
root->root_key.objectid);
if (ret && ret != -ENOENT) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
}
}
memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
btrfs_set_root_stransid(root_item, sa->stransid);
btrfs_set_root_rtransid(root_item, sa->rtransid);
btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec);
btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec);
btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);
ret = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key, &root->root_item);
if (ret < 0) {
btrfs_end_transaction(trans);
goto out;
}
if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
ret = btrfs_uuid_tree_add(trans, sa->uuid,
BTRFS_UUID_KEY_RECEIVED_SUBVOL,
root->root_key.objectid);
if (ret < 0 && ret != -EEXIST) {
btrfs_abort_transaction(trans, ret);
btrfs_end_transaction(trans);
goto out;
}
}
ret = btrfs_commit_transaction(trans);
out:
up_write(&fs_info->subvol_sem);
mnt_drop_write_file(file);
return ret;
}
#ifdef CONFIG_64BIT
static long btrfs_ioctl_set_received_subvol_32(struct file *file,
void __user *arg)
{
struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
struct btrfs_ioctl_received_subvol_args *args64 = NULL;
int ret = 0;
args32 = memdup_user(arg, sizeof(*args32));
if (IS_ERR(args32))
return PTR_ERR(args32);
args64 = kmalloc(sizeof(*args64), GFP_KERNEL);
if (!args64) {
ret = -ENOMEM;
goto out;
}
memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
args64->stransid = args32->stransid;
args64->rtransid = args32->rtransid;
args64->stime.sec = args32->stime.sec;
args64->stime.nsec = args32->stime.nsec;
args64->rtime.sec = args32->rtime.sec;
args64->rtime.nsec = args32->rtime.nsec;
args64->flags = args32->flags;
ret = _btrfs_ioctl_set_received_subvol(file, args64);
if (ret)
goto out;
memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
args32->stransid = args64->stransid;
args32->rtransid = args64->rtransid;
args32->stime.sec = args64->stime.sec;
args32->stime.nsec = args64->stime.nsec;
args32->rtime.sec = args64->rtime.sec;
args32->rtime.nsec = args64->rtime.nsec;
args32->flags = args64->flags;
ret = copy_to_user(arg, args32, sizeof(*args32));
if (ret)
ret = -EFAULT;
out:
kfree(args32);
kfree(args64);
return ret;
}
#endif
static long btrfs_ioctl_set_received_subvol(struct file *file,
void __user *arg)
{
struct btrfs_ioctl_received_subvol_args *sa = NULL;
int ret = 0;
sa = memdup_user(arg, sizeof(*sa));
if (IS_ERR(sa))
return PTR_ERR(sa);
ret = _btrfs_ioctl_set_received_subvol(file, sa);
if (ret)
goto out;
ret = copy_to_user(arg, sa, sizeof(*sa));
if (ret)
ret = -EFAULT;
out:
kfree(sa);
return ret;
}
static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info,
void __user *arg)
{
size_t len;
int ret;
char label[BTRFS_LABEL_SIZE];
spin_lock(&fs_info->super_lock);
memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE);
spin_unlock(&fs_info->super_lock);
len = strnlen(label, BTRFS_LABEL_SIZE);
if (len == BTRFS_LABEL_SIZE) {
btrfs_warn(fs_info,
"label is too long, return the first %zu bytes",
--len);
}
ret = copy_to_user(arg, label, len);
return ret ? -EFAULT : 0;
}
static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_super_block *super_block = fs_info->super_copy;
struct btrfs_trans_handle *trans;
char label[BTRFS_LABEL_SIZE];
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (copy_from_user(label, arg, sizeof(label)))
return -EFAULT;
if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
btrfs_err(fs_info,
"unable to set label with more than %d bytes",
BTRFS_LABEL_SIZE - 1);
return -EINVAL;
}
ret = mnt_want_write_file(file);
if (ret)
return ret;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_unlock;
}
spin_lock(&fs_info->super_lock);
strcpy(super_block->label, label);
spin_unlock(&fs_info->super_lock);
ret = btrfs_commit_transaction(trans);
out_unlock:
mnt_drop_write_file(file);
return ret;
}
#define INIT_FEATURE_FLAGS(suffix) \
{ .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \
.compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
.incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
int btrfs_ioctl_get_supported_features(void __user *arg)
{
static const struct btrfs_ioctl_feature_flags features[3] = {
INIT_FEATURE_FLAGS(SUPP),
INIT_FEATURE_FLAGS(SAFE_SET),
INIT_FEATURE_FLAGS(SAFE_CLEAR)
};
if (copy_to_user(arg, &features, sizeof(features)))
return -EFAULT;
return 0;
}
static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info,
void __user *arg)
{
struct btrfs_super_block *super_block = fs_info->super_copy;
struct btrfs_ioctl_feature_flags features;
features.compat_flags = btrfs_super_compat_flags(super_block);
features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block);
features.incompat_flags = btrfs_super_incompat_flags(super_block);
if (copy_to_user(arg, &features, sizeof(features)))
return -EFAULT;
return 0;
}
static int check_feature_bits(struct btrfs_fs_info *fs_info,
enum btrfs_feature_set set,
u64 change_mask, u64 flags, u64 supported_flags,
u64 safe_set, u64 safe_clear)
{
const char *type = btrfs_feature_set_name(set);
char *names;
u64 disallowed, unsupported;
u64 set_mask = flags & change_mask;
u64 clear_mask = ~flags & change_mask;
unsupported = set_mask & ~supported_flags;
if (unsupported) {
names = btrfs_printable_features(set, unsupported);
if (names) {
btrfs_warn(fs_info,
"this kernel does not support the %s feature bit%s",
names, strchr(names, ',') ? "s" : "");
kfree(names);
} else
btrfs_warn(fs_info,
"this kernel does not support %s bits 0x%llx",
type, unsupported);
return -EOPNOTSUPP;
}
disallowed = set_mask & ~safe_set;
if (disallowed) {
names = btrfs_printable_features(set, disallowed);
if (names) {
btrfs_warn(fs_info,
"can't set the %s feature bit%s while mounted",
names, strchr(names, ',') ? "s" : "");
kfree(names);
} else
btrfs_warn(fs_info,
"can't set %s bits 0x%llx while mounted",
type, disallowed);
return -EPERM;
}
disallowed = clear_mask & ~safe_clear;
if (disallowed) {
names = btrfs_printable_features(set, disallowed);
if (names) {
btrfs_warn(fs_info,
"can't clear the %s feature bit%s while mounted",
names, strchr(names, ',') ? "s" : "");
kfree(names);
} else
btrfs_warn(fs_info,
"can't clear %s bits 0x%llx while mounted",
type, disallowed);
return -EPERM;
}
return 0;
}
#define check_feature(fs_info, change_mask, flags, mask_base) \
check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \
BTRFS_FEATURE_ ## mask_base ## _SUPP, \
BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \
BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)
static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_super_block *super_block = fs_info->super_copy;
struct btrfs_ioctl_feature_flags flags[2];
struct btrfs_trans_handle *trans;
u64 newflags;
int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (copy_from_user(flags, arg, sizeof(flags)))
return -EFAULT;
/* Nothing to do */
if (!flags[0].compat_flags && !flags[0].compat_ro_flags &&
!flags[0].incompat_flags)
return 0;
ret = check_feature(fs_info, flags[0].compat_flags,
flags[1].compat_flags, COMPAT);
if (ret)
return ret;
ret = check_feature(fs_info, flags[0].compat_ro_flags,
flags[1].compat_ro_flags, COMPAT_RO);
if (ret)
return ret;
ret = check_feature(fs_info, flags[0].incompat_flags,
flags[1].incompat_flags, INCOMPAT);
if (ret)
return ret;
ret = mnt_want_write_file(file);
if (ret)
return ret;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
goto out_drop_write;
}
spin_lock(&fs_info->super_lock);
newflags = btrfs_super_compat_flags(super_block);
newflags |= flags[0].compat_flags & flags[1].compat_flags;
newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
btrfs_set_super_compat_flags(super_block, newflags);
newflags = btrfs_super_compat_ro_flags(super_block);
newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags;
newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags);
btrfs_set_super_compat_ro_flags(super_block, newflags);
newflags = btrfs_super_incompat_flags(super_block);
newflags |= flags[0].incompat_flags & flags[1].incompat_flags;
newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
btrfs_set_super_incompat_flags(super_block, newflags);
spin_unlock(&fs_info->super_lock);
ret = btrfs_commit_transaction(trans);
out_drop_write:
mnt_drop_write_file(file);
return ret;
}
static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
{
struct btrfs_ioctl_send_args *arg;
int ret;
if (compat) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
struct btrfs_ioctl_send_args_32 args32;
ret = copy_from_user(&args32, argp, sizeof(args32));
if (ret)
return -EFAULT;
arg = kzalloc(sizeof(*arg), GFP_KERNEL);
if (!arg)
return -ENOMEM;
arg->send_fd = args32.send_fd;
arg->clone_sources_count = args32.clone_sources_count;
arg->clone_sources = compat_ptr(args32.clone_sources);
arg->parent_root = args32.parent_root;
arg->flags = args32.flags;
memcpy(arg->reserved, args32.reserved,
sizeof(args32.reserved));
#else
return -ENOTTY;
#endif
} else {
arg = memdup_user(argp, sizeof(*arg));
if (IS_ERR(arg))
return PTR_ERR(arg);
}
ret = btrfs_ioctl_send(file, arg);
kfree(arg);
return ret;
}
long btrfs_ioctl(struct file *file, unsigned int
cmd, unsigned long arg)
{
struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
void __user *argp = (void __user *)arg;
switch (cmd) {
case FS_IOC_GETFLAGS:
return btrfs_ioctl_getflags(file, argp);
case FS_IOC_SETFLAGS:
return btrfs_ioctl_setflags(file, argp);
case FS_IOC_GETVERSION:
return btrfs_ioctl_getversion(file, argp);
case FS_IOC_GETFSLABEL:
return btrfs_ioctl_get_fslabel(fs_info, argp);
case FS_IOC_SETFSLABEL:
return btrfs_ioctl_set_fslabel(file, argp);
case FITRIM:
return btrfs_ioctl_fitrim(fs_info, argp);
case BTRFS_IOC_SNAP_CREATE:
return btrfs_ioctl_snap_create(file, argp, 0);
case BTRFS_IOC_SNAP_CREATE_V2:
return btrfs_ioctl_snap_create_v2(file, argp, 0);
case BTRFS_IOC_SUBVOL_CREATE:
return btrfs_ioctl_snap_create(file, argp, 1);
case BTRFS_IOC_SUBVOL_CREATE_V2:
return btrfs_ioctl_snap_create_v2(file, argp, 1);
case BTRFS_IOC_SNAP_DESTROY:
return btrfs_ioctl_snap_destroy(file, argp);
case BTRFS_IOC_SUBVOL_GETFLAGS:
return btrfs_ioctl_subvol_getflags(file, argp);
case BTRFS_IOC_SUBVOL_SETFLAGS:
return btrfs_ioctl_subvol_setflags(file, argp);
case BTRFS_IOC_DEFAULT_SUBVOL:
return btrfs_ioctl_default_subvol(file, argp);
case BTRFS_IOC_DEFRAG:
return btrfs_ioctl_defrag(file, NULL);
case BTRFS_IOC_DEFRAG_RANGE:
return btrfs_ioctl_defrag(file, argp);
case BTRFS_IOC_RESIZE:
return btrfs_ioctl_resize(file, argp);
case BTRFS_IOC_ADD_DEV:
return btrfs_ioctl_add_dev(fs_info, argp);
case BTRFS_IOC_RM_DEV:
return btrfs_ioctl_rm_dev(file, argp);
case BTRFS_IOC_RM_DEV_V2:
return btrfs_ioctl_rm_dev_v2(file, argp);
case BTRFS_IOC_FS_INFO:
return btrfs_ioctl_fs_info(fs_info, argp);
case BTRFS_IOC_DEV_INFO:
return btrfs_ioctl_dev_info(fs_info, argp);
case BTRFS_IOC_BALANCE:
return btrfs_ioctl_balance(file, NULL);
case BTRFS_IOC_TREE_SEARCH:
return btrfs_ioctl_tree_search(file, argp);
case BTRFS_IOC_TREE_SEARCH_V2:
return btrfs_ioctl_tree_search_v2(file, argp);
case BTRFS_IOC_INO_LOOKUP:
return btrfs_ioctl_ino_lookup(file, argp);
case BTRFS_IOC_INO_PATHS:
return btrfs_ioctl_ino_to_path(root, argp);
case BTRFS_IOC_LOGICAL_INO:
btrfs: add a flags argument to LOGICAL_INO and call it LOGICAL_INO_V2 Now that check_extent_in_eb()'s extent offset filter can be turned off, we need a way to do it from userspace. Add a 'flags' field to the btrfs_logical_ino_args structure to disable extent offset filtering, taking the place of one of the existing reserved[] fields. Previous versions of LOGICAL_INO neglected to check whether any of the reserved fields have non-zero values. Assigning meaning to those fields now may change the behavior of existing programs that left these fields uninitialized. The lack of a zero check also means that new programs have no way to know whether the kernel is honoring the flags field. To avoid these problems, define a new ioctl LOGICAL_INO_V2. We can use the same argument layout as LOGICAL_INO, but shorten the reserved[] array by one element and turn it into the 'flags' field. The V2 ioctl explicitly checks that reserved fields and unsupported flag bits are zero so that userspace can negotiate future feature bits as they are defined. Since the memory layouts of the two ioctls' arguments are compatible, there is no need for a separate function for logical_to_ino_v2 (contrast with tree_search_v2 vs tree_search where the layout and code are quite different). A version parameter and an 'if' statement will suffice. Now that we have a flags field in logical_ino_args, add a flag BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to get the behavior we want, and pass it down the stack to iterate_inodes_from_logical. Motivation and background, copied from the patchset cover letter: Suppose we have a file with one extent: root@tester:~# zcat /usr/share/doc/cpio/changelog.gz > /test/a root@tester:~# sync Split the extent by overwriting it in the middle: root@tester:~# cat /dev/urandom | dd bs=4k seek=2 skip=2 count=1 conv=notrunc of=/test/a We should now have 3 extent refs to 2 extents, with one block unreachable. The extent tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 2 [...] item 9 key (1103101952 EXTENT_ITEM 73728) itemoff 15942 itemsize 53 extent refs 2 gen 29 flags DATA extent data backref root 5 objectid 261 offset 0 count 2 [...] item 11 key (1103175680 EXTENT_ITEM 4096) itemoff 15865 itemsize 53 extent refs 1 gen 30 flags DATA extent data backref root 5 objectid 261 offset 8192 count 1 [...] and the ref tree looks like: root@tester:~# btrfs-debug-tree /dev/vdc -t 5 [...] item 6 key (261 EXTENT_DATA 0) itemoff 15825 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 0 nr 8192 ram 73728 extent compression(none) item 7 key (261 EXTENT_DATA 8192) itemoff 15772 itemsize 53 extent data disk byte 1103175680 nr 4096 extent data offset 0 nr 4096 ram 4096 extent compression(none) item 8 key (261 EXTENT_DATA 12288) itemoff 15719 itemsize 53 extent data disk byte 1103101952 nr 73728 extent data offset 12288 nr 61440 ram 73728 extent compression(none) [...] There are two references to the same extent with different, non-overlapping byte offsets: [------------------72K extent at 1103101952----------------------] [--8K----------------|--4K unreachable----|--60K-----------------] ^ ^ | | [--8K ref offset 0--][--4K ref offset 0--][--60K ref offset 12K--] | v [-----4K extent-----] at 1103175680 We want to find all of the references to extent bytenr 1103101952. Without the patch (and without running btrfs-debug-tree), we have to do it with 18 LOGICAL_INO calls: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO inode 261 offset 0 root 5 root@tester:~# for x in $(seq 0 17); do btrfs ins log $((1103101952 + x * 4096)) -P /test/; done 2>&1 | grep inode inode 261 offset 0 root 5 inode 261 offset 4096 root 5 <- same extent ref as offset 0 (offset 8192 returns empty set, not reachable) inode 261 offset 12288 root 5 inode 261 offset 16384 root 5 \ inode 261 offset 20480 root 5 | inode 261 offset 24576 root 5 | inode 261 offset 28672 root 5 | inode 261 offset 32768 root 5 | inode 261 offset 36864 root 5 \ inode 261 offset 40960 root 5 > all the same extent ref as offset 12288. inode 261 offset 45056 root 5 / More processing required in userspace inode 261 offset 49152 root 5 | to figure out these are all duplicates. inode 261 offset 53248 root 5 | inode 261 offset 57344 root 5 | inode 261 offset 61440 root 5 | inode 261 offset 65536 root 5 | inode 261 offset 69632 root 5 / In the worst case the extents are 128MB long, and we have to do 32768 iterations of the loop to find one 4K extent ref. With the patch, we just use one call to map all refs to the extent at once: root@tester:~# btrfs ins log 1103101952 -P /test/ Using LOGICAL_INO_V2 inode 261 offset 0 root 5 inode 261 offset 12288 root 5 The TREE_SEARCH ioctl allows userspace to retrieve the offset and extent bytenr fields easily once the root, inode and offset are known. This is sufficient information to build a complete map of the extent and all of its references. Userspace can use this information to make better choices to dedup or defrag. Signed-off-by: Zygo Blaxell <ce3g8jdj@umail.furryterror.org> Reviewed-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> Tested-by: Hans van Kranenburg <hans.van.kranenburg@mendix.com> [ copy background and motivation from cover letter ] Signed-off-by: David Sterba <dsterba@suse.com>
2017-09-23 01:58:46 +08:00
return btrfs_ioctl_logical_to_ino(fs_info, argp, 1);
case BTRFS_IOC_LOGICAL_INO_V2:
return btrfs_ioctl_logical_to_ino(fs_info, argp, 2);
case BTRFS_IOC_SPACE_INFO:
return btrfs_ioctl_space_info(fs_info, argp);
Btrfs: fix sync fs to actually wait for all data to be persisted Currently the fs sync function (super.c:btrfs_sync_fs()) doesn't wait for delayed work to finish before returning success to the caller. This change fixes this, ensuring that there's no data loss if a power failure happens right after fs sync returns success to the caller and before the next commit happens. Steps to reproduce the data loss issue: $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ perl -e '$d = ("\x41" x 6001); open($f,">","/mnt/btrfs/foobar"); print $f $d; close($f);' && btrfs fi sync /mnt/btrfs Right after the btrfs fi sync command (a second or 2 for example), power off the machine and reboot it. The file will be empty, as it can be verified after mounting the filesystem and through btrfs-debug-tree: $ btrfs-debug-tree /dev/sdb3 | egrep '\(257 INODE_ITEM 0\) itemoff' -B 3 -A 8 item 3 key (256 DIR_INDEX 2) itemoff 3751 itemsize 36 location key (257 INODE_ITEM 0) type FILE namelen 6 datalen 0 name: foobar item 4 key (257 INODE_ITEM 0) itemoff 3591 itemsize 160 inode generation 7 transid 7 size 0 block group 0 mode 100644 links 1 item 5 key (257 INODE_REF 256) itemoff 3575 itemsize 16 inode ref index 2 namelen 6 name: foobar checksum tree key (CSUM_TREE ROOT_ITEM 0) leaf 29429760 items 0 free space 3995 generation 7 owner 7 fs uuid 6192815c-af2a-4b75-b3db-a959ffb6166e chunk uuid b529c44b-938c-4d3d-910a-013b4700bcae uuid tree key (UUID_TREE ROOT_ITEM 0) After this patch, the data loss no longer happens after a power failure and btrfs-debug-tree shows: $ btrfs-debug-tree /dev/sdb3 | egrep '\(257 INODE_ITEM 0\) itemoff' -B 3 -A 8 item 3 key (256 DIR_INDEX 2) itemoff 3751 itemsize 36 location key (257 INODE_ITEM 0) type FILE namelen 6 datalen 0 name: foobar item 4 key (257 INODE_ITEM 0) itemoff 3591 itemsize 160 inode generation 6 transid 6 size 6001 block group 0 mode 100644 links 1 item 5 key (257 INODE_REF 256) itemoff 3575 itemsize 16 inode ref index 2 namelen 6 name: foobar item 6 key (257 EXTENT_DATA 0) itemoff 3522 itemsize 53 extent data disk byte 12845056 nr 8192 extent data offset 0 nr 8192 ram 8192 extent compression 0 checksum tree key (CSUM_TREE ROOT_ITEM 0) Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Reviewed-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-09-23 18:35:11 +08:00
case BTRFS_IOC_SYNC: {
int ret;
ret = btrfs_start_delalloc_roots(fs_info, -1);
Btrfs: fix sync fs to actually wait for all data to be persisted Currently the fs sync function (super.c:btrfs_sync_fs()) doesn't wait for delayed work to finish before returning success to the caller. This change fixes this, ensuring that there's no data loss if a power failure happens right after fs sync returns success to the caller and before the next commit happens. Steps to reproduce the data loss issue: $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ perl -e '$d = ("\x41" x 6001); open($f,">","/mnt/btrfs/foobar"); print $f $d; close($f);' && btrfs fi sync /mnt/btrfs Right after the btrfs fi sync command (a second or 2 for example), power off the machine and reboot it. The file will be empty, as it can be verified after mounting the filesystem and through btrfs-debug-tree: $ btrfs-debug-tree /dev/sdb3 | egrep '\(257 INODE_ITEM 0\) itemoff' -B 3 -A 8 item 3 key (256 DIR_INDEX 2) itemoff 3751 itemsize 36 location key (257 INODE_ITEM 0) type FILE namelen 6 datalen 0 name: foobar item 4 key (257 INODE_ITEM 0) itemoff 3591 itemsize 160 inode generation 7 transid 7 size 0 block group 0 mode 100644 links 1 item 5 key (257 INODE_REF 256) itemoff 3575 itemsize 16 inode ref index 2 namelen 6 name: foobar checksum tree key (CSUM_TREE ROOT_ITEM 0) leaf 29429760 items 0 free space 3995 generation 7 owner 7 fs uuid 6192815c-af2a-4b75-b3db-a959ffb6166e chunk uuid b529c44b-938c-4d3d-910a-013b4700bcae uuid tree key (UUID_TREE ROOT_ITEM 0) After this patch, the data loss no longer happens after a power failure and btrfs-debug-tree shows: $ btrfs-debug-tree /dev/sdb3 | egrep '\(257 INODE_ITEM 0\) itemoff' -B 3 -A 8 item 3 key (256 DIR_INDEX 2) itemoff 3751 itemsize 36 location key (257 INODE_ITEM 0) type FILE namelen 6 datalen 0 name: foobar item 4 key (257 INODE_ITEM 0) itemoff 3591 itemsize 160 inode generation 6 transid 6 size 6001 block group 0 mode 100644 links 1 item 5 key (257 INODE_REF 256) itemoff 3575 itemsize 16 inode ref index 2 namelen 6 name: foobar item 6 key (257 EXTENT_DATA 0) itemoff 3522 itemsize 53 extent data disk byte 12845056 nr 8192 extent data offset 0 nr 8192 ram 8192 extent compression 0 checksum tree key (CSUM_TREE ROOT_ITEM 0) Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Reviewed-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-09-23 18:35:11 +08:00
if (ret)
return ret;
ret = btrfs_sync_fs(inode->i_sb, 1);
/*
* The transaction thread may want to do more work,
* namely it pokes the cleaner kthread that will start
* processing uncleaned subvols.
*/
wake_up_process(fs_info->transaction_kthread);
Btrfs: fix sync fs to actually wait for all data to be persisted Currently the fs sync function (super.c:btrfs_sync_fs()) doesn't wait for delayed work to finish before returning success to the caller. This change fixes this, ensuring that there's no data loss if a power failure happens right after fs sync returns success to the caller and before the next commit happens. Steps to reproduce the data loss issue: $ mkfs.btrfs -f /dev/sdb3 $ mount /dev/sdb3 /mnt/btrfs $ perl -e '$d = ("\x41" x 6001); open($f,">","/mnt/btrfs/foobar"); print $f $d; close($f);' && btrfs fi sync /mnt/btrfs Right after the btrfs fi sync command (a second or 2 for example), power off the machine and reboot it. The file will be empty, as it can be verified after mounting the filesystem and through btrfs-debug-tree: $ btrfs-debug-tree /dev/sdb3 | egrep '\(257 INODE_ITEM 0\) itemoff' -B 3 -A 8 item 3 key (256 DIR_INDEX 2) itemoff 3751 itemsize 36 location key (257 INODE_ITEM 0) type FILE namelen 6 datalen 0 name: foobar item 4 key (257 INODE_ITEM 0) itemoff 3591 itemsize 160 inode generation 7 transid 7 size 0 block group 0 mode 100644 links 1 item 5 key (257 INODE_REF 256) itemoff 3575 itemsize 16 inode ref index 2 namelen 6 name: foobar checksum tree key (CSUM_TREE ROOT_ITEM 0) leaf 29429760 items 0 free space 3995 generation 7 owner 7 fs uuid 6192815c-af2a-4b75-b3db-a959ffb6166e chunk uuid b529c44b-938c-4d3d-910a-013b4700bcae uuid tree key (UUID_TREE ROOT_ITEM 0) After this patch, the data loss no longer happens after a power failure and btrfs-debug-tree shows: $ btrfs-debug-tree /dev/sdb3 | egrep '\(257 INODE_ITEM 0\) itemoff' -B 3 -A 8 item 3 key (256 DIR_INDEX 2) itemoff 3751 itemsize 36 location key (257 INODE_ITEM 0) type FILE namelen 6 datalen 0 name: foobar item 4 key (257 INODE_ITEM 0) itemoff 3591 itemsize 160 inode generation 6 transid 6 size 6001 block group 0 mode 100644 links 1 item 5 key (257 INODE_REF 256) itemoff 3575 itemsize 16 inode ref index 2 namelen 6 name: foobar item 6 key (257 EXTENT_DATA 0) itemoff 3522 itemsize 53 extent data disk byte 12845056 nr 8192 extent data offset 0 nr 8192 ram 8192 extent compression 0 checksum tree key (CSUM_TREE ROOT_ITEM 0) Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com> Reviewed-by: Miao Xie <miaox@cn.fujitsu.com> Signed-off-by: Josef Bacik <jbacik@fusionio.com> Signed-off-by: Chris Mason <chris.mason@fusionio.com>
2013-09-23 18:35:11 +08:00
return ret;
}
Btrfs: add START_SYNC, WAIT_SYNC ioctls START_SYNC will start a sync/commit, but not wait for it to complete. Any modification started after the ioctl returns is guaranteed not to be included in the commit. If a non-NULL pointer is passed, the transaction id will be returned to userspace. WAIT_SYNC will wait for any in-progress commit to complete. If a transaction id is specified, the ioctl will block and then return (success) when the specified transaction has committed. If it has already committed when we call the ioctl, it returns immediately. If the specified transaction doesn't exist, it returns EINVAL. If no transaction id is specified, WAIT_SYNC will wait for the currently committing transaction to finish it's commit to disk. If there is no currently committing transaction, it returns success. These ioctls are useful for applications which want to impose an ordering on when fs modifications reach disk, but do not want to wait for the full (slow) commit process to do so. Picky callers can take the transid returned by START_SYNC and feed it to WAIT_SYNC, and be certain to wait only as long as necessary for the transaction _they_ started to reach disk. Sloppy callers can START_SYNC and WAIT_SYNC without a transid, and provided they didn't wait too long between the calls, they will get the same result. However, if a second commit starts before they call WAIT_SYNC, they may end up waiting longer for it to commit as well. Even so, a START_SYNC+WAIT_SYNC still guarantees that any operation completed before the START_SYNC reaches disk. Signed-off-by: Sage Weil <sage@newdream.net> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-30 03:41:32 +08:00
case BTRFS_IOC_START_SYNC:
return btrfs_ioctl_start_sync(root, argp);
Btrfs: add START_SYNC, WAIT_SYNC ioctls START_SYNC will start a sync/commit, but not wait for it to complete. Any modification started after the ioctl returns is guaranteed not to be included in the commit. If a non-NULL pointer is passed, the transaction id will be returned to userspace. WAIT_SYNC will wait for any in-progress commit to complete. If a transaction id is specified, the ioctl will block and then return (success) when the specified transaction has committed. If it has already committed when we call the ioctl, it returns immediately. If the specified transaction doesn't exist, it returns EINVAL. If no transaction id is specified, WAIT_SYNC will wait for the currently committing transaction to finish it's commit to disk. If there is no currently committing transaction, it returns success. These ioctls are useful for applications which want to impose an ordering on when fs modifications reach disk, but do not want to wait for the full (slow) commit process to do so. Picky callers can take the transid returned by START_SYNC and feed it to WAIT_SYNC, and be certain to wait only as long as necessary for the transaction _they_ started to reach disk. Sloppy callers can START_SYNC and WAIT_SYNC without a transid, and provided they didn't wait too long between the calls, they will get the same result. However, if a second commit starts before they call WAIT_SYNC, they may end up waiting longer for it to commit as well. Even so, a START_SYNC+WAIT_SYNC still guarantees that any operation completed before the START_SYNC reaches disk. Signed-off-by: Sage Weil <sage@newdream.net> Signed-off-by: Chris Mason <chris.mason@oracle.com>
2010-10-30 03:41:32 +08:00
case BTRFS_IOC_WAIT_SYNC:
return btrfs_ioctl_wait_sync(fs_info, argp);
case BTRFS_IOC_SCRUB:
return btrfs_ioctl_scrub(file, argp);
case BTRFS_IOC_SCRUB_CANCEL:
return btrfs_ioctl_scrub_cancel(fs_info);
case BTRFS_IOC_SCRUB_PROGRESS:
return btrfs_ioctl_scrub_progress(fs_info, argp);
case BTRFS_IOC_BALANCE_V2:
return btrfs_ioctl_balance(file, argp);
case BTRFS_IOC_BALANCE_CTL:
return btrfs_ioctl_balance_ctl(fs_info, arg);
case BTRFS_IOC_BALANCE_PROGRESS:
return btrfs_ioctl_balance_progress(fs_info, argp);
case BTRFS_IOC_SET_RECEIVED_SUBVOL:
return btrfs_ioctl_set_received_subvol(file, argp);
#ifdef CONFIG_64BIT
case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
return btrfs_ioctl_set_received_subvol_32(file, argp);
#endif
case BTRFS_IOC_SEND:
return _btrfs_ioctl_send(file, argp, false);
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
case BTRFS_IOC_SEND_32:
return _btrfs_ioctl_send(file, argp, true);
#endif
case BTRFS_IOC_GET_DEV_STATS:
return btrfs_ioctl_get_dev_stats(fs_info, argp);
case BTRFS_IOC_QUOTA_CTL:
return btrfs_ioctl_quota_ctl(file, argp);
case BTRFS_IOC_QGROUP_ASSIGN:
return btrfs_ioctl_qgroup_assign(file, argp);
case BTRFS_IOC_QGROUP_CREATE:
return btrfs_ioctl_qgroup_create(file, argp);
case BTRFS_IOC_QGROUP_LIMIT:
return btrfs_ioctl_qgroup_limit(file, argp);
case BTRFS_IOC_QUOTA_RESCAN:
return btrfs_ioctl_quota_rescan(file, argp);
case BTRFS_IOC_QUOTA_RESCAN_STATUS:
return btrfs_ioctl_quota_rescan_status(fs_info, argp);
case BTRFS_IOC_QUOTA_RESCAN_WAIT:
return btrfs_ioctl_quota_rescan_wait(fs_info, argp);
case BTRFS_IOC_DEV_REPLACE:
return btrfs_ioctl_dev_replace(fs_info, argp);
case BTRFS_IOC_GET_SUPPORTED_FEATURES:
return btrfs_ioctl_get_supported_features(argp);
case BTRFS_IOC_GET_FEATURES:
return btrfs_ioctl_get_features(fs_info, argp);
case BTRFS_IOC_SET_FEATURES:
return btrfs_ioctl_set_features(file, argp);
case FS_IOC_FSGETXATTR:
return btrfs_ioctl_fsgetxattr(file, argp);
case FS_IOC_FSSETXATTR:
return btrfs_ioctl_fssetxattr(file, argp);
case BTRFS_IOC_GET_SUBVOL_INFO:
return btrfs_ioctl_get_subvol_info(file, argp);
case BTRFS_IOC_GET_SUBVOL_ROOTREF:
return btrfs_ioctl_get_subvol_rootref(file, argp);
case BTRFS_IOC_INO_LOOKUP_USER:
return btrfs_ioctl_ino_lookup_user(file, argp);
}
return -ENOTTY;
}
#ifdef CONFIG_COMPAT
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
/*
* These all access 32-bit values anyway so no further
* handling is necessary.
*/
switch (cmd) {
case FS_IOC32_GETFLAGS:
cmd = FS_IOC_GETFLAGS;
break;
case FS_IOC32_SETFLAGS:
cmd = FS_IOC_SETFLAGS;
break;
case FS_IOC32_GETVERSION:
cmd = FS_IOC_GETVERSION;
break;
}
return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif