2020-02-28 21:04:17 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
|
|
|
|
2020-02-28 21:04:19 +08:00
|
|
|
#include <linux/blkdev.h>
|
2020-02-28 21:04:17 +08:00
|
|
|
#include <linux/iversion.h>
|
2020-02-28 21:04:19 +08:00
|
|
|
#include "compression.h"
|
2020-02-28 21:04:17 +08:00
|
|
|
#include "ctree.h"
|
2020-02-28 21:04:19 +08:00
|
|
|
#include "delalloc-space.h"
|
2020-02-28 21:04:17 +08:00
|
|
|
#include "reflink.h"
|
|
|
|
#include "transaction.h"
|
|
|
|
|
|
|
|
#define BTRFS_MAX_DEDUPE_LEN SZ_16M
|
|
|
|
|
|
|
|
static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
|
|
|
|
struct inode *inode,
|
|
|
|
u64 endoff,
|
|
|
|
const u64 destoff,
|
|
|
|
const u64 olen,
|
|
|
|
int no_time_update)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = BTRFS_I(inode)->root;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
inode_inc_iversion(inode);
|
|
|
|
if (!no_time_update)
|
|
|
|
inode->i_mtime = inode->i_ctime = current_time(inode);
|
|
|
|
/*
|
|
|
|
* We round up to the block size at eof when determining which
|
|
|
|
* extents to clone above, but shouldn't round up the file size.
|
|
|
|
*/
|
|
|
|
if (endoff > destoff + olen)
|
|
|
|
endoff = destoff + olen;
|
|
|
|
if (endoff > inode->i_size) {
|
|
|
|
i_size_write(inode, endoff);
|
|
|
|
btrfs_inode_safe_disk_i_size_write(inode, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
ret = btrfs_update_inode(trans, root, inode);
|
|
|
|
if (ret) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
btrfs_end_transaction(trans);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
ret = btrfs_end_transaction(trans);
|
|
|
|
out:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-08-31 19:42:47 +08:00
|
|
|
static int copy_inline_to_page(struct btrfs_inode *inode,
|
2020-02-28 21:04:19 +08:00
|
|
|
const u64 file_offset,
|
|
|
|
char *inline_data,
|
|
|
|
const u64 size,
|
|
|
|
const u64 datal,
|
|
|
|
const u8 comp_type)
|
|
|
|
{
|
2020-08-31 19:42:47 +08:00
|
|
|
const u64 block_size = btrfs_inode_sectorsize(inode);
|
2020-02-28 21:04:19 +08:00
|
|
|
const u64 range_end = file_offset + block_size - 1;
|
|
|
|
const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
|
|
|
|
char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
|
|
|
|
struct extent_changeset *data_reserved = NULL;
|
|
|
|
struct page *page = NULL;
|
2020-08-31 19:42:47 +08:00
|
|
|
struct address_space *mapping = inode->vfs_inode.i_mapping;
|
2020-02-28 21:04:19 +08:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
ASSERT(IS_ALIGNED(file_offset, block_size));
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We have flushed and locked the ranges of the source and destination
|
|
|
|
* inodes, we also have locked the inodes, so we are safe to do a
|
|
|
|
* reservation here. Also we must not do the reservation while holding
|
|
|
|
* a transaction open, otherwise we would deadlock.
|
|
|
|
*/
|
2020-08-31 19:42:47 +08:00
|
|
|
ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
|
|
|
|
block_size);
|
2020-02-28 21:04:19 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
2020-08-31 19:42:47 +08:00
|
|
|
page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
|
|
|
|
btrfs_alloc_write_mask(mapping));
|
2020-02-28 21:04:19 +08:00
|
|
|
if (!page) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto out_unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
set_page_extent_mapped(page);
|
2020-08-31 19:42:47 +08:00
|
|
|
clear_extent_bit(&inode->io_tree, file_offset, range_end,
|
2020-02-28 21:04:19 +08:00
|
|
|
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
|
|
|
|
0, 0, NULL);
|
2020-08-31 19:42:47 +08:00
|
|
|
ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
|
2020-02-28 21:04:19 +08:00
|
|
|
if (ret)
|
|
|
|
goto out_unlock;
|
|
|
|
|
|
|
|
if (comp_type == BTRFS_COMPRESS_NONE) {
|
|
|
|
char *map;
|
|
|
|
|
|
|
|
map = kmap(page);
|
|
|
|
memcpy(map, data_start, datal);
|
|
|
|
flush_dcache_page(page);
|
|
|
|
kunmap(page);
|
|
|
|
} else {
|
|
|
|
ret = btrfs_decompress(comp_type, data_start, page, 0,
|
|
|
|
inline_size, datal);
|
|
|
|
if (ret)
|
|
|
|
goto out_unlock;
|
|
|
|
flush_dcache_page(page);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If our inline data is smaller then the block/page size, then the
|
|
|
|
* remaining of the block/page is equivalent to zeroes. We had something
|
|
|
|
* like the following done:
|
|
|
|
*
|
|
|
|
* $ xfs_io -f -c "pwrite -S 0xab 0 500" file
|
|
|
|
* $ sync # (or fsync)
|
|
|
|
* $ xfs_io -c "falloc 0 4K" file
|
|
|
|
* $ xfs_io -c "pwrite -S 0xcd 4K 4K"
|
|
|
|
*
|
|
|
|
* So what's in the range [500, 4095] corresponds to zeroes.
|
|
|
|
*/
|
|
|
|
if (datal < block_size) {
|
|
|
|
char *map;
|
|
|
|
|
|
|
|
map = kmap(page);
|
|
|
|
memset(map + datal, 0, block_size - datal);
|
|
|
|
flush_dcache_page(page);
|
|
|
|
kunmap(page);
|
|
|
|
}
|
|
|
|
|
|
|
|
SetPageUptodate(page);
|
|
|
|
ClearPageChecked(page);
|
|
|
|
set_page_dirty(page);
|
|
|
|
out_unlock:
|
|
|
|
if (page) {
|
|
|
|
unlock_page(page);
|
|
|
|
put_page(page);
|
|
|
|
}
|
|
|
|
if (ret)
|
2020-08-31 19:42:47 +08:00
|
|
|
btrfs_delalloc_release_space(inode, data_reserved, file_offset,
|
|
|
|
block_size, true);
|
|
|
|
btrfs_delalloc_release_extents(inode, block_size);
|
2020-02-28 21:04:19 +08:00
|
|
|
out:
|
|
|
|
extent_changeset_free(data_reserved);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2020-02-28 21:04:17 +08:00
|
|
|
/*
|
2020-02-28 21:04:19 +08:00
|
|
|
* Deal with cloning of inline extents. We try to copy the inline extent from
|
|
|
|
* the source inode to destination inode when possible. When not possible we
|
|
|
|
* copy the inline extent's data into the respective page of the inode.
|
2020-02-28 21:04:17 +08:00
|
|
|
*/
|
|
|
|
static int clone_copy_inline_extent(struct inode *dst,
|
|
|
|
struct btrfs_path *path,
|
|
|
|
struct btrfs_key *new_key,
|
|
|
|
const u64 drop_start,
|
|
|
|
const u64 datal,
|
|
|
|
const u64 size,
|
2020-02-28 21:04:19 +08:00
|
|
|
const u8 comp_type,
|
|
|
|
char *inline_data,
|
|
|
|
struct btrfs_trans_handle **trans_out)
|
2020-02-28 21:04:17 +08:00
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
|
|
|
|
struct btrfs_root *root = BTRFS_I(dst)->root;
|
|
|
|
const u64 aligned_end = ALIGN(new_key->offset + datal,
|
|
|
|
fs_info->sectorsize);
|
2020-02-28 21:04:19 +08:00
|
|
|
struct btrfs_trans_handle *trans = NULL;
|
2020-11-04 19:07:32 +08:00
|
|
|
struct btrfs_drop_extents_args drop_args = { 0 };
|
2020-02-28 21:04:17 +08:00
|
|
|
int ret;
|
|
|
|
struct btrfs_key key;
|
|
|
|
|
2020-02-28 21:04:19 +08:00
|
|
|
if (new_key->offset > 0) {
|
2020-08-31 19:42:47 +08:00
|
|
|
ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
|
|
|
|
inline_data, size, datal, comp_type);
|
2020-02-28 21:04:19 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2020-02-28 21:04:17 +08:00
|
|
|
|
|
|
|
key.objectid = btrfs_ino(BTRFS_I(dst));
|
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
key.offset = 0;
|
|
|
|
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
|
|
|
|
if (ret < 0) {
|
|
|
|
return ret;
|
|
|
|
} else if (ret > 0) {
|
|
|
|
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
|
|
|
|
ret = btrfs_next_leaf(root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
return ret;
|
|
|
|
else if (ret > 0)
|
|
|
|
goto copy_inline_extent;
|
|
|
|
}
|
|
|
|
btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
|
|
|
|
if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
|
|
|
|
key.type == BTRFS_EXTENT_DATA_KEY) {
|
2020-02-28 21:04:19 +08:00
|
|
|
/*
|
|
|
|
* There's an implicit hole at file offset 0, copy the
|
|
|
|
* inline extent's data to the page.
|
|
|
|
*/
|
2020-02-28 21:04:17 +08:00
|
|
|
ASSERT(key.offset > 0);
|
2020-08-31 19:42:47 +08:00
|
|
|
ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
|
2020-02-28 21:04:19 +08:00
|
|
|
inline_data, size, datal,
|
|
|
|
comp_type);
|
|
|
|
goto out;
|
2020-02-28 21:04:17 +08:00
|
|
|
}
|
|
|
|
} else if (i_size_read(dst) <= datal) {
|
|
|
|
struct btrfs_file_extent_item *ei;
|
|
|
|
|
|
|
|
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
|
|
|
|
struct btrfs_file_extent_item);
|
|
|
|
/*
|
2020-02-28 21:04:19 +08:00
|
|
|
* If it's an inline extent replace it with the source inline
|
|
|
|
* extent, otherwise copy the source inline extent data into
|
|
|
|
* the respective page at the destination inode.
|
2020-02-28 21:04:17 +08:00
|
|
|
*/
|
|
|
|
if (btrfs_file_extent_type(path->nodes[0], ei) ==
|
|
|
|
BTRFS_FILE_EXTENT_INLINE)
|
|
|
|
goto copy_inline_extent;
|
|
|
|
|
2020-08-31 19:42:47 +08:00
|
|
|
ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
|
|
|
|
inline_data, size, datal, comp_type);
|
2020-02-28 21:04:19 +08:00
|
|
|
goto out;
|
2020-02-28 21:04:17 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
copy_inline_extent:
|
2020-02-28 21:04:19 +08:00
|
|
|
ret = 0;
|
2020-02-28 21:04:17 +08:00
|
|
|
/*
|
|
|
|
* We have no extent items, or we have an extent at offset 0 which may
|
|
|
|
* or may not be inlined. All these cases are dealt the same way.
|
|
|
|
*/
|
|
|
|
if (i_size_read(dst) > datal) {
|
|
|
|
/*
|
2020-02-28 21:04:19 +08:00
|
|
|
* At the destination offset 0 we have either a hole, a regular
|
|
|
|
* extent or an inline extent larger then the one we want to
|
|
|
|
* clone. Deal with all these cases by copying the inline extent
|
|
|
|
* data into the respective page at the destination inode.
|
2020-02-28 21:04:17 +08:00
|
|
|
*/
|
2020-08-31 19:42:47 +08:00
|
|
|
ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
|
|
|
|
inline_data, size, datal, comp_type);
|
2020-02-28 21:04:19 +08:00
|
|
|
goto out;
|
2020-02-28 21:04:17 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_release_path(path);
|
2020-02-28 21:04:19 +08:00
|
|
|
/*
|
|
|
|
* If we end up here it means were copy the inline extent into a leaf
|
|
|
|
* of the destination inode. We know we will drop or adjust at most one
|
|
|
|
* extent item in the destination root.
|
|
|
|
*
|
|
|
|
* 1 unit - adjusting old extent (we may have to split it)
|
|
|
|
* 1 unit - add new extent
|
|
|
|
* 1 unit - inode update
|
|
|
|
*/
|
|
|
|
trans = btrfs_start_transaction(root, 3);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
trans = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
2020-11-04 19:07:32 +08:00
|
|
|
drop_args.path = path;
|
|
|
|
drop_args.start = drop_start;
|
|
|
|
drop_args.end = aligned_end;
|
|
|
|
drop_args.drop_cache = true;
|
|
|
|
ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args);
|
2020-02-28 21:04:17 +08:00
|
|
|
if (ret)
|
2020-02-28 21:04:19 +08:00
|
|
|
goto out;
|
2020-02-28 21:04:17 +08:00
|
|
|
ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
|
|
|
|
if (ret)
|
2020-02-28 21:04:19 +08:00
|
|
|
goto out;
|
2020-02-28 21:04:17 +08:00
|
|
|
|
|
|
|
write_extent_buffer(path->nodes[0], inline_data,
|
|
|
|
btrfs_item_ptr_offset(path->nodes[0],
|
|
|
|
path->slots[0]),
|
|
|
|
size);
|
btrfs: update the number of bytes used by an inode atomically
There are several occasions where we do not update the inode's number of
used bytes atomically, resulting in a concurrent stat(2) syscall to report
a value of used blocks that does not correspond to a valid value, that is,
a value that does not match neither what we had before the operation nor
what we get after the operation completes.
In extreme cases it can result in stat(2) reporting zero used blocks, which
can cause problems for some userspace tools where they can consider a file
with a non-zero size and zero used blocks as completely sparse and skip
reading data, as reported/discussed a long time ago in some threads like
the following:
https://lists.gnu.org/archive/html/bug-tar/2016-07/msg00001.html
The cases where this can happen are the following:
-> Case 1
If we do a write (buffered or direct IO) against a file region for which
there is already an allocated extent (or multiple extents), then we have a
short time window where we can report a number of used blocks to stat(2)
that does not take into account the file region being overwritten. This
short time window happens when completing the ordered extent(s).
This happens because when we drop the extents in the write range we
decrement the inode's number of bytes and later on when we insert the new
extent(s) we increment the number of bytes in the inode, resulting in a
short time window where a stat(2) syscall can get an incorrect number of
used blocks.
If we do writes that overwrite an entire file, then we have a short time
window where we report 0 used blocks to stat(2).
Example reproducer:
$ cat reproducer-1.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
xfs_io -f -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
expected=$(stat -c %b $MNT/foobar)
# Create a process to keep calling stat(2) on the file and see if the
# reported number of blocks used (disk space used) changes, it should
# not because we are not increasing the file size nor punching holes.
stat_loop $MNT/foobar $expected &
loop_pid=$!
for ((i = 0; i < 50000; i++)); do
xfs_io -s -c "pwrite -b 64K 0 64K" $MNT/foobar >/dev/null
done
kill $loop_pid &> /dev/null
wait
umount $DEV
$ ./reproducer-1.sh
ERROR: unexpected used blocks (got: 0 expected: 128)
ERROR: unexpected used blocks (got: 0 expected: 128)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 2
If we do a buffered write against a file region that does not have any
allocated extents, like a hole or beyond EOF, then during ordered extent
completion we have a short time window where a concurrent stat(2) syscall
can report a number of used blocks that does not correspond to the value
before or after the write operation, a value that is actually larger than
the value after the write completes.
This happens because once we start a buffered write into an unallocated
file range we increment the inode's 'new_delalloc_bytes', to make sure
any stat(2) call gets a correct used blocks value before delalloc is
flushed and completes. However at ordered extent completion, after we
inserted the new extent, we increment the inode's number of bytes used
with the size of the new extent, and only later, when clearing the range
in the inode's iotree, we decrement the inode's 'new_delalloc_bytes'
counter with the size of the extent. So this results in a short time
window where a concurrent stat(2) syscall can report a number of used
blocks that accounts for the new extent twice.
Example reproducer:
$ cat reproducer-2.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
stat_loop()
{
trap "wait; exit" SIGTERM
local filepath=$1
local expected=$2
local got
while :; do
got=$(stat -c %b $filepath)
if [ $got -ne $expected ]; then
echo -n "ERROR: unexpected used blocks"
echo " (got: $got expected: $expected)"
fi
done
}
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f $DEV > /dev/null
# mkfs.ext4 -F $DEV > /dev/null
# mkfs.f2fs -f $DEV > /dev/null
# mkfs.reiserfs -f $DEV > /dev/null
mount $DEV $MNT
touch $MNT/foobar
write_size=$((64 * 1024))
for ((i = 0; i < 16384; i++)); do
offset=$(($i * $write_size))
xfs_io -c "pwrite -S 0xab $offset $write_size" $MNT/foobar >/dev/null
blocks_used=$(stat -c %b $MNT/foobar)
# Fsync the file to trigger writeback and keep calling stat(2) on it
# to see if the number of blocks used changes.
stat_loop $MNT/foobar $blocks_used &
loop_pid=$!
xfs_io -c "fsync" $MNT/foobar
kill $loop_pid &> /dev/null
wait $loop_pid
done
umount $DEV
$ ./reproducer-2.sh
ERROR: unexpected used blocks (got: 265472 expected: 265344)
ERROR: unexpected used blocks (got: 284032 expected: 283904)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
-> Case 3
Another case where such problems happen is during other operations that
replace extents in a file range with other extents. Those operations are
extent cloning, deduplication and fallocate's zero range operation.
The cause of the problem is similar to the first case. When we drop the
extents from a range, we decrement the inode's number of bytes, and later
on, after inserting the new extents we increment it. Since this is not
done atomically, a concurrent stat(2) call can see and return a number of
used blocks that is smaller than it should be, does not match the number
of used blocks before or after the clone/deduplication/zero operation.
Like for the first case, when doing a clone, deduplication or zero range
operation against an entire file, we end up having a time window where we
can report 0 used blocks to a stat(2) call.
Example reproducer:
$ cat reproducer-3.sh
#!/bin/bash
MNT=/mnt/sdi
DEV=/dev/sdi
mkfs.btrfs -f $DEV > /dev/null
# mkfs.xfs -f -m reflink=1 $DEV > /dev/null
mount $DEV $MNT
extent_size=$((64 * 1024))
num_extents=16384
file_size=$(($extent_size * $num_extents))
# File foo has many small extents.
xfs_io -f -s -c "pwrite -S 0xab -b $extent_size 0 $file_size" $MNT/foo \
> /dev/null
# File bar has much less extents and has exactly the same data as foo.
xfs_io -f -c "pwrite -S 0xab 0 $file_size" $MNT/bar > /dev/null
expected=$(stat -c %b $MNT/foo)
# Now deduplicate bar into foo. While the deduplication is in progres,
# the number of used blocks/file size reported by stat should not change
xfs_io -c "dedupe $MNT/bar 0 0 $file_size" $MNT/foo > /dev/null &
dedupe_pid=$!
while [ -n "$(ps -p $dedupe_pid -o pid=)" ]; do
used=$(stat -c %b $MNT/foo)
if [ $used -ne $expected ]; then
echo "Unexpected blocks used: $used (expected: $expected)"
fi
done
umount $DEV
$ ./reproducer-3.sh
Unexpected blocks used: 2076800 (expected: 2097152)
Unexpected blocks used: 2097024 (expected: 2097152)
Unexpected blocks used: 2079872 (expected: 2097152)
(...)
Note that since this is a short time window where the race can happen, the
reproducer may not be able to always trigger the bug in one run, or it may
trigger it multiple times.
So fix this by:
1) Making btrfs_drop_extents() not decrement the VFS inode's number of
bytes, and instead return the number of bytes;
2) Making any code that drops extents and adds new extents update the
inode's number of bytes atomically, while holding the btrfs inode's
spinlock, which is also used by the stat(2) callback to get the inode's
number of bytes;
3) For ranges in the inode's iotree that are marked as 'delalloc new',
corresponding to previously unallocated ranges, increment the inode's
number of bytes when clearing the 'delalloc new' bit from the range,
in the same critical section that decrements the inode's
'new_delalloc_bytes' counter, delimited by the btrfs inode's spinlock.
An alternative would be to have btrfs_getattr() wait for any IO (ordered
extents in progress) and locking the whole range (0 to (u64)-1) while it
it computes the number of blocks used. But that would mean blocking
stat(2), which is a very used syscall and expected to be fast, waiting
for writes, clone/dedupe, fallocate, page reads, fiemap, etc.
CC: stable@vger.kernel.org # 5.4+
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-04 19:07:34 +08:00
|
|
|
btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found);
|
2020-02-28 21:04:17 +08:00
|
|
|
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
|
2020-04-05 04:20:22 +08:00
|
|
|
ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
|
2020-02-28 21:04:19 +08:00
|
|
|
out:
|
|
|
|
if (!ret && !trans) {
|
|
|
|
/*
|
|
|
|
* No transaction here means we copied the inline extent into a
|
|
|
|
* page of the destination inode.
|
|
|
|
*
|
|
|
|
* 1 unit to update inode item
|
|
|
|
*/
|
|
|
|
trans = btrfs_start_transaction(root, 1);
|
|
|
|
if (IS_ERR(trans)) {
|
|
|
|
ret = PTR_ERR(trans);
|
|
|
|
trans = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (ret && trans) {
|
|
|
|
btrfs_abort_transaction(trans, ret);
|
|
|
|
btrfs_end_transaction(trans);
|
|
|
|
}
|
|
|
|
if (!ret)
|
|
|
|
*trans_out = trans;
|
2020-02-28 21:04:17 +08:00
|
|
|
|
2020-02-28 21:04:19 +08:00
|
|
|
return ret;
|
2020-02-28 21:04:17 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
|
|
|
* btrfs_clone() - clone a range from inode file to another
|
|
|
|
*
|
|
|
|
* @src: Inode to clone from
|
|
|
|
* @inode: Inode to clone to
|
|
|
|
* @off: Offset within source to start clone from
|
|
|
|
* @olen: Original length, passed by user, of range to clone
|
|
|
|
* @olen_aligned: Block-aligned value of olen
|
|
|
|
* @destoff: Offset within @inode to start clone
|
|
|
|
* @no_time_update: Whether to update mtime/ctime on the target inode
|
|
|
|
*/
|
|
|
|
static int btrfs_clone(struct inode *src, struct inode *inode,
|
|
|
|
const u64 off, const u64 olen, const u64 olen_aligned,
|
|
|
|
const u64 destoff, int no_time_update)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
|
|
|
struct btrfs_path *path = NULL;
|
|
|
|
struct extent_buffer *leaf;
|
|
|
|
struct btrfs_trans_handle *trans;
|
|
|
|
char *buf = NULL;
|
|
|
|
struct btrfs_key key;
|
|
|
|
u32 nritems;
|
|
|
|
int slot;
|
|
|
|
int ret;
|
|
|
|
const u64 len = olen_aligned;
|
|
|
|
u64 last_dest_end = destoff;
|
|
|
|
|
|
|
|
ret = -ENOMEM;
|
|
|
|
buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
|
|
|
|
if (!buf)
|
|
|
|
return ret;
|
|
|
|
|
|
|
|
path = btrfs_alloc_path();
|
|
|
|
if (!path) {
|
|
|
|
kvfree(buf);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
path->reada = READA_FORWARD;
|
|
|
|
/* Clone data */
|
|
|
|
key.objectid = btrfs_ino(BTRFS_I(src));
|
|
|
|
key.type = BTRFS_EXTENT_DATA_KEY;
|
|
|
|
key.offset = off;
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
u64 next_key_min_offset = key.offset + 1;
|
|
|
|
struct btrfs_file_extent_item *extent;
|
btrfs: reduce contention on log trees when logging checksums
The possibility of extents being shared (through clone and deduplication
operations) requires special care when logging data checksums, to avoid
having a log tree with different checksum items that cover ranges which
overlap (which resulted in missing checksums after replaying a log tree).
Such problems were fixed in the past by the following commits:
commit 40e046acbd2f ("Btrfs: fix missing data checksums after replaying a
log tree")
commit e289f03ea79b ("btrfs: fix corrupt log due to concurrent fsync of
inodes with shared extents")
Test case generic/588 exercises the scenario solved by the first commit
(purely sequential and deterministic) while test case generic/457 often
triggered the case fixed by the second commit (not deterministic, requires
specific timings under concurrency).
The problems were addressed by deleting, from the log tree, any existing
checksums before logging the new ones. And also by doing the deletion and
logging of the checksums while locking the checksum range in an extent io
tree (root->log_csum_range), to deal with the case where we have concurrent
fsyncs against files with shared extents.
That however causes more contention on the leaves of a log tree where we
store checksums (and all the nodes in the paths leading to them), even
when we do not have shared extents, or all the shared extents were created
by past transactions. It also adds a bit of contention on the spin lock of
the log_csums_range extent io tree of the log root.
This change adds a 'last_reflink_trans' field to the inode to keep track
of the last transaction where a new extent was shared between inodes
(through clone and deduplication operations). It is updated for both the
source and destination inodes of reflink operations whenever a new extent
(created in the current transaction) becomes shared by the inodes. This
field is kept in memory only, not persisted in the inode item, similar
to other existing fields (last_unlink_trans, logged_trans).
When logging checksums for an extent, if the value of 'last_reflink_trans'
is smaller than the current transaction's generation/id, we skip locking
the extent range and deletion of checksums from the log tree, since we
know we do not have new shared extents. This reduces contention on the
log tree's leaves where checksums are stored.
The following script, which uses fio, was used to measure the impact of
this change:
$ cat test-fsync.sh
#!/bin/bash
DEV=/dev/sdk
MNT=/mnt/sdk
MOUNT_OPTIONS="-o ssd"
MKFS_OPTIONS="-d single -m single"
if [ $# -ne 3 ]; then
echo "Use $0 NUM_JOBS FILE_SIZE FSYNC_FREQ"
exit 1
fi
NUM_JOBS=$1
FILE_SIZE=$2
FSYNC_FREQ=$3
cat <<EOF > /tmp/fio-job.ini
[writers]
rw=write
fsync=$FSYNC_FREQ
fallocate=none
group_reporting=1
direct=0
bs=64k
ioengine=sync
size=$FILE_SIZE
directory=$MNT
numjobs=$NUM_JOBS
EOF
echo "Using config:"
echo
cat /tmp/fio-job.ini
echo
mkfs.btrfs -f $MKFS_OPTIONS $DEV
mount $MOUNT_OPTIONS $DEV $MNT
fio /tmp/fio-job.ini
umount $MNT
The tests were performed for different numbers of jobs, file sizes and
fsync frequency. A qemu VM using kvm was used, with 8 cores (the host has
12 cores, with cpu governance set to performance mode on all cores), 16GiB
of ram (the host has 64GiB) and using a NVMe device directly (without an
intermediary filesystem in the host). While running the tests, the host
was not used for anything else, to avoid disturbing the tests.
The obtained results were the following (the last line of fio's output was
pasted). Starting with 16 jobs is where a significant difference is
observable in this particular setup and hardware (differences highlighted
below). The very small differences for tests with less than 16 jobs are
possibly just noise and random.
**** 1 job, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=23.8MiB/s (24.9MB/s), 23.8MiB/s-23.8MiB/s (24.9MB/s-24.9MB/s), io=1024MiB (1074MB), run=43075-43075msec
after this change:
WRITE: bw=24.4MiB/s (25.6MB/s), 24.4MiB/s-24.4MiB/s (25.6MB/s-25.6MB/s), io=1024MiB (1074MB), run=41938-41938msec
**** 2 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=37.7MiB/s (39.5MB/s), 37.7MiB/s-37.7MiB/s (39.5MB/s-39.5MB/s), io=2048MiB (2147MB), run=54351-54351msec
after this change:
WRITE: bw=37.7MiB/s (39.5MB/s), 37.6MiB/s-37.6MiB/s (39.5MB/s-39.5MB/s), io=2048MiB (2147MB), run=54428-54428msec
**** 4 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=67.5MiB/s (70.8MB/s), 67.5MiB/s-67.5MiB/s (70.8MB/s-70.8MB/s), io=4096MiB (4295MB), run=60669-60669msec
after this change:
WRITE: bw=68.6MiB/s (71.0MB/s), 68.6MiB/s-68.6MiB/s (71.0MB/s-71.0MB/s), io=4096MiB (4295MB), run=59678-59678msec
**** 8 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=128MiB/s (134MB/s), 128MiB/s-128MiB/s (134MB/s-134MB/s), io=8192MiB (8590MB), run=64048-64048msec
after this change:
WRITE: bw=129MiB/s (135MB/s), 129MiB/s-129MiB/s (135MB/s-135MB/s), io=8192MiB (8590MB), run=63405-63405msec
**** 16 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=78.5MiB/s (82.3MB/s), 78.5MiB/s-78.5MiB/s (82.3MB/s-82.3MB/s), io=16.0GiB (17.2GB), run=208676-208676msec
after this change:
WRITE: bw=110MiB/s (115MB/s), 110MiB/s-110MiB/s (115MB/s-115MB/s), io=16.0GiB (17.2GB), run=149295-149295msec
(+40.1% throughput, -28.5% runtime)
**** 32 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=58.8MiB/s (61.7MB/s), 58.8MiB/s-58.8MiB/s (61.7MB/s-61.7MB/s), io=32.0GiB (34.4GB), run=557134-557134msec
after this change:
WRITE: bw=76.1MiB/s (79.8MB/s), 76.1MiB/s-76.1MiB/s (79.8MB/s-79.8MB/s), io=32.0GiB (34.4GB), run=430550-430550msec
(+29.4% throughput, -22.7% runtime)
**** 64 jobs, file size 512M, fsync frequency 1 ****
before this change:
WRITE: bw=65.8MiB/s (68.0MB/s), 65.8MiB/s-65.8MiB/s (68.0MB/s-68.0MB/s), io=32.0GiB (34.4GB), run=498055-498055msec
after this change:
WRITE: bw=85.1MiB/s (89.2MB/s), 85.1MiB/s-85.1MiB/s (89.2MB/s-89.2MB/s), io=32.0GiB (34.4GB), run=385116-385116msec
(+29.3% throughput, -22.7% runtime)
**** 128 jobs, file size 256M, fsync frequency 1 ****
before this change:
WRITE: bw=54.7MiB/s (57.3MB/s), 54.7MiB/s-54.7MiB/s (57.3MB/s-57.3MB/s), io=32.0GiB (34.4GB), run=599373-599373msec
after this change:
WRITE: bw=121MiB/s (126MB/s), 121MiB/s-121MiB/s (126MB/s-126MB/s), io=32.0GiB (34.4GB), run=271907-271907msec
(+121.2% throughput, -54.6% runtime)
**** 256 jobs, file size 256M, fsync frequency 1 ****
before this change:
WRITE: bw=69.2MiB/s (72.5MB/s), 69.2MiB/s-69.2MiB/s (72.5MB/s-72.5MB/s), io=64.0GiB (68.7GB), run=947536-947536msec
after this change:
WRITE: bw=121MiB/s (127MB/s), 121MiB/s-121MiB/s (127MB/s-127MB/s), io=64.0GiB (68.7GB), run=541916-541916msec
(+74.9% throughput, -42.8% runtime)
**** 512 jobs, file size 128M, fsync frequency 1 ****
before this change:
WRITE: bw=85.4MiB/s (89.5MB/s), 85.4MiB/s-85.4MiB/s (89.5MB/s-89.5MB/s), io=64.0GiB (68.7GB), run=767734-767734msec
after this change:
WRITE: bw=141MiB/s (147MB/s), 141MiB/s-141MiB/s (147MB/s-147MB/s), io=64.0GiB (68.7GB), run=466022-466022msec
(+65.1% throughput, -39.3% runtime)
**** 1024 jobs, file size 128M, fsync frequency 1 ****
before this change:
WRITE: bw=115MiB/s (120MB/s), 115MiB/s-115MiB/s (120MB/s-120MB/s), io=128GiB (137GB), run=1143775-1143775msec
after this change:
WRITE: bw=171MiB/s (180MB/s), 171MiB/s-171MiB/s (180MB/s-180MB/s), io=128GiB (137GB), run=764843-764843msec
(+48.7% throughput, -33.1% runtime)
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-15 19:30:43 +08:00
|
|
|
u64 extent_gen;
|
2020-02-28 21:04:17 +08:00
|
|
|
int type;
|
|
|
|
u32 size;
|
|
|
|
struct btrfs_key new_key;
|
|
|
|
u64 disko = 0, diskl = 0;
|
|
|
|
u64 datao = 0, datal = 0;
|
2020-02-28 21:04:19 +08:00
|
|
|
u8 comp;
|
2020-02-28 21:04:17 +08:00
|
|
|
u64 drop_start;
|
|
|
|
|
|
|
|
/* Note the key will change type as we walk through the tree */
|
|
|
|
ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
|
|
|
|
0, 0);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
/*
|
|
|
|
* First search, if no extent item that starts at offset off was
|
|
|
|
* found but the previous item is an extent item, it's possible
|
|
|
|
* it might overlap our target range, therefore process it.
|
|
|
|
*/
|
|
|
|
if (key.offset == off && ret > 0 && path->slots[0] > 0) {
|
|
|
|
btrfs_item_key_to_cpu(path->nodes[0], &key,
|
|
|
|
path->slots[0] - 1);
|
|
|
|
if (key.type == BTRFS_EXTENT_DATA_KEY)
|
|
|
|
path->slots[0]--;
|
|
|
|
}
|
|
|
|
|
|
|
|
nritems = btrfs_header_nritems(path->nodes[0]);
|
|
|
|
process_slot:
|
|
|
|
if (path->slots[0] >= nritems) {
|
|
|
|
ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
if (ret > 0)
|
|
|
|
break;
|
|
|
|
nritems = btrfs_header_nritems(path->nodes[0]);
|
|
|
|
}
|
|
|
|
leaf = path->nodes[0];
|
|
|
|
slot = path->slots[0];
|
|
|
|
|
|
|
|
btrfs_item_key_to_cpu(leaf, &key, slot);
|
|
|
|
if (key.type > BTRFS_EXTENT_DATA_KEY ||
|
|
|
|
key.objectid != btrfs_ino(BTRFS_I(src)))
|
|
|
|
break;
|
|
|
|
|
|
|
|
ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
|
|
|
|
|
|
|
|
extent = btrfs_item_ptr(leaf, slot,
|
|
|
|
struct btrfs_file_extent_item);
|
btrfs: reduce contention on log trees when logging checksums
The possibility of extents being shared (through clone and deduplication
operations) requires special care when logging data checksums, to avoid
having a log tree with different checksum items that cover ranges which
overlap (which resulted in missing checksums after replaying a log tree).
Such problems were fixed in the past by the following commits:
commit 40e046acbd2f ("Btrfs: fix missing data checksums after replaying a
log tree")
commit e289f03ea79b ("btrfs: fix corrupt log due to concurrent fsync of
inodes with shared extents")
Test case generic/588 exercises the scenario solved by the first commit
(purely sequential and deterministic) while test case generic/457 often
triggered the case fixed by the second commit (not deterministic, requires
specific timings under concurrency).
The problems were addressed by deleting, from the log tree, any existing
checksums before logging the new ones. And also by doing the deletion and
logging of the checksums while locking the checksum range in an extent io
tree (root->log_csum_range), to deal with the case where we have concurrent
fsyncs against files with shared extents.
That however causes more contention on the leaves of a log tree where we
store checksums (and all the nodes in the paths leading to them), even
when we do not have shared extents, or all the shared extents were created
by past transactions. It also adds a bit of contention on the spin lock of
the log_csum_range extent io tree of the log root.
This change adds a 'last_reflink_trans' field to the inode to keep track
of the last transaction where a new extent was shared between inodes
(through clone and deduplication operations). It is updated for both the
source and destination inodes of reflink operations whenever a new extent
(created in the current transaction) becomes shared by the inodes. This
field is kept in memory only, not persisted in the inode item, similar
to other existing fields (last_unlink_trans, logged_trans).
When logging checksums for an extent, if the value of 'last_reflink_trans'
is smaller than the current transaction's generation/id, we skip locking
the extent range and deletion of checksums from the log tree, since we
know we do not have new shared extents. This reduces contention on the
log tree's leaves where checksums are stored.
The following script, which uses fio, was used to measure the impact of
this change:
$ cat test-fsync.sh
#!/bin/bash
DEV=/dev/sdk
MNT=/mnt/sdk
MOUNT_OPTIONS="-o ssd"
MKFS_OPTIONS="-d single -m single"
if [ $# -ne 3 ]; then
echo "Use $0 NUM_JOBS FILE_SIZE FSYNC_FREQ"
exit 1
fi
NUM_JOBS=$1
FILE_SIZE=$2
FSYNC_FREQ=$3
cat <<EOF > /tmp/fio-job.ini
[writers]
rw=write
fsync=$FSYNC_FREQ
fallocate=none
group_reporting=1
direct=0
bs=64k
ioengine=sync
size=$FILE_SIZE
directory=$MNT
numjobs=$NUM_JOBS
EOF
echo "Using config:"
echo
cat /tmp/fio-job.ini
echo
mkfs.btrfs -f $MKFS_OPTIONS $DEV
mount $MOUNT_OPTIONS $DEV $MNT
fio /tmp/fio-job.ini
umount $MNT
The tests were performed for different numbers of jobs, file sizes and
fsync frequency. A qemu VM using kvm was used, with 8 cores (the host has
12 cores, with cpu governance set to performance mode on all cores), 16GiB
of ram (the host has 64GiB) and using a NVMe device directly (without an
intermediary filesystem in the host). While running the tests, the host
was not used for anything else, to avoid disturbing the tests.
The obtained results were the following (the last line of fio's output was
pasted). Starting with 16 jobs is where a significant difference is
observable in this particular setup and hardware (differences highlighted
below). The very small differences for tests with less than 16 jobs are
possibly just noise and random.
**** 1 job, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=23.8MiB/s (24.9MB/s), 23.8MiB/s-23.8MiB/s (24.9MB/s-24.9MB/s), io=1024MiB (1074MB), run=43075-43075msec
after this change:
WRITE: bw=24.4MiB/s (25.6MB/s), 24.4MiB/s-24.4MiB/s (25.6MB/s-25.6MB/s), io=1024MiB (1074MB), run=41938-41938msec
**** 2 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=37.7MiB/s (39.5MB/s), 37.7MiB/s-37.7MiB/s (39.5MB/s-39.5MB/s), io=2048MiB (2147MB), run=54351-54351msec
after this change:
WRITE: bw=37.7MiB/s (39.5MB/s), 37.6MiB/s-37.6MiB/s (39.5MB/s-39.5MB/s), io=2048MiB (2147MB), run=54428-54428msec
**** 4 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=67.5MiB/s (70.8MB/s), 67.5MiB/s-67.5MiB/s (70.8MB/s-70.8MB/s), io=4096MiB (4295MB), run=60669-60669msec
after this change:
WRITE: bw=68.6MiB/s (71.0MB/s), 68.6MiB/s-68.6MiB/s (71.0MB/s-71.0MB/s), io=4096MiB (4295MB), run=59678-59678msec
**** 8 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=128MiB/s (134MB/s), 128MiB/s-128MiB/s (134MB/s-134MB/s), io=8192MiB (8590MB), run=64048-64048msec
after this change:
WRITE: bw=129MiB/s (135MB/s), 129MiB/s-129MiB/s (135MB/s-135MB/s), io=8192MiB (8590MB), run=63405-63405msec
**** 16 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=78.5MiB/s (82.3MB/s), 78.5MiB/s-78.5MiB/s (82.3MB/s-82.3MB/s), io=16.0GiB (17.2GB), run=208676-208676msec
after this change:
WRITE: bw=110MiB/s (115MB/s), 110MiB/s-110MiB/s (115MB/s-115MB/s), io=16.0GiB (17.2GB), run=149295-149295msec
(+40.1% throughput, -28.5% runtime)
**** 32 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=58.8MiB/s (61.7MB/s), 58.8MiB/s-58.8MiB/s (61.7MB/s-61.7MB/s), io=32.0GiB (34.4GB), run=557134-557134msec
after this change:
WRITE: bw=76.1MiB/s (79.8MB/s), 76.1MiB/s-76.1MiB/s (79.8MB/s-79.8MB/s), io=32.0GiB (34.4GB), run=430550-430550msec
(+29.4% throughput, -22.7% runtime)
**** 64 jobs, file size 512M, fsync frequency 1 ****
before this change:
WRITE: bw=65.8MiB/s (68.0MB/s), 65.8MiB/s-65.8MiB/s (68.0MB/s-68.0MB/s), io=32.0GiB (34.4GB), run=498055-498055msec
after this change:
WRITE: bw=85.1MiB/s (89.2MB/s), 85.1MiB/s-85.1MiB/s (89.2MB/s-89.2MB/s), io=32.0GiB (34.4GB), run=385116-385116msec
(+29.3% throughput, -22.7% runtime)
**** 128 jobs, file size 256M, fsync frequency 1 ****
before this change:
WRITE: bw=54.7MiB/s (57.3MB/s), 54.7MiB/s-54.7MiB/s (57.3MB/s-57.3MB/s), io=32.0GiB (34.4GB), run=599373-599373msec
after this change:
WRITE: bw=121MiB/s (126MB/s), 121MiB/s-121MiB/s (126MB/s-126MB/s), io=32.0GiB (34.4GB), run=271907-271907msec
(+121.2% throughput, -54.6% runtime)
**** 256 jobs, file size 256M, fsync frequency 1 ****
before this change:
WRITE: bw=69.2MiB/s (72.5MB/s), 69.2MiB/s-69.2MiB/s (72.5MB/s-72.5MB/s), io=64.0GiB (68.7GB), run=947536-947536msec
after this change:
WRITE: bw=121MiB/s (127MB/s), 121MiB/s-121MiB/s (127MB/s-127MB/s), io=64.0GiB (68.7GB), run=541916-541916msec
(+74.9% throughput, -42.8% runtime)
**** 512 jobs, file size 128M, fsync frequency 1 ****
before this change:
WRITE: bw=85.4MiB/s (89.5MB/s), 85.4MiB/s-85.4MiB/s (89.5MB/s-89.5MB/s), io=64.0GiB (68.7GB), run=767734-767734msec
after this change:
WRITE: bw=141MiB/s (147MB/s), 141MiB/s-141MiB/s (147MB/s-147MB/s), io=64.0GiB (68.7GB), run=466022-466022msec
(+65.1% throughput, -39.3% runtime)
**** 1024 jobs, file size 128M, fsync frequency 1 ****
before this change:
WRITE: bw=115MiB/s (120MB/s), 115MiB/s-115MiB/s (120MB/s-120MB/s), io=128GiB (137GB), run=1143775-1143775msec
after this change:
WRITE: bw=171MiB/s (180MB/s), 171MiB/s-171MiB/s (180MB/s-180MB/s), io=128GiB (137GB), run=764843-764843msec
(+48.7% throughput, -33.1% runtime)
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-15 19:30:43 +08:00
|
|
|
extent_gen = btrfs_file_extent_generation(leaf, extent);
|
2020-02-28 21:04:19 +08:00
|
|
|
comp = btrfs_file_extent_compression(leaf, extent);
|
2020-02-28 21:04:17 +08:00
|
|
|
type = btrfs_file_extent_type(leaf, extent);
|
|
|
|
if (type == BTRFS_FILE_EXTENT_REG ||
|
|
|
|
type == BTRFS_FILE_EXTENT_PREALLOC) {
|
|
|
|
disko = btrfs_file_extent_disk_bytenr(leaf, extent);
|
|
|
|
diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
|
|
|
|
datao = btrfs_file_extent_offset(leaf, extent);
|
|
|
|
datal = btrfs_file_extent_num_bytes(leaf, extent);
|
|
|
|
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
|
|
|
|
/* Take upper bound, may be compressed */
|
|
|
|
datal = btrfs_file_extent_ram_bytes(leaf, extent);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* The first search might have left us at an extent item that
|
|
|
|
* ends before our target range's start, can happen if we have
|
|
|
|
* holes and NO_HOLES feature enabled.
|
|
|
|
*/
|
|
|
|
if (key.offset + datal <= off) {
|
|
|
|
path->slots[0]++;
|
|
|
|
goto process_slot;
|
|
|
|
} else if (key.offset >= off + len) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
next_key_min_offset = key.offset + datal;
|
|
|
|
size = btrfs_item_size_nr(leaf, slot);
|
|
|
|
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
|
|
|
|
size);
|
|
|
|
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
|
|
|
memcpy(&new_key, &key, sizeof(new_key));
|
|
|
|
new_key.objectid = btrfs_ino(BTRFS_I(inode));
|
|
|
|
if (off <= key.offset)
|
|
|
|
new_key.offset = key.offset + destoff - off;
|
|
|
|
else
|
|
|
|
new_key.offset = destoff;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Deal with a hole that doesn't have an extent item that
|
|
|
|
* represents it (NO_HOLES feature enabled).
|
|
|
|
* This hole is either in the middle of the cloning range or at
|
|
|
|
* the beginning (fully overlaps it or partially overlaps it).
|
|
|
|
*/
|
|
|
|
if (new_key.offset != last_dest_end)
|
|
|
|
drop_start = last_dest_end;
|
|
|
|
else
|
|
|
|
drop_start = new_key.offset;
|
|
|
|
|
|
|
|
if (type == BTRFS_FILE_EXTENT_REG ||
|
|
|
|
type == BTRFS_FILE_EXTENT_PREALLOC) {
|
2020-09-08 18:27:22 +08:00
|
|
|
struct btrfs_replace_extent_info clone_info;
|
2020-02-28 21:04:17 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* a | --- range to clone ---| b
|
|
|
|
* | ------------- extent ------------- |
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* Subtract range b */
|
|
|
|
if (key.offset + datal > off + len)
|
|
|
|
datal = off + len - key.offset;
|
|
|
|
|
|
|
|
/* Subtract range a */
|
|
|
|
if (off > key.offset) {
|
|
|
|
datao += off - key.offset;
|
|
|
|
datal -= off - key.offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
clone_info.disk_offset = disko;
|
|
|
|
clone_info.disk_len = diskl;
|
|
|
|
clone_info.data_offset = datao;
|
|
|
|
clone_info.data_len = datal;
|
|
|
|
clone_info.file_offset = new_key.offset;
|
|
|
|
clone_info.extent_buf = buf;
|
btrfs: fix metadata reservation for fallocate that leads to transaction aborts
When doing an fallocate(), especially a zero range operation, we assume
that reserving 3 units of metadata space is enough, that at most we touch
one leaf in subvolume/fs tree for removing existing file extent items and
inserting a new file extent item. This assumption is generally true for
most common use cases. However when we end up needing to remove file extent
items from multiple leaves, we can end up failing with -ENOSPC and abort
the current transaction, turning the filesystem to RO mode. When this
happens a stack trace like the following is dumped in dmesg/syslog:
[ 1500.620934] ------------[ cut here ]------------
[ 1500.620938] BTRFS: Transaction aborted (error -28)
[ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...)
[ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1
[ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
[ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs]
[ 1500.621026] Code: 8b 40 50 f0 48 (...)
[ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286
[ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000
[ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff
[ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001
[ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000
[ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60
[ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000
[ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0
[ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[ 1500.621049] Call Trace:
[ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs]
[ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs]
[ 1500.621108] vfs_fallocate+0x14d/0x290
[ 1500.621112] ksys_fallocate+0x3a/0x70
[ 1500.621117] __x64_sys_fallocate+0x1a/0x20
[ 1500.621120] do_syscall_64+0x33/0x80
[ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 1500.621126] RIP: 0033:0x7fb5b248c477
[ 1500.621128] Code: 89 7c 24 08 (...)
[ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
[ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477
[ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003
[ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000
[ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010
[ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003
[ 1500.621151] irq event stamp: 1026217
[ 1500.621154] hardirqs last enabled at (1026223): [<ffffffffba965570>] console_unlock+0x500/0x5c0
[ 1500.621156] hardirqs last disabled at (1026228): [<ffffffffba9654c7>] console_unlock+0x457/0x5c0
[ 1500.621159] softirqs last enabled at (1022486): [<ffffffffbb6003dc>] __do_softirq+0x3dc/0x606
[ 1500.621161] softirqs last disabled at (1022477): [<ffffffffbb4010b2>] asm_call_on_stack+0x12/0x20
[ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]---
[ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left
When we use fallocate() internally, for reserving an extent for a space
cache, inode cache or relocation, we can't hit this problem since either
there aren't any file extent items to remove from the subvolume tree or
there is at most one.
When using plain fallocate() it's very unlikely, since that would require
having many file extent items representing holes for the target range and
crossing multiple leafs - we attempt to increase the range (merge) of such
file extent items when punching holes, so at most we end up with 2 file
extent items for holes at leaf boundaries.
However when using the zero range operation of fallocate() for a large
range (100+ MiB for example) that's fairly easy to trigger. The following
example reproducer triggers the issue:
$ cat reproducer.sh
#!/bin/bash
umount /dev/sdj &> /dev/null
mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null
mount /dev/sdj /mnt/sdj
# Create a 100M file with many file extent items. Punch a hole every 8K
# just to speedup the file creation - we could do 4K sequential writes
# followed by fsync (or O_SYNC) as well, but that takes a lot of time.
file_size=$((100 * 1024 * 1024))
xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar
for ((i = 0; i < $file_size; i += 8192)); do
xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar
done
# Force a transaction commit, so the zero range operation will be forced
# to COW all metadata extents it need to touch.
sync
xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar
umount /mnt/sdj
$ ./reproducer.sh
wrote 104857600/104857600 bytes at offset 0
100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec)
fallocate: No space left on device
$ dmesg
<shows the same stack trace pasted before>
To fix this use the existing infrastructure that hole punching and
extent cloning use for replacing a file range with another extent. This
deals with doing the removal of file extent items and inserting the new
one using an incremental approach, reserving more space when needed and
always ensuring we don't leave an implicit hole in the range in case
we need to do multiple iterations and a crash happens between iterations.
A test case for fstests will follow up soon.
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-09-08 18:27:20 +08:00
|
|
|
clone_info.is_new_extent = false;
|
2020-09-08 18:27:23 +08:00
|
|
|
ret = btrfs_replace_file_extents(inode, path, drop_start,
|
2020-02-28 21:04:17 +08:00
|
|
|
new_key.offset + datal - 1, &clone_info,
|
|
|
|
&trans);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
|
2020-02-28 21:04:18 +08:00
|
|
|
/*
|
|
|
|
* Inline extents always have to start at file offset 0
|
|
|
|
* and can never be bigger than the sector size. We can
|
|
|
|
* never clone only parts of an inline extent, since all
|
|
|
|
* reflink operations must start at a sector size aligned
|
|
|
|
* offset, and the length must be aligned too or end at
|
|
|
|
* the i_size (which implies the whole inlined data).
|
|
|
|
*/
|
|
|
|
ASSERT(key.offset == 0);
|
|
|
|
ASSERT(datal <= fs_info->sectorsize);
|
|
|
|
if (key.offset != 0 || datal > fs_info->sectorsize)
|
|
|
|
return -EUCLEAN;
|
2020-02-28 21:04:17 +08:00
|
|
|
|
2020-02-28 21:04:19 +08:00
|
|
|
ret = clone_copy_inline_extent(inode, path, &new_key,
|
|
|
|
drop_start, datal, size,
|
|
|
|
comp, buf, &trans);
|
|
|
|
if (ret)
|
2020-02-28 21:04:17 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
btrfs: reduce contention on log trees when logging checksums
The possibility of extents being shared (through clone and deduplication
operations) requires special care when logging data checksums, to avoid
having a log tree with different checksum items that cover ranges which
overlap (which resulted in missing checksums after replaying a log tree).
Such problems were fixed in the past by the following commits:
commit 40e046acbd2f ("Btrfs: fix missing data checksums after replaying a
log tree")
commit e289f03ea79b ("btrfs: fix corrupt log due to concurrent fsync of
inodes with shared extents")
Test case generic/588 exercises the scenario solved by the first commit
(purely sequential and deterministic) while test case generic/457 often
triggered the case fixed by the second commit (not deterministic, requires
specific timings under concurrency).
The problems were addressed by deleting, from the log tree, any existing
checksums before logging the new ones. And also by doing the deletion and
logging of the checksums while locking the checksum range in an extent io
tree (root->log_csum_range), to deal with the case where we have concurrent
fsyncs against files with shared extents.
That however causes more contention on the leaves of a log tree where we
store checksums (and all the nodes in the paths leading to them), even
when we do not have shared extents, or all the shared extents were created
by past transactions. It also adds a bit of contention on the spin lock of
the log_csum_range extent io tree of the log root.
This change adds a 'last_reflink_trans' field to the inode to keep track
of the last transaction where a new extent was shared between inodes
(through clone and deduplication operations). It is updated for both the
source and destination inodes of reflink operations whenever a new extent
(created in the current transaction) becomes shared by the inodes. This
field is kept in memory only, not persisted in the inode item, similar
to other existing fields (last_unlink_trans, logged_trans).
When logging checksums for an extent, if the value of 'last_reflink_trans'
is smaller than the current transaction's generation/id, we skip locking
the extent range and deletion of checksums from the log tree, since we
know we do not have new shared extents. This reduces contention on the
log tree's leaves where checksums are stored.
The following script, which uses fio, was used to measure the impact of
this change:
$ cat test-fsync.sh
#!/bin/bash
DEV=/dev/sdk
MNT=/mnt/sdk
MOUNT_OPTIONS="-o ssd"
MKFS_OPTIONS="-d single -m single"
if [ $# -ne 3 ]; then
echo "Use $0 NUM_JOBS FILE_SIZE FSYNC_FREQ"
exit 1
fi
NUM_JOBS=$1
FILE_SIZE=$2
FSYNC_FREQ=$3
cat <<EOF > /tmp/fio-job.ini
[writers]
rw=write
fsync=$FSYNC_FREQ
fallocate=none
group_reporting=1
direct=0
bs=64k
ioengine=sync
size=$FILE_SIZE
directory=$MNT
numjobs=$NUM_JOBS
EOF
echo "Using config:"
echo
cat /tmp/fio-job.ini
echo
mkfs.btrfs -f $MKFS_OPTIONS $DEV
mount $MOUNT_OPTIONS $DEV $MNT
fio /tmp/fio-job.ini
umount $MNT
The tests were performed for different numbers of jobs, file sizes and
fsync frequency. A qemu VM using kvm was used, with 8 cores (the host has
12 cores, with cpu governance set to performance mode on all cores), 16GiB
of ram (the host has 64GiB) and using a NVMe device directly (without an
intermediary filesystem in the host). While running the tests, the host
was not used for anything else, to avoid disturbing the tests.
The obtained results were the following (the last line of fio's output was
pasted). Starting with 16 jobs is where a significant difference is
observable in this particular setup and hardware (differences highlighted
below). The very small differences for tests with less than 16 jobs are
possibly just noise and random.
**** 1 job, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=23.8MiB/s (24.9MB/s), 23.8MiB/s-23.8MiB/s (24.9MB/s-24.9MB/s), io=1024MiB (1074MB), run=43075-43075msec
after this change:
WRITE: bw=24.4MiB/s (25.6MB/s), 24.4MiB/s-24.4MiB/s (25.6MB/s-25.6MB/s), io=1024MiB (1074MB), run=41938-41938msec
**** 2 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=37.7MiB/s (39.5MB/s), 37.7MiB/s-37.7MiB/s (39.5MB/s-39.5MB/s), io=2048MiB (2147MB), run=54351-54351msec
after this change:
WRITE: bw=37.7MiB/s (39.5MB/s), 37.6MiB/s-37.6MiB/s (39.5MB/s-39.5MB/s), io=2048MiB (2147MB), run=54428-54428msec
**** 4 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=67.5MiB/s (70.8MB/s), 67.5MiB/s-67.5MiB/s (70.8MB/s-70.8MB/s), io=4096MiB (4295MB), run=60669-60669msec
after this change:
WRITE: bw=68.6MiB/s (71.0MB/s), 68.6MiB/s-68.6MiB/s (71.0MB/s-71.0MB/s), io=4096MiB (4295MB), run=59678-59678msec
**** 8 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=128MiB/s (134MB/s), 128MiB/s-128MiB/s (134MB/s-134MB/s), io=8192MiB (8590MB), run=64048-64048msec
after this change:
WRITE: bw=129MiB/s (135MB/s), 129MiB/s-129MiB/s (135MB/s-135MB/s), io=8192MiB (8590MB), run=63405-63405msec
**** 16 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=78.5MiB/s (82.3MB/s), 78.5MiB/s-78.5MiB/s (82.3MB/s-82.3MB/s), io=16.0GiB (17.2GB), run=208676-208676msec
after this change:
WRITE: bw=110MiB/s (115MB/s), 110MiB/s-110MiB/s (115MB/s-115MB/s), io=16.0GiB (17.2GB), run=149295-149295msec
(+40.1% throughput, -28.5% runtime)
**** 32 jobs, file size 1G, fsync frequency 1 ****
before this change:
WRITE: bw=58.8MiB/s (61.7MB/s), 58.8MiB/s-58.8MiB/s (61.7MB/s-61.7MB/s), io=32.0GiB (34.4GB), run=557134-557134msec
after this change:
WRITE: bw=76.1MiB/s (79.8MB/s), 76.1MiB/s-76.1MiB/s (79.8MB/s-79.8MB/s), io=32.0GiB (34.4GB), run=430550-430550msec
(+29.4% throughput, -22.7% runtime)
**** 64 jobs, file size 512M, fsync frequency 1 ****
before this change:
WRITE: bw=65.8MiB/s (68.0MB/s), 65.8MiB/s-65.8MiB/s (68.0MB/s-68.0MB/s), io=32.0GiB (34.4GB), run=498055-498055msec
after this change:
WRITE: bw=85.1MiB/s (89.2MB/s), 85.1MiB/s-85.1MiB/s (89.2MB/s-89.2MB/s), io=32.0GiB (34.4GB), run=385116-385116msec
(+29.3% throughput, -22.7% runtime)
**** 128 jobs, file size 256M, fsync frequency 1 ****
before this change:
WRITE: bw=54.7MiB/s (57.3MB/s), 54.7MiB/s-54.7MiB/s (57.3MB/s-57.3MB/s), io=32.0GiB (34.4GB), run=599373-599373msec
after this change:
WRITE: bw=121MiB/s (126MB/s), 121MiB/s-121MiB/s (126MB/s-126MB/s), io=32.0GiB (34.4GB), run=271907-271907msec
(+121.2% throughput, -54.6% runtime)
**** 256 jobs, file size 256M, fsync frequency 1 ****
before this change:
WRITE: bw=69.2MiB/s (72.5MB/s), 69.2MiB/s-69.2MiB/s (72.5MB/s-72.5MB/s), io=64.0GiB (68.7GB), run=947536-947536msec
after this change:
WRITE: bw=121MiB/s (127MB/s), 121MiB/s-121MiB/s (127MB/s-127MB/s), io=64.0GiB (68.7GB), run=541916-541916msec
(+74.9% throughput, -42.8% runtime)
**** 512 jobs, file size 128M, fsync frequency 1 ****
before this change:
WRITE: bw=85.4MiB/s (89.5MB/s), 85.4MiB/s-85.4MiB/s (89.5MB/s-89.5MB/s), io=64.0GiB (68.7GB), run=767734-767734msec
after this change:
WRITE: bw=141MiB/s (147MB/s), 141MiB/s-141MiB/s (147MB/s-147MB/s), io=64.0GiB (68.7GB), run=466022-466022msec
(+65.1% throughput, -39.3% runtime)
**** 1024 jobs, file size 128M, fsync frequency 1 ****
before this change:
WRITE: bw=115MiB/s (120MB/s), 115MiB/s-115MiB/s (120MB/s-120MB/s), io=128GiB (137GB), run=1143775-1143775msec
after this change:
WRITE: bw=171MiB/s (180MB/s), 171MiB/s-171MiB/s (180MB/s-180MB/s), io=128GiB (137GB), run=764843-764843msec
(+48.7% throughput, -33.1% runtime)
Reviewed-by: Josef Bacik <josef@toxicpanda.com>
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-07-15 19:30:43 +08:00
|
|
|
/*
|
|
|
|
* If this is a new extent update the last_reflink_trans of both
|
|
|
|
* inodes. This is used by fsync to make sure it does not log
|
|
|
|
* multiple checksum items with overlapping ranges. For older
|
|
|
|
* extents we don't need to do it since inode logging skips the
|
|
|
|
* checksums for older extents. Also ignore holes and inline
|
|
|
|
* extents because they don't have checksums in the csum tree.
|
|
|
|
*/
|
|
|
|
if (extent_gen == trans->transid && disko > 0) {
|
|
|
|
BTRFS_I(src)->last_reflink_trans = trans->transid;
|
|
|
|
BTRFS_I(inode)->last_reflink_trans = trans->transid;
|
|
|
|
}
|
|
|
|
|
2020-02-28 21:04:17 +08:00
|
|
|
last_dest_end = ALIGN(new_key.offset + datal,
|
|
|
|
fs_info->sectorsize);
|
|
|
|
ret = clone_finish_inode_update(trans, inode, last_dest_end,
|
|
|
|
destoff, olen, no_time_update);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
if (new_key.offset + datal >= destoff + len)
|
|
|
|
break;
|
|
|
|
|
|
|
|
btrfs_release_path(path);
|
|
|
|
key.offset = next_key_min_offset;
|
|
|
|
|
|
|
|
if (fatal_signal_pending(current)) {
|
|
|
|
ret = -EINTR;
|
|
|
|
goto out;
|
|
|
|
}
|
2020-09-22 16:27:29 +08:00
|
|
|
|
|
|
|
cond_resched();
|
2020-02-28 21:04:17 +08:00
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
|
|
|
|
if (last_dest_end < destoff + len) {
|
|
|
|
/*
|
|
|
|
* We have an implicit hole that fully or partially overlaps our
|
|
|
|
* cloning range at its end. This means that we either have the
|
|
|
|
* NO_HOLES feature enabled or the implicit hole happened due to
|
|
|
|
* mixing buffered and direct IO writes against this file.
|
|
|
|
*/
|
|
|
|
btrfs_release_path(path);
|
|
|
|
|
2020-09-08 18:27:23 +08:00
|
|
|
ret = btrfs_replace_file_extents(inode, path, last_dest_end,
|
2020-02-28 21:04:17 +08:00
|
|
|
destoff + len - 1, NULL, &trans);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = clone_finish_inode_update(trans, inode, destoff + len,
|
|
|
|
destoff, olen, no_time_update);
|
|
|
|
}
|
|
|
|
|
|
|
|
out:
|
|
|
|
btrfs_free_path(path);
|
|
|
|
kvfree(buf);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Unlock a range of @len bytes in the io trees of two inodes.
 *
 * Releases [@loff1, @loff1 + @len - 1] in the io tree of @inode1 and then
 * [@loff2, @loff2 + @len - 1] in the io tree of @inode2.
 */
static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
				       struct inode *inode2, u64 loff2, u64 len)
{
	/* Extent ranges are inclusive, hence the "- 1". */
	const u64 end1 = loff1 + len - 1;
	const u64 end2 = loff2 + len - 1;

	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, end1);
	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, end2);
}
|
|
|
|
|
|
|
|
/*
 * Lock a range of @len bytes in the io trees of two inodes.
 *
 * Before locking, the two (inode, offset) pairs are put into a fixed order:
 * the inode with the higher address goes first and, when both arguments are
 * the same inode, the lower offset goes first. Presumably this is so that
 * concurrent callers always acquire the two locks in the same order.
 */
static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
				     struct inode *inode2, u64 loff2, u64 len)
{
	if (inode1 == inode2) {
		/* Same inode: only the offsets need ordering. */
		if (loff2 < loff1)
			swap(loff1, loff2);
	} else if (inode1 < inode2) {
		/* Different inodes: keep each offset with its inode. */
		swap(inode1, inode2);
		swap(loff1, loff2);
	}

	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
|
|
|
|
|
|
|
|
/*
 * Deduplicate a single range of @len bytes from @src at offset @loff into
 * @dst at offset @dst_loff.
 *
 * Both ranges are extent-locked around the btrfs_clone() call, whose return
 * value is passed back to the caller unchanged.
 */
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
				   struct inode *dst, u64 dst_loff)
{
	const u64 blocksize = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
	/* btrfs_clone() takes both the exact and the block-aligned length. */
	const u64 aligned_len = ALIGN(len, blocksize);
	int ret;

	/*
	 * Lock destination range to serialize with concurrent readpages() and
	 * source range to serialize with relocation.
	 */
	btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
	ret = btrfs_clone(src, dst, loff, len, aligned_len, dst_loff, 1);
	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);

	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Deduplicate @olen bytes from @src at offset @loff into @dst at offset
 * @dst_loff.
 *
 * The request is processed in chunks of at most BTRFS_MAX_DEDUPE_LEN bytes,
 * each one handled by btrfs_extent_same_range(). While the work is in
 * progress the destination root's dedupe_in_progress counter is elevated.
 *
 * Returns -EAGAIN if a send operation is currently using the destination
 * root, the first error reported by btrfs_extent_same_range(), or 0 on
 * success (including the case where @olen is 0 and there is nothing to do).
 */
static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
			     struct inode *dst, u64 dst_loff)
{
	/*
	 * Must start at 0: when @olen is 0 neither the chunk loop nor the tail
	 * call below runs, and ret would otherwise be returned uninitialized.
	 */
	int ret = 0;
	u64 i, tail_len, chunk_count;
	struct btrfs_root *root_dst = BTRFS_I(dst)->root;

	spin_lock(&root_dst->root_item_lock);
	if (root_dst->send_in_progress) {
		btrfs_warn_rl(root_dst->fs_info,
"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
			      root_dst->root_key.objectid,
			      root_dst->send_in_progress);
		spin_unlock(&root_dst->root_item_lock);
		return -EAGAIN;
	}
	root_dst->dedupe_in_progress++;
	spin_unlock(&root_dst->root_item_lock);

	/* Split into full-sized chunks plus a (possibly empty) remainder. */
	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);

	for (i = 0; i < chunk_count; i++) {
		ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
					      dst, dst_loff);
		if (ret)
			goto out;

		loff += BTRFS_MAX_DEDUPE_LEN;
		dst_loff += BTRFS_MAX_DEDUPE_LEN;
	}

	if (tail_len > 0)
		ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
out:
	/* Always drop the counter taken above, on success and on error. */
	spin_lock(&root_dst->root_item_lock);
	root_dst->dedupe_in_progress--;
	spin_unlock(&root_dst->root_item_lock);

	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Clone @olen bytes starting at offset @off of @file_src into @file at
 * offset @destoff.
 *
 * If the source range ends exactly at the source's i_size, the length used
 * for locking and cloning is rounded up to the block size. If @destoff lies
 * beyond the destination's i_size, the destination is first expanded up to
 * @destoff. After cloning, ordered extents in the destination range are
 * waited for and the matching page cache pages are truncated.
 *
 * Returns 0 on success or the first error encountered.
 */
static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
					u64 off, u64 olen, u64 destoff)
{
	struct inode *inode = file_inode(file);
	struct inode *src = file_inode(file_src);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	const u64 blocksize = fs_info->sb->s_blocksize;
	u64 len = olen;
	u64 trunc_start;
	u64 trunc_end;
	int ret;
	int wait_ret;

	/*
	 * VFS's generic_remap_file_range_prep() protects us from cloning the
	 * eof block into the middle of a file, which would result in corruption
	 * if the file size is not blocksize aligned. So we don't need to check
	 * for that case here.
	 */
	if (off + len == src->i_size)
		len = ALIGN(src->i_size, blocksize) - off;

	if (destoff > inode->i_size) {
		const u64 wb_start = ALIGN_DOWN(inode->i_size, blocksize);

		ret = btrfs_cont_expand(inode, inode->i_size, destoff);
		if (ret)
			return ret;
		/*
		 * We may have truncated the last block if the inode's size is
		 * not sector size aligned, so we need to wait for writeback to
		 * complete before proceeding further, otherwise we can race
		 * with cloning and attempt to increment a reference to an
		 * extent that no longer exists (writeback completed right after
		 * we found the previous extent covering eof and before we
		 * attempted to increment its reference count).
		 */
		ret = btrfs_wait_ordered_range(inode, wb_start,
					       destoff - wb_start);
		if (ret)
			return ret;
	}

	/*
	 * Lock destination range to serialize with concurrent readpages() and
	 * source range to serialize with relocation.
	 */
	btrfs_double_extent_lock(src, off, inode, destoff, len);
	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
	btrfs_double_extent_unlock(src, off, inode, destoff, len);

	/*
	 * We may have copied an inline extent into a page of the destination
	 * range, so wait for writeback to complete before truncating pages
	 * from the page cache. This is a rare case.
	 */
	wait_ret = btrfs_wait_ordered_range(inode, destoff, len);
	/* A clone error takes precedence over a writeback wait error. */
	if (!ret)
		ret = wait_ret;

	/*
	 * Truncate page cache pages so that future reads will see the cloned
	 * data immediately and not the previous data.
	 */
	trunc_start = round_down(destoff, PAGE_SIZE);
	trunc_end = round_up(destoff + len, PAGE_SIZE) - 1;
	truncate_inode_pages_range(&inode->i_data, trunc_start, trunc_end);

	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Btrfs specific preparation for a remap (clone or dedupe) request.
 *
 * Validates the request, waits for direct IO and for ordered extents in the
 * affected source and destination ranges, and then hands over to
 * generic_remap_file_range_prep().
 *
 * Returns -EROFS, -EXDEV or -EINVAL when a check done here fails, a negative
 * value when flushing or waiting fails, and otherwise whatever
 * generic_remap_file_range_prep() returns.
 */
static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
				       struct file *file_out, loff_t pos_out,
				       loff_t *len, unsigned int remap_flags)
{
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
	const u64 bs = root_out->fs_info->sb->s_blocksize;
	const bool dedupe = remap_flags & REMAP_FILE_DEDUP;
	const bool same_inode = inode_out == inode_in;
	const u64 wb_start_in = ALIGN_DOWN(pos_in, bs);
	const u64 wb_start_out = ALIGN_DOWN(pos_out, bs);
	u64 wb_len;
	int ret;

	/* These two checks only apply to cloning, not to deduplication. */
	if (!dedupe) {
		if (btrfs_root_readonly(root_out))
			return -EROFS;

		if (file_in->f_path.mnt != file_out->f_path.mnt ||
		    inode_in->i_sb != inode_out->i_sb)
			return -EXDEV;
	}

	/*
	 * Don't make the dst file partly checksummed: both inodes must agree
	 * on the NODATASUM flag.
	 */
	if ((BTRFS_I(inode_in)->flags ^ BTRFS_I(inode_out)->flags) &
	    BTRFS_INODE_NODATASUM)
		return -EINVAL;

	/*
	 * Now that the inodes are locked, we need to start writeback ourselves
	 * and can not rely on the writeback from the VFS's generic helper
	 * generic_remap_file_range_prep() because:
	 *
	 * 1) For compression filemap_fdatawrite_range() must be called twice
	 *    (btrfs_fdatawrite_range() does it for us), and the generic
	 *    helper only calls it once;
	 *
	 * 2) The generic helper only waits for the writeback to complete,
	 *    i.e. for IO to be done, and not for the ordered extents to
	 *    complete. We need to wait for them to complete so that new file
	 *    extent items are in the fs tree.
	 *
	 * A zero length clone request covers everything from the block
	 * containing pos_in up to the end of the source's eof block.
	 */
	if (dedupe || *len != 0)
		wb_len = ALIGN(*len, bs);
	else
		wb_len = ALIGN(inode_in->i_size, bs) - wb_start_in;

	/*
	 * Ranges are not locked here, so wait for ongoing lockless dio writes
	 * first (any in progress could create its ordered extents after we
	 * wait for the existing ordered extents below).
	 */
	inode_dio_wait(inode_in);
	if (!same_inode)
		inode_dio_wait(inode_out);

	/*
	 * Workaround to make sure NOCOW buffered writes reach disk as NOCOW.
	 *
	 * Btrfs' back references do not have a block level granularity, they
	 * work at the whole extent level.
	 * A NOCOW buffered write without data space reserved may not be able
	 * to fall back to CoW due to lack of data space, and thus could cause
	 * data loss.
	 *
	 * The shortcut taken here is to flush the whole inode, so that all
	 * nocow writes reach disk as nocow before the reference of the extent
	 * is increased. Flushing only NOCOW data would be better, but that
	 * needs extra accounting.
	 *
	 * ASYNC_EXTENT does not need to be checked, as async extents will be
	 * CoWed anyway and do not affect the nocow part.
	 */
	ret = filemap_flush(inode_in->i_mapping);
	if (ret < 0)
		return ret;

	ret = btrfs_wait_ordered_range(inode_in, wb_start_in, wb_len);
	if (ret < 0)
		return ret;

	ret = btrfs_wait_ordered_range(inode_out, wb_start_out, wb_len);
	if (ret < 0)
		return ret;

	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
					     len, remap_flags);
}
|
|
|
|
|
|
|
|
/*
 * Remap (clone or deduplicate) a range of @src_file into @dst_file.
 *
 * Only REMAP_FILE_DEDUP and REMAP_FILE_ADVISORY are accepted in
 * @remap_flags; any other bit yields -EINVAL.  Both inodes are locked for the
 * duration of the operation (a single lock when source and destination are
 * the same inode).
 *
 * Returns a negative error, or the value of @len after preparation (the
 * preparation step receives @len by pointer).
 */
loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
		struct file *dst_file, loff_t destoff, loff_t len,
		unsigned int remap_flags)
{
	struct inode *src_inode = file_inode(src_file);
	struct inode *dst_inode = file_inode(dst_file);
	const bool single_inode = dst_inode == src_inode;
	int ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (single_inode)
		inode_lock(src_inode);
	else
		lock_two_nondirectories(src_inode, dst_inode);

	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
					  &len, remap_flags);
	/* Nothing left to do if preparation failed or the length is zero. */
	if (ret >= 0 && len != 0) {
		if (remap_flags & REMAP_FILE_DEDUP)
			ret = btrfs_extent_same(src_inode, off, len, dst_inode,
						destoff);
		else
			ret = btrfs_clone_files(dst_file, src_file, off, len,
						destoff);
	}

	if (single_inode)
		inode_unlock(src_inode);
	else
		unlock_two_nondirectories(src_inode, dst_inode);

	if (ret < 0)
		return ret;

	return len;
}
|