btrfs: offline dedupe

This patch adds an ioctl, BTRFS_IOC_FILE_EXTENT_SAME, which will try to
de-duplicate a list of extents across a range of files.

Internally, the ioctl re-uses code from the clone ioctl. This avoids
rewriting a large chunk of extent handling code.

Userspace passes in an array of (file, offset) pairs along with a length
argument. For each pair, the ioctl will then do a byte-by-byte comparison
of the user data before deduping the extent. The status and the number of
bytes deduped are returned for each operation.

Signed-off-by: Mark Fasheh <mfasheh@suse.de>
Reviewed-by: Zach Brown <zab@redhat.com>
Signed-off-by: Josef Bacik <jbacik@fusionio.com>
Signed-off-by: Chris Mason <chris.mason@fusionio.com>
This commit is contained in:
Mark Fasheh 2013-08-06 11:42:51 -07:00 committed by Chris Mason
parent 4b384318a7
commit 416161db9b
2 changed files with 307 additions and 0 deletions

View File

@ -43,6 +43,7 @@
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/uuid.h> #include <linux/uuid.h>
#include <linux/btrfs.h> #include <linux/btrfs.h>
#include <linux/uaccess.h>
#include "compat.h" #include "compat.h"
#include "ctree.h" #include "ctree.h"
#include "disk-io.h" #include "disk-io.h"
@ -57,6 +58,9 @@
#include "send.h" #include "send.h"
#include "dev-replace.h" #include "dev-replace.h"
/*
 * Forward declaration: btrfs_clone() is documented and defined further down
 * in this file, after btrfs_extent_same(), which calls it.
 */
static int btrfs_clone(struct inode *src, struct inode *inode,
u64 off, u64 olen, u64 olen_aligned, u64 destoff);
/* Mask out flags that are inappropriate for the given type of inode. */ /* Mask out flags that are inappropriate for the given type of inode. */
static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
{ {
@ -2470,6 +2474,34 @@ out:
return ret; return ret;
} }
/*
 * Get the page-cache page of @inode that covers byte offset @off and make
 * sure its contents are up to date, reading it in when they are not.
 *
 * Returns the page after unlock_page() has been called on it, with the
 * reference taken by grab_cache_page() still held; the caller drops that
 * reference with page_cache_release(). Returns NULL on any failure, in
 * which case no reference is left behind.
 */
static struct page *extent_same_get_page(struct inode *inode, u64 off)
{
	struct page *page;
	pgoff_t index;
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;

	index = off >> PAGE_CACHE_SHIFT;

	page = grab_cache_page(inode->i_mapping, index);
	if (!page)
		return NULL;

	if (!PageUptodate(page)) {
		if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
						 0)) {
			/*
			 * Drop the reference taken by grab_cache_page();
			 * returning NULL without doing so would leak it,
			 * since the caller never sees the page.
			 */
			page_cache_release(page);
			return NULL;
		}
		lock_page(page);
		/* The read may have completed with an error. */
		if (!PageUptodate(page)) {
			unlock_page(page);
			page_cache_release(page);
			return NULL;
		}
	}
	unlock_page(page);

	return page;
}
static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
{ {
/* do any pending delalloc/csum calc on src, one way or /* do any pending delalloc/csum calc on src, one way or
@ -2490,6 +2522,251 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
} }
} }
/*
 * Undo btrfs_double_lock(): release the extent range locks
 * [loff, loff + len - 1] and the i_mutex of both inodes.
 *
 * btrfs_double_lock() takes only inode1's range and mutex when both
 * arguments are the same inode, so the second unlock of each kind is
 * skipped in that case to keep lock and unlock balanced.
 */
static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
				struct inode *inode2, u64 loff2, u64 len)
{
	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
	if (inode1 != inode2)
		unlock_extent(&BTRFS_I(inode2)->io_tree, loff2,
			      loff2 + len - 1);

	mutex_unlock(&inode1->i_mutex);
	if (inode1 != inode2)
		mutex_unlock(&inode2->i_mutex);
}
/*
 * Take i_mutex and the extent range lock on two inodes.
 *
 * The inode with the higher address is always locked first (as
 * I_MUTEX_PARENT), the other one second (as I_MUTEX_CHILD), so two tasks
 * locking the same pair always do so in the same order. When both
 * arguments are the same inode, only inode1's mutex and the range starting
 * at loff1 are taken.
 */
static void btrfs_double_lock(struct inode *inode1, u64 loff1,
			      struct inode *inode2, u64 loff2, u64 len)
{
	struct inode *first = inode1;
	struct inode *second = inode2;
	u64 first_off = loff1;
	u64 second_off = loff2;

	/* Order the pair by address: higher address goes first. */
	if (inode1 < inode2) {
		first = inode2;
		first_off = loff2;
		second = inode1;
		second_off = loff1;
	}

	mutex_lock_nested(&first->i_mutex, I_MUTEX_PARENT);
	lock_extent_range(first, first_off, len);

	if (first == second)
		return;

	mutex_lock_nested(&second->i_mutex, I_MUTEX_CHILD);
	lock_extent_range(second, second_off, len);
}
/*
 * Compare @len bytes of @src starting at @loff with @len bytes of @dst
 * starting at @dst_loff, at most one page per iteration.
 *
 * Each comparison starts at the beginning of the mapped pages, so both
 * offsets are expected to be page aligned.
 *
 * Returns 0 when the two ranges are identical, BTRFS_SAME_DATA_DIFFERS as
 * soon as a chunk differs, or -EINVAL when a page of either file could not
 * be obtained.
 */
static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
			  u64 dst_loff, u64 len)
{
	int ret = 0;
	struct page *src_page, *dst_page;
	unsigned int cmp_len = PAGE_CACHE_SIZE;
	void *addr, *dst_addr;

	while (len) {
		/* Final, partial chunk. */
		if (len < PAGE_CACHE_SIZE)
			cmp_len = len;

		src_page = extent_same_get_page(src, loff);
		if (!src_page)
			return -EINVAL;
		dst_page = extent_same_get_page(dst, dst_loff);
		if (!dst_page) {
			page_cache_release(src_page);
			return -EINVAL;
		}
		addr = kmap_atomic(src_page);
		dst_addr = kmap_atomic(dst_page);

		flush_dcache_page(src_page);
		flush_dcache_page(dst_page);

		if (memcmp(addr, dst_addr, cmp_len))
			ret = BTRFS_SAME_DATA_DIFFERS;

		/*
		 * Atomic kmaps nest like a stack: the mapping taken last
		 * must be released first.
		 */
		kunmap_atomic(dst_addr);
		kunmap_atomic(addr);
		page_cache_release(src_page);
		page_cache_release(dst_page);

		if (ret)
			break;

		loff += cmp_len;
		dst_loff += cmp_len;
		len -= cmp_len;
	}

	return ret;
}
/*
 * Validate the byte range [off, off + len) of @inode for extent-same.
 *
 * Returns -EINVAL when the range ends past i_size, when off + len wraps
 * around, or when either end of the range is not aligned to the
 * filesystem block size; returns 0 otherwise.
 */
static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
{
	u64 blocksize = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
	u64 end = off + len;

	/* Past end of file, or the addition overflowed. */
	if (end > inode->i_size || end < off)
		return -EINVAL;

	/* btrfs_clone() requires both ends to be block aligned. */
	if (!IS_ALIGNED(off, blocksize) || !IS_ALIGNED(end, blocksize))
		return -EINVAL;

	return 0;
}
/*
 * Dedupe @len bytes at @loff in @src against @dst_loff in @dst.
 *
 * Both inodes are locked, both ranges are validated, the data is compared
 * and, only when it is identical, the range is handed to btrfs_clone().
 *
 * Returns 0 on success, BTRFS_SAME_DATA_DIFFERS when the data does not
 * match, -EINVAL for the same inode, invalid ranges or mismatching
 * NODATASUM flags, or whatever btrfs_cmp_data()/btrfs_clone() return.
 */
static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
			     struct inode *dst, u64 dst_loff)
{
	int ret;

	/*
	 * btrfs_clone() cannot yet handle extents within the same file.
	 * Once it can, this test can be replaced by one that only rejects
	 * overlapping ranges of the same inode.
	 */
	if (src == dst)
		return -EINVAL;

	btrfs_double_lock(src, loff, dst, dst_loff, len);

	ret = extent_same_check_offsets(src, loff, len);
	if (!ret)
		ret = extent_same_check_offsets(dst, dst_loff, len);
	if (ret)
		goto out_unlock;

	/* Never leave the destination file only partly checksummed. */
	if ((BTRFS_I(src)->flags ^ BTRFS_I(dst)->flags) &
	    BTRFS_INODE_NODATASUM) {
		ret = -EINVAL;
		goto out_unlock;
	}

	ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
	if (!ret)
		ret = btrfs_clone(src, dst, loff, len, len, dst_loff);

out_unlock:
	btrfs_double_unlock(src, loff, dst, dst_loff, len);

	return ret;
}
/* Upper bound on the number of bytes deduped per destination. */
#define BTRFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024)

/*
 * BTRFS_IOC_FILE_EXTENT_SAME: dedupe one source range of @file against a
 * list of destination (fd, offset) pairs supplied by userspace.
 *
 * @file: source file; must be open for reading and be a regular file.
 * @argp: user pointer to a struct btrfs_ioctl_same_args followed by
 *        dest_count struct btrfs_ioctl_same_extent_info entries.
 *
 * The result for each destination is written back to that entry's status
 * and bytes_deduped fields; a failure on one destination does not stop
 * processing of the remaining ones.
 *
 * Returns 0 once every destination has been processed, -EFAULT when user
 * memory cannot be read or written, -EINVAL when @file is not readable or
 * the block size is smaller than a page, -EISDIR/-EACCES when the source
 * is a directory or not a regular file, or the error from
 * mnt_want_write_file().
 */
static long btrfs_ioctl_file_extent_same(struct file *file,
					 void __user *argp)
{
	/* User pointer: only ever passed to the uaccess helpers below. */
	struct btrfs_ioctl_same_args __user *args = argp;
	struct btrfs_ioctl_same_args same;
	struct btrfs_ioctl_same_extent_info info;
	struct inode *src = file->f_dentry->d_inode;
	struct file *dst_file = NULL;
	struct inode *dst;
	u64 off;
	u64 len;
	int i;
	int ret;
	u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
	bool is_admin = capable(CAP_SYS_ADMIN);

	if (!(file->f_mode & FMODE_READ))
		return -EINVAL;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	if (copy_from_user(&same, args, sizeof(same))) {
		ret = -EFAULT;
		goto out;
	}

	off = same.logical_offset;
	len = same.length;

	/*
	 * Limit the total length we will dedupe for each operation.
	 * This is intended to bound the total time spent in this
	 * ioctl to something sane.
	 */
	if (len > BTRFS_MAX_DEDUPE_LEN)
		len = BTRFS_MAX_DEDUPE_LEN;

	if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
		/*
		 * Btrfs does not support blocksize < page_size. As a
		 * result, btrfs_cmp_data() won't correctly handle
		 * this situation without an update.
		 */
		ret = -EINVAL;
		goto out;
	}

	ret = -EISDIR;
	if (S_ISDIR(src->i_mode))
		goto out;

	ret = -EACCES;
	if (!S_ISREG(src->i_mode))
		goto out;

	ret = 0;
	for (i = 0; i < same.dest_count; i++) {
		if (copy_from_user(&info, &args->info[i], sizeof(info))) {
			ret = -EFAULT;
			goto out;
		}

		info.bytes_deduped = 0;

		dst_file = fget(info.fd);
		if (!dst_file) {
			info.status = -EBADF;
			goto next;
		}

		/* Without CAP_SYS_ADMIN the destination must be writable. */
		if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
			info.status = -EINVAL;
			goto next;
		}

		/* Source and destination must share mount and superblock. */
		info.status = -EXDEV;
		if (file->f_path.mnt != dst_file->f_path.mnt)
			goto next;

		dst = dst_file->f_dentry->d_inode;
		if (src->i_sb != dst->i_sb)
			goto next;

		if (S_ISDIR(dst->i_mode)) {
			info.status = -EISDIR;
			goto next;
		}

		if (!S_ISREG(dst->i_mode)) {
			info.status = -EACCES;
			goto next;
		}

		info.status = btrfs_extent_same(src, off, len, dst,
						info.logical_offset);
		if (info.status == 0)
			info.bytes_deduped += len;

next:
		if (dst_file)
			fput(dst_file);

		/* Report the per-destination result back to userspace. */
		if (__put_user_unaligned(info.status, &args->info[i].status) ||
		    __put_user_unaligned(info.bytes_deduped,
					 &args->info[i].bytes_deduped)) {
			ret = -EFAULT;
			goto out;
		}
	}

out:
	mnt_drop_write_file(file);
	return ret;
}
/** /**
* btrfs_clone() - clone a range from inode file to another * btrfs_clone() - clone a range from inode file to another
* *
@ -4242,6 +4519,8 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_get_fslabel(file, argp); return btrfs_ioctl_get_fslabel(file, argp);
case BTRFS_IOC_SET_FSLABEL: case BTRFS_IOC_SET_FSLABEL:
return btrfs_ioctl_set_fslabel(file, argp); return btrfs_ioctl_set_fslabel(file, argp);
case BTRFS_IOC_FILE_EXTENT_SAME:
return btrfs_ioctl_file_extent_same(file, argp);
} }
return -ENOTTY; return -ENOTTY;

View File

@ -305,6 +305,31 @@ struct btrfs_ioctl_clone_range_args {
#define BTRFS_DEFRAG_RANGE_COMPRESS 1 #define BTRFS_DEFRAG_RANGE_COMPRESS 1
#define BTRFS_DEFRAG_RANGE_START_IO 2 #define BTRFS_DEFRAG_RANGE_START_IO 2
/* Positive status value reported when the compared data is not identical. */
#define BTRFS_SAME_DATA_DIFFERS 1
/*
 * For extent-same ioctl: describes one destination of a dedupe request and
 * carries back the result for that destination.
 */
struct btrfs_ioctl_same_extent_info {
__s64 fd; /* in - destination file */
__u64 logical_offset; /* in - start of extent in destination */
__u64 bytes_deduped; /* out - total # of bytes we were able
* to dedupe from this file */
/* status of this dedupe operation:
* 0 if dedup succeeds
* < 0 for error
* == BTRFS_SAME_DATA_DIFFERS if data differs
*/
__s32 status; /* out - see above description */
__u32 reserved;
};
/*
 * Argument of BTRFS_IOC_FILE_EXTENT_SAME: the source range, followed by
 * dest_count destination descriptors in the trailing info[] array.
 */
struct btrfs_ioctl_same_args {
__u64 logical_offset; /* in - start of extent in source */
__u64 length; /* in - length of extent */
__u16 dest_count; /* in - total elements in info array */
__u16 reserved1;
__u32 reserved2;
/* in/out - one entry per destination, dest_count entries in total */
struct btrfs_ioctl_same_extent_info info[0];
};
struct btrfs_ioctl_space_info { struct btrfs_ioctl_space_info {
__u64 flags; __u64 flags;
__u64 total_bytes; __u64 total_bytes;
@ -579,4 +604,7 @@ static inline char *btrfs_err_str(enum btrfs_err_code err_code)
struct btrfs_ioctl_get_dev_stats) struct btrfs_ioctl_get_dev_stats)
#define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \
struct btrfs_ioctl_dev_replace_args) struct btrfs_ioctl_dev_replace_args)
#define BTRFS_IOC_FILE_EXTENT_SAME _IOWR(BTRFS_IOCTL_MAGIC, 54, \
struct btrfs_ioctl_same_args)
#endif /* _UAPI_LINUX_BTRFS_H */ #endif /* _UAPI_LINUX_BTRFS_H */