ceph: re-org copy_file_range and fix some error paths

This patch re-organizes copy_file_range, trying to fix a few issues in the
error handling.  Here's the summary:

- Abort copy if initial do_splice_direct() returns fewer bytes than
  requested.

- Move the 'size' initialization (with i_size_read()) further down in the
  code, after the initial call to do_splice_direct().  This avoids issues
  with a possibly stale value if a manual copy is done.

- Move the object copy loop into a separate function.  This makes it
  easier to handle errors (e.g., dirtying caps and updating the MDS
  metadata if only some objects have been copied before an error has
  occurred).

- Added calls to ceph_oloc_destroy() to avoid leaking memory via src_oloc
  and dst_oloc.

- After the object copy loop, the new file size to be reported to the MDS
  (if there's a file size change) is now the actual file size, and not the
  size after a possible extra manual copy.

- Added a few dout() calls to show the number of bytes copied in the two
  manual copies and in the object copy loop.

Signed-off-by: Luis Henriques <lhenriques@suse.com>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
This commit is contained in:
Luis Henriques 2020-02-24 13:44:32 +00:00 committed by Ilya Dryomov
parent 058daab79d
commit 1b0c3b9f91
1 changed files with 100 additions and 73 deletions

View File

@ -1944,6 +1944,71 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode,
return 0; return 0;
} }
/*
 * Copy whole objects from the source inode to the destination inode using
 * ceph_osdc_copy_from(), one object at a time.
 *
 * @src_ci:  source inode info; its layout supplies the object size used
 *           for every step of the loop
 * @src_off: in/out source file offset, advanced by one object size after
 *           each successful object copy
 * @dst_ci:  destination inode info
 * @dst_off: in/out destination file offset, advanced in lockstep with
 *           @src_off
 * @fsc:     fs client whose OSD client issues the copy requests
 * @len:     number of bytes requested; only whole multiples of the object
 *           size are handled here, any remainder is left to the caller
 * @flags:   currently unused
 *
 * Returns the number of bytes copied.  If the very first object copy fails,
 * the (negative) error from ceph_osdc_copy_from() is returned instead; if a
 * later copy fails, the byte count copied so far is returned and the error
 * code is only logged.  Returns 0 when @len is smaller than one object.
 *
 * NOTE(review): only the source layout's object_size is consulted, so this
 * presumably relies on the caller having verified that both files use
 * compatible layouts -- confirm against the caller.
 */
static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off,
				    struct ceph_inode_info *dst_ci, u64 *dst_off,
				    struct ceph_fs_client *fsc,
				    size_t len, unsigned int flags)
{
	struct ceph_object_locator src_oloc, dst_oloc;
	struct ceph_object_id src_oid, dst_oid;
	/*
	 * Signed on purpose: this holds either a byte count or a negative
	 * error code, and is returned as ssize_t.
	 */
	ssize_t bytes = 0;
	u64 src_objnum, src_objoff, dst_objnum, dst_objoff;
	u32 src_objlen, dst_objlen;
	u32 object_size = src_ci->i_layout.object_size;
	int ret;

	/* Both locators are released at 'out' via ceph_oloc_destroy() */
	src_oloc.pool = src_ci->i_layout.pool_id;
	src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
	dst_oloc.pool = dst_ci->i_layout.pool_id;
	dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);

	while (len >= object_size) {
		/* Only the object numbers are used below to build the oids */
		ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off,
					      object_size, &src_objnum,
					      &src_objoff, &src_objlen);
		ceph_calc_file_object_mapping(&dst_ci->i_layout, *dst_off,
					      object_size, &dst_objnum,
					      &dst_objoff, &dst_objlen);
		ceph_oid_init(&src_oid);
		ceph_oid_printf(&src_oid, "%llx.%08llx",
				src_ci->i_vino.ino, src_objnum);
		ceph_oid_init(&dst_oid);
		ceph_oid_printf(&dst_oid, "%llx.%08llx",
				dst_ci->i_vino.ino, dst_objnum);
		/* Do an object remote copy */
		ret = ceph_osdc_copy_from(&fsc->client->osdc,
					  src_ci->i_vino.snap, 0,
					  &src_oid, &src_oloc,
					  CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
					  CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
					  &dst_oid, &dst_oloc,
					  CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
					  CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
					  dst_ci->i_truncate_seq,
					  dst_ci->i_truncate_size,
					  CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
		if (ret) {
			if (ret == -EOPNOTSUPP) {
				/* Remember the failure in the client state */
				fsc->have_copy_from2 = false;
				pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
			}
			dout("ceph_osdc_copy_from returned %d\n", ret);
			/*
			 * Report the error only if nothing was copied;
			 * otherwise report the partial progress so the
			 * caller can account for the objects already copied.
			 */
			if (!bytes)
				bytes = ret;
			goto out;
		}
		len -= object_size;
		bytes += object_size;
		*src_off += object_size;
		*dst_off += object_size;
	}

out:
	ceph_oloc_destroy(&src_oloc);
	ceph_oloc_destroy(&dst_oloc);
	return bytes;
}
static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off, static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct file *dst_file, loff_t dst_off, struct file *dst_file, loff_t dst_off,
size_t len, unsigned int flags) size_t len, unsigned int flags)
@ -1954,14 +2019,11 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
struct ceph_inode_info *dst_ci = ceph_inode(dst_inode); struct ceph_inode_info *dst_ci = ceph_inode(dst_inode);
struct ceph_cap_flush *prealloc_cf; struct ceph_cap_flush *prealloc_cf;
struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode); struct ceph_fs_client *src_fsc = ceph_inode_to_client(src_inode);
struct ceph_object_locator src_oloc, dst_oloc; loff_t size;
struct ceph_object_id src_oid, dst_oid; ssize_t ret = -EIO, bytes;
loff_t endoff = 0, size;
ssize_t ret = -EIO;
u64 src_objnum, dst_objnum, src_objoff, dst_objoff; u64 src_objnum, dst_objnum, src_objoff, dst_objoff;
u32 src_objlen, dst_objlen, object_size; u32 src_objlen, dst_objlen;
int src_got = 0, dst_got = 0, err, dirty; int src_got = 0, dst_got = 0, err, dirty;
bool do_final_copy = false;
if (src_inode->i_sb != dst_inode->i_sb) { if (src_inode->i_sb != dst_inode->i_sb) {
struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode); struct ceph_fs_client *dst_fsc = ceph_inode_to_client(dst_inode);
@ -2039,22 +2101,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (ret < 0) if (ret < 0)
goto out_caps; goto out_caps;
size = i_size_read(dst_inode);
endoff = dst_off + len;
/* Drop dst file cached pages */ /* Drop dst file cached pages */
ret = invalidate_inode_pages2_range(dst_inode->i_mapping, ret = invalidate_inode_pages2_range(dst_inode->i_mapping,
dst_off >> PAGE_SHIFT, dst_off >> PAGE_SHIFT,
endoff >> PAGE_SHIFT); (dst_off + len) >> PAGE_SHIFT);
if (ret < 0) { if (ret < 0) {
dout("Failed to invalidate inode pages (%zd)\n", ret); dout("Failed to invalidate inode pages (%zd)\n", ret);
ret = 0; /* XXX */ ret = 0; /* XXX */
} }
src_oloc.pool = src_ci->i_layout.pool_id;
src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns);
dst_oloc.pool = dst_ci->i_layout.pool_id;
dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns);
ceph_calc_file_object_mapping(&src_ci->i_layout, src_off, ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
src_ci->i_layout.object_size, src_ci->i_layout.object_size,
&src_objnum, &src_objoff, &src_objlen); &src_objnum, &src_objoff, &src_objlen);
@ -2073,6 +2127,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
* starting at the src_off * starting at the src_off
*/ */
if (src_objoff) { if (src_objoff) {
dout("Initial partial copy of %u bytes\n", src_objlen);
/* /*
* we need to temporarily drop all caps as we'll be calling * we need to temporarily drop all caps as we'll be calling
* {read,write}_iter, which will get caps again. * {read,write}_iter, which will get caps again.
@ -2080,8 +2136,9 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
ret = do_splice_direct(src_file, &src_off, dst_file, ret = do_splice_direct(src_file, &src_off, dst_file,
&dst_off, src_objlen, flags); &dst_off, src_objlen, flags);
if (ret < 0) { /* Abort on short copies or on error */
dout("do_splice_direct returned %d\n", err); if (ret < src_objlen) {
dout("Failed partial copy (%zd)\n", ret);
goto out; goto out;
} }
len -= ret; len -= ret;
@ -2094,62 +2151,29 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (err < 0) if (err < 0)
goto out_caps; goto out_caps;
} }
object_size = src_ci->i_layout.object_size;
while (len >= object_size) {
ceph_calc_file_object_mapping(&src_ci->i_layout, src_off,
object_size, &src_objnum,
&src_objoff, &src_objlen);
ceph_calc_file_object_mapping(&dst_ci->i_layout, dst_off,
object_size, &dst_objnum,
&dst_objoff, &dst_objlen);
ceph_oid_init(&src_oid);
ceph_oid_printf(&src_oid, "%llx.%08llx",
src_ci->i_vino.ino, src_objnum);
ceph_oid_init(&dst_oid);
ceph_oid_printf(&dst_oid, "%llx.%08llx",
dst_ci->i_vino.ino, dst_objnum);
/* Do an object remote copy */
err = ceph_osdc_copy_from(
&src_fsc->client->osdc,
src_ci->i_vino.snap, 0,
&src_oid, &src_oloc,
CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
&dst_oid, &dst_oloc,
CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
dst_ci->i_truncate_seq, dst_ci->i_truncate_size,
CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
if (err) {
if (err == -EOPNOTSUPP) {
src_fsc->have_copy_from2 = false;
pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
}
dout("ceph_osdc_copy_from returned %d\n", err);
if (!ret)
ret = err;
goto out_caps;
}
len -= object_size;
src_off += object_size;
dst_off += object_size;
ret += object_size;
}
if (len) size = i_size_read(dst_inode);
/* We still need one final local copy */ bytes = ceph_do_objects_copy(src_ci, &src_off, dst_ci, &dst_off,
do_final_copy = true; src_fsc, len, flags);
if (bytes <= 0) {
if (!ret)
ret = bytes;
goto out_caps;
}
dout("Copied %zu bytes out of %zu\n", bytes, len);
len -= bytes;
ret += bytes;
file_update_time(dst_file); file_update_time(dst_file);
inode_inc_iversion_raw(dst_inode); inode_inc_iversion_raw(dst_inode);
if (endoff > size) { if (dst_off > size) {
int caps_flags = 0; int caps_flags = 0;
/* Let the MDS know about dst file size change */ /* Let the MDS know about dst file size change */
if (ceph_quota_is_max_bytes_approaching(dst_inode, endoff)) if (ceph_quota_is_max_bytes_approaching(dst_inode, dst_off))
caps_flags |= CHECK_CAPS_NODELAY; caps_flags |= CHECK_CAPS_NODELAY;
if (ceph_inode_set_size(dst_inode, endoff)) if (ceph_inode_set_size(dst_inode, dst_off))
caps_flags |= CHECK_CAPS_AUTHONLY; caps_flags |= CHECK_CAPS_AUTHONLY;
if (caps_flags) if (caps_flags)
ceph_check_caps(dst_ci, caps_flags, NULL); ceph_check_caps(dst_ci, caps_flags, NULL);
@ -2165,15 +2189,18 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
out_caps: out_caps:
put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got); put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
if (do_final_copy) { /*
err = do_splice_direct(src_file, &src_off, dst_file, * Do the final manual copy if we still have some bytes left, unless
&dst_off, len, flags); * there were errors in remote object copies (len >= object_size).
if (err < 0) { */
dout("do_splice_direct returned %d\n", err); if (len && (len < src_ci->i_layout.object_size)) {
goto out; dout("Final partial copy of %zu bytes\n", len);
} bytes = do_splice_direct(src_file, &src_off, dst_file,
len -= err; &dst_off, len, flags);
ret += err; if (bytes > 0)
ret += bytes;
else
dout("Failed partial copy (%zd)\n", bytes);
} }
out: out: