From f017754d6919f3bd9786710b07b4708293594f20 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Wed, 20 Jun 2018 15:42:55 +0800 Subject: [PATCH 01/40] ceph: add retry logic for error -ERANGE in ceph_get_acl() When the size of acl extended attribution is larger than pre-allocated value buffer size, we will hit error '-ERANGE' and it's probabaly caused by concurrent get/set acl from different clients. In this case, current logic just sets acl to NULL so that we cannot get proper information but the operation looks successful. This patch adds retry logic for error -ERANGE and return -EIO if fail from the retry. Additionally, print real errno when failing from __ceph_getxattr(). Signed-off-by: Chengguang Xu Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/acl.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 59cb307b15fb..eb9d567c9ef8 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -45,6 +45,7 @@ static inline void ceph_set_cached_acl(struct inode *inode, struct posix_acl *ceph_get_acl(struct inode *inode, int type) { int size; + unsigned int retry_cnt = 0; const char *name; char *value = NULL; struct posix_acl *acl; @@ -60,6 +61,7 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type) BUG(); } +retry: size = __ceph_getxattr(inode, name, "", 0); if (size > 0) { value = kzalloc(size, GFP_NOFS); @@ -68,12 +70,22 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type) size = __ceph_getxattr(inode, name, value, size); } - if (size > 0) + if (size == -ERANGE && retry_cnt < 10) { + retry_cnt++; + kfree(value); + value = NULL; + goto retry; + } + + if (size > 0) { acl = posix_acl_from_xattr(&init_user_ns, value, size); - else if (size == -ERANGE || size == -ENODATA || size == 0) + } else if (size == -ENODATA || size == 0) { acl = NULL; - else + } else { + pr_err_ratelimited("get acl %llx.%llx failed, err=%d\n", + ceph_vinop(inode), size); acl = ERR_PTR(-EIO); + } kfree(value); From 93d35c754d97a57bda2a2c6c39dce9b67a9f3c99 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Fri, 22 Jun 2018 15:01:13 +0800 Subject: [PATCH 02/40] ceph: restore ctime as well in the case of restoring old mode It's better to restore ctime as well in the case of restoring old mode in ceph_set_acl(). Signed-off-by: Chengguang Xu Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/acl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index eb9d567c9ef8..3351ea14390b 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -101,6 +101,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) const char *name = NULL; char *value = NULL; struct iattr newattrs; + struct timespec64 old_ctime = inode->i_ctime; umode_t new_mode = inode->i_mode, old_mode = inode->i_mode; switch (type) { @@ -145,7 +146,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (new_mode != old_mode) { newattrs.ia_ctime = current_time(inode); newattrs.ia_mode = new_mode; - newattrs.ia_valid = ATTR_MODE; + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; ret = __ceph_setattr(inode, &newattrs); if (ret) goto out_free; @@ -154,8 +155,9 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) ret = __ceph_setxattr(inode, name, value, size, 0); if (ret) { if (new_mode != old_mode) { + newattrs.ia_ctime = old_ctime; newattrs.ia_mode = old_mode; - newattrs.ia_valid = ATTR_MODE; + newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; __ceph_setattr(inode, &newattrs); } goto out_free; From 6d54228fd1f293d00576ab2c3d2e4992c7cce12f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 25 Jun 2018 17:26:55 +0200 Subject: [PATCH 03/40] libceph: make ceph_osdc_notify{,_ack}() payload_len u32 The wire format dictates that payload_len fits into 4 bytes. Take u32 instead of size_t to reflect that. All callers pass a small integer, so no changes required. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 4 ++-- net/ceph/osd_client.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 0d6ee04b4c41..d6fe7e4df8cf 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -528,12 +528,12 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, u64 notify_id, u64 cookie, void *payload, - size_t payload_len); + u32 payload_len); int ceph_osdc_notify(struct ceph_osd_client *osdc, struct ceph_object_id *oid, struct ceph_object_locator *oloc, void *payload, - size_t payload_len, + u32 payload_len, u32 timeout, struct page ***preply_pages, size_t *preply_len); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index a00c74f1154e..93614cb5e03f 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -4591,7 +4591,7 @@ EXPORT_SYMBOL(ceph_osdc_unwatch); static int osd_req_op_notify_ack_init(struct ceph_osd_request *req, int which, u64 notify_id, u64 cookie, void *payload, - size_t payload_len) + u32 payload_len) { struct ceph_osd_req_op *op; struct ceph_pagelist *pl; @@ -4628,7 +4628,7 @@ int ceph_osdc_notify_ack(struct ceph_osd_client *osdc, u64 notify_id, u64 cookie, void *payload, - size_t payload_len) + u32 payload_len) { struct ceph_osd_request *req; int ret; @@ -4661,7 +4661,7 @@ EXPORT_SYMBOL(ceph_osdc_notify_ack); static int osd_req_op_notify_init(struct ceph_osd_request *req, int which, u64 cookie, u32 prot_ver, u32 timeout, - void *payload, size_t payload_len) + void *payload, u32 payload_len) { struct ceph_osd_req_op *op; struct ceph_pagelist *pl; @@ -4701,7 +4701,7 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc, struct ceph_object_id *oid, struct ceph_object_locator *oloc, void *payload, - size_t payload_len, + u32 payload_len, u32 timeout, struct page ***preply_pages, size_t *preply_len) From c9ed51c9123ab5e8f79b7d53a9afd786b43d4fe6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 27 Jun 2018 16:42:51 +0200 Subject: [PATCH 04/40] libceph: change ceph_pagelist_encode_string() to take u32 The wire format dictates that the length of string fits into 4 bytes. Take u32 instead of size_t to reflect that. We were already truncating len in ceph_pagelist_encode_32() -- this just pushes that truncation one level up. Signed-off-by: Ilya Dryomov --- include/linux/ceph/pagelist.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h index 7edcded07641..d0223364349f 100644 --- a/include/linux/ceph/pagelist.h +++ b/include/linux/ceph/pagelist.h @@ -68,7 +68,7 @@ static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v) return ceph_pagelist_append(pl, &v, 1); } static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl, - char *s, size_t len) + char *s, u32 len) { int ret = ceph_pagelist_encode_32(pl, len); if (ret) From 17173c82e3daa28742bf7732520e9bc2e0cb977d Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Wed, 27 Jun 2018 21:05:08 +0800 Subject: [PATCH 05/40] libceph: stop parsing when a bad int arg is detected There is no reason to continue option parsing after detecting bad option. [ Return match_int() errors from ceph_parse_options() to match the behaviour of parse_rbd_opts_token() and parse_fsopt_token(). ] Signed-off-by: Chengguang Xu Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/ceph_common.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 584fdbef2088..1677a6132034 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -379,7 +379,7 @@ ceph_parse_options(char *options, const char *dev_name, /* parse mount options */ while ((c = strsep(&options, ",")) != NULL) { - int token, intval, ret; + int token, intval; if (!*c) continue; err = -EINVAL; @@ -394,11 +394,11 @@ ceph_parse_options(char *options, const char *dev_name, continue; } if (token < Opt_last_int) { - ret = match_int(&argstr[0], &intval); - if (ret < 0) { + err = match_int(&argstr[0], &intval); + if (err < 0) { pr_err("bad mount option arg (not int) " "at '%s'\n", c); - continue; + goto out; } dout("got int token %d val %d\n", token, intval); } else if (token > Opt_last_int && token < Opt_last_string) { From 2f56b6bae73b2d65ef4816ca89341facc53d3361 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 27 Jun 2018 16:38:13 +0200 Subject: [PATCH 06/40] libceph: amend "bad option arg" error message Don't mention "mount" -- in the rbd case it is "mapping". Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 +- fs/ceph/super.c | 3 +-- net/ceph/ceph_common.c | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index fa0729c1e776..4e8949b88b05 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -786,7 +786,7 @@ static int parse_rbd_opts_token(char *c, void *private) if (token < Opt_last_int) { ret = match_int(&argstr[0], &intval); if (ret < 0) { - pr_err("bad mount option arg (not int) at '%s'\n", c); + pr_err("bad option arg (not int) at '%s'\n", c); return ret; } dout("got int token %d val %d\n", token, intval); diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 95a3b3ac9b6e..3d8a26b2944f 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -219,8 +219,7 @@ static int parse_fsopt_token(char *c, void *private) if (token < Opt_last_int) { ret = match_int(&argstr[0], &intval); if (ret < 0) { - pr_err("bad mount option arg (not int) " - "at '%s'\n", c); + pr_err("bad option arg (not int) at '%s'\n", c); return ret; } dout("got int token %d val %d\n", token, intval); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 1677a6132034..6ebd7d953ee0 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -396,8 +396,7 @@ ceph_parse_options(char *options, const char *dev_name, if (token < Opt_last_int) { err = match_int(&argstr[0], &intval); if (err < 0) { - pr_err("bad mount option arg (not int) " - "at '%s'\n", c); + pr_err("bad option arg (not int) at '%s'\n", c); goto out; } dout("got int token %d val %d\n", token, intval); From c300156bc734796e251fa31b07dff2af2f572889 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 3 Jul 2018 15:28:43 +0200 Subject: [PATCH 07/40] rbd: pass rbd_spec into parse_rbd_opts_token() In preparation for _pool_ns client option, make rbd_spec available inside parse_rbd_opts_token(). Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 70 ++++++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 33 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 4e8949b88b05..df3fb58720c0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -776,9 +776,14 @@ struct rbd_options { #define RBD_EXCLUSIVE_DEFAULT false #define RBD_TRIM_DEFAULT true +struct parse_rbd_opts_ctx { + struct rbd_spec *spec; + struct rbd_options *opts; +}; + static int parse_rbd_opts_token(char *c, void *private) { - struct rbd_options *rbd_opts = private; + struct parse_rbd_opts_ctx *pctx = private; substring_t argstr[MAX_OPT_ARGS]; int token, intval, ret; @@ -802,7 +807,7 @@ static int parse_rbd_opts_token(char *c, void *private) pr_err("queue_depth out of range\n"); return -EINVAL; } - rbd_opts->queue_depth = intval; + pctx->opts->queue_depth = intval; break; case Opt_lock_timeout: /* 0 is "wait forever" (i.e. infinite timeout) */ @@ -810,22 +815,22 @@ static int parse_rbd_opts_token(char *c, void *private) pr_err("lock_timeout out of range\n"); return -EINVAL; } - rbd_opts->lock_timeout = msecs_to_jiffies(intval * 1000); + pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000); break; case Opt_read_only: - rbd_opts->read_only = true; + pctx->opts->read_only = true; break; case Opt_read_write: - rbd_opts->read_only = false; + pctx->opts->read_only = false; break; case Opt_lock_on_read: - rbd_opts->lock_on_read = true; + pctx->opts->lock_on_read = true; break; case Opt_exclusive: - rbd_opts->exclusive = true; + pctx->opts->exclusive = true; break; case Opt_notrim: - rbd_opts->trim = false; + pctx->opts->trim = false; break; default: /* libceph prints "bad option" msg */ @@ -5146,8 +5151,7 @@ static int rbd_add_parse_args(const char *buf, const char *mon_addrs; char *snap_name; size_t mon_addrs_size; - struct rbd_spec *spec = NULL; - struct rbd_options *rbd_opts = NULL; + struct parse_rbd_opts_ctx pctx = { 0 }; struct ceph_options *copts; int ret; @@ -5171,22 +5175,22 @@ static int rbd_add_parse_args(const char *buf, goto out_err; } - spec = rbd_spec_alloc(); - if (!spec) + pctx.spec = rbd_spec_alloc(); + if (!pctx.spec) goto out_mem; - spec->pool_name = dup_token(&buf, NULL); - if (!spec->pool_name) + pctx.spec->pool_name = dup_token(&buf, NULL); + if (!pctx.spec->pool_name) goto out_mem; - if (!*spec->pool_name) { + if (!*pctx.spec->pool_name) { rbd_warn(NULL, "no pool name provided"); goto out_err; } - spec->image_name = dup_token(&buf, NULL); - if (!spec->image_name) + pctx.spec->image_name = dup_token(&buf, NULL); + if (!pctx.spec->image_name) goto out_mem; - if (!*spec->image_name) { + if (!*pctx.spec->image_name) { rbd_warn(NULL, "no image name provided"); goto out_err; } @@ -5207,24 +5211,24 @@ static int rbd_add_parse_args(const char *buf, if (!snap_name) goto out_mem; *(snap_name + len) = '\0'; - spec->snap_name = snap_name; + pctx.spec->snap_name = snap_name; /* Initialize all rbd options to the defaults */ - rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL); - if (!rbd_opts) + pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL); + if (!pctx.opts) goto out_mem; - rbd_opts->read_only = RBD_READ_ONLY_DEFAULT; - rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; - rbd_opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT; - rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT; - rbd_opts->exclusive = RBD_EXCLUSIVE_DEFAULT; - rbd_opts->trim = RBD_TRIM_DEFAULT; + pctx.opts->read_only = RBD_READ_ONLY_DEFAULT; + pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT; + pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT; + pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT; + pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT; + pctx.opts->trim = RBD_TRIM_DEFAULT; copts = ceph_parse_options(options, mon_addrs, - mon_addrs + mon_addrs_size - 1, - parse_rbd_opts_token, rbd_opts); + mon_addrs + mon_addrs_size - 1, + parse_rbd_opts_token, &pctx); if (IS_ERR(copts)) { ret = PTR_ERR(copts); goto out_err; @@ -5232,15 +5236,15 @@ static int rbd_add_parse_args(const char *buf, kfree(options); *ceph_opts = copts; - *opts = rbd_opts; - *rbd_spec = spec; + *opts = pctx.opts; + *rbd_spec = pctx.spec; return 0; out_mem: ret = -ENOMEM; out_err: - kfree(rbd_opts); - rbd_spec_put(spec); + kfree(pctx.opts); + rbd_spec_put(pctx.spec); kfree(options); return ret; From b26c047b940003295d3896b7f633a66aab95bebd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 3 Jul 2018 15:28:43 +0200 Subject: [PATCH 08/40] rbd: support for images within namespaces Cloning across namespaces isn't supported yet -- for now both the parent and the clone have to live in the same namespace, whether the default (i.e. "") or a user-created namespace. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 51 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index df3fb58720c0..c680de15fae0 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -181,6 +181,7 @@ struct rbd_image_header { struct rbd_spec { u64 pool_id; const char *pool_name; + const char *pool_ns; /* NULL if default, never "" */ const char *image_id; const char *image_name; @@ -735,6 +736,7 @@ enum { Opt_lock_timeout, Opt_last_int, /* int args above */ + Opt_pool_ns, Opt_last_string, /* string args above */ Opt_read_only, @@ -749,6 +751,7 @@ static match_table_t rbd_opts_tokens = { {Opt_queue_depth, "queue_depth=%d"}, {Opt_lock_timeout, "lock_timeout=%d"}, /* int args above */ + {Opt_pool_ns, "_pool_ns=%s"}, /* string args above */ {Opt_read_only, "read_only"}, {Opt_read_only, "ro"}, /* Alternate spelling */ @@ -817,6 +820,12 @@ static int parse_rbd_opts_token(char *c, void *private) } pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000); break; + case Opt_pool_ns: + kfree(pctx->spec->pool_ns); + pctx->spec->pool_ns = match_strdup(argstr); + if (!pctx->spec->pool_ns) + return -ENOMEM; + break; case Opt_read_only: pctx->opts->read_only = true; break; @@ -1480,7 +1489,13 @@ rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops) req->r_callback = rbd_osd_req_callback; req->r_priv = obj_req; + /* + * Data objects may be stored in a separate pool, but always in + * the same namespace in that pool as the header in its pool. + */ + ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc); req->r_base_oloc.pool = rbd_dev->layout.pool_id; + if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format, rbd_dev->header.object_prefix, obj_req->ex.oe_objno)) goto err_req; @@ -4124,6 +4139,14 @@ static ssize_t rbd_pool_id_show(struct device *dev, (unsigned long long) rbd_dev->spec->pool_id); } +static ssize_t rbd_pool_ns_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: ""); +} + static ssize_t rbd_name_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -4222,6 +4245,7 @@ static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL); static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL); static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL); static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL); +static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL); static DEVICE_ATTR(name, 0444, rbd_name_show, NULL); static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL); static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh); @@ -4240,6 +4264,7 @@ static struct attribute *rbd_attrs[] = { &dev_attr_config_info.attr, &dev_attr_pool.attr, &dev_attr_pool_id.attr, + &dev_attr_pool_ns.attr, &dev_attr_name.attr, &dev_attr_image_id.attr, &dev_attr_current_snap.attr, @@ -4300,6 +4325,7 @@ static void rbd_spec_free(struct kref *kref) struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref); kfree(spec->pool_name); + kfree(spec->pool_ns); kfree(spec->image_id); kfree(spec->image_name); kfree(spec->snap_name); @@ -4358,6 +4384,12 @@ static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc, rbd_dev->header.data_pool_id = CEPH_NOPOOL; ceph_oid_init(&rbd_dev->header_oid); rbd_dev->header_oloc.pool = spec->pool_id; + if (spec->pool_ns) { + WARN_ON(!*spec->pool_ns); + rbd_dev->header_oloc.pool_ns = + ceph_find_or_create_string(spec->pool_ns, + strlen(spec->pool_ns)); + } mutex_init(&rbd_dev->watch_mutex); rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED; @@ -4638,6 +4670,17 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) parent_spec->pool_id = pool_id; parent_spec->image_id = image_id; parent_spec->snap_id = snap_id; + + /* TODO: support cloning across namespaces */ + if (rbd_dev->spec->pool_ns) { + parent_spec->pool_ns = kstrdup(rbd_dev->spec->pool_ns, + GFP_KERNEL); + if (!parent_spec->pool_ns) { + ret = -ENOMEM; + goto out_err; + } + } + rbd_dev->parent_spec = parent_spec; parent_spec = NULL; /* rbd_dev now owns this */ } else { @@ -5590,8 +5633,10 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) ret = rbd_register_watch(rbd_dev); if (ret) { if (ret == -ENOENT) - pr_info("image %s/%s does not exist\n", + pr_info("image %s/%s%s%s does not exist\n", rbd_dev->spec->pool_name, + rbd_dev->spec->pool_ns ?: "", + rbd_dev->spec->pool_ns ? "/" : "", rbd_dev->spec->image_name); goto err_out_format; } @@ -5613,8 +5658,10 @@ static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth) ret = rbd_spec_fill_names(rbd_dev); if (ret) { if (ret == -ENOENT) - pr_info("snap %s/%s@%s does not exist\n", + pr_info("snap %s/%s%s%s@%s does not exist\n", rbd_dev->spec->pool_name, + rbd_dev->spec->pool_ns ?: "", + rbd_dev->spec->pool_ns ? "/" : "", rbd_dev->spec->image_name, rbd_dev->spec->snap_name); goto err_out_probe; From dfeb84d4adfda0448b013b8aae2cc5c95eb27ccd Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 2 Jul 2018 15:55:23 +0800 Subject: [PATCH 09/40] ceph: fix incorrect use of strncpy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC8 prints following warning: fs/ceph/mds_client.c:3683:2: warning: ‘strncpy’ output may be truncated copying 64 bytes from a string of length 64 [-Wstringop-truncation] [ Change to strscpy() while at it. ] Signed-off-by: "Yan, Zheng" Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index dc8bc664a871..f1590256d2e6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3644,8 +3644,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) init_rwsem(&mdsc->pool_perm_rwsem); mdsc->pool_perm_tree = RB_ROOT; - strncpy(mdsc->nodename, utsname()->nodename, - sizeof(mdsc->nodename) - 1); + strscpy(mdsc->nodename, utsname()->nodename, + sizeof(mdsc->nodename)); return 0; } From 61ad36d47dd273b7b8c3d63fc8359d96f7976c79 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Mon, 9 Jul 2018 22:48:07 +0800 Subject: [PATCH 10/40] ceph: return errors from posix_acl_equiv_mode() correctly In order to return correct error code should replace variable ret using err in error case. Signed-off-by: Chengguang Xu Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/acl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 3351ea14390b..027408d55aee 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -185,10 +185,10 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, return err; if (acl) { - int ret = posix_acl_equiv_mode(acl, mode); - if (ret < 0) + err = posix_acl_equiv_mode(acl, mode); + if (err < 0) goto out_err; - if (ret == 0) { + if (err == 0) { posix_acl_release(acl); acl = NULL; } From 0459871c4995d0bd57d6b1694f0971ec5cabafc1 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Mon, 9 Jul 2018 22:17:30 +0800 Subject: [PATCH 11/40] ceph: add d_drop for some error cases in ceph_mknod() When file num exceeds quota limit or fails from ceph_per_init_acls() should call d_drop to drop dentry from cache as well. Signed-off-by: Chengguang Xu Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 036ac0f3a393..4b6a49fd2a61 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -827,12 +827,14 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; - if (ceph_quota_is_max_files_exceeded(dir)) - return -EDQUOT; + if (ceph_quota_is_max_files_exceeded(dir)) { + err = -EDQUOT; + goto out; + } err = ceph_pre_init_acls(dir, &mode, &acls); if (err < 0) - return err; + goto out; dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n", dir, dentry, mode, rdev); From 67fcd151400242757edbab710402161b2cd38989 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Mon, 9 Jul 2018 22:17:31 +0800 Subject: [PATCH 12/40] ceph: add d_drop for some error cases in ceph_symlink() When file num exceeds quota limit, should call d_drop to drop dentry from cache as well. Signed-off-by: Chengguang Xu Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/dir.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4b6a49fd2a61..0210b02b86d4 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -885,8 +885,10 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; - if (ceph_quota_is_max_files_exceeded(dir)) - return -EDQUOT; + if (ceph_quota_is_max_files_exceeded(dir)) { + err = -EDQUOT; + goto out; + } dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); From 473bd2d780d1699d81b25f57c0ec4de633a28eb8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Jul 2018 22:18:34 +0200 Subject: [PATCH 13/40] libceph: use timespec64 in for keepalive2 and ticket validity ceph_con_keepalive_expired() is the last user of timespec_add() and some of the last uses of ktime_get_real_ts(). Replacing this with timespec64 based interfaces lets us remove that deprecated API. I'm introducing new ceph_encode_timespec64()/ceph_decode_timespec64() here that take timespec64 structures and convert to/from ceph_timespec, which is defined to have an unsigned 32-bit tv_sec member. This extends the range of valid times to year 2106, avoiding the year 2038 overflow. The ceph file system portion still uses the old functions for inode timestamps, this will be done separately after the VFS layer is converted. Signed-off-by: Arnd Bergmann Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- include/linux/ceph/decode.h | 20 +++++++++++++++++++- include/linux/ceph/messenger.h | 2 +- net/ceph/auth_x.c | 14 +++++++------- net/ceph/auth_x.h | 2 +- net/ceph/cls_lock_client.c | 4 ++-- net/ceph/messenger.c | 20 ++++++++++---------- 6 files changed, 40 insertions(+), 22 deletions(-) diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index d143ac8879c6..094b9b4a34f3 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -194,8 +194,26 @@ ceph_decode_skip_n(p, end, sizeof(u8), bad) } while (0) /* - * struct ceph_timespec <-> struct timespec + * struct ceph_timespec <-> struct timespec64 */ +static inline void ceph_decode_timespec64(struct timespec64 *ts, + const struct ceph_timespec *tv) +{ + /* + * This will still overflow in year 2106. We could extend + * the protocol to steal two more bits from tv_nsec to + * add three more 136 year epochs after that the way ext4 + * does if necessary. + */ + ts->tv_sec = (time64_t)le32_to_cpu(tv->tv_sec); + ts->tv_nsec = (long)le32_to_cpu(tv->tv_nsec); +} +static inline void ceph_encode_timespec64(struct ceph_timespec *tv, + const struct timespec64 *ts) +{ + tv->tv_sec = cpu_to_le32((u32)ts->tv_sec); + tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec); +} static inline void ceph_decode_timespec(struct timespec *ts, const struct ceph_timespec *tv) { diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index c7dfcb8a1fb2..a718b877c597 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -330,7 +330,7 @@ struct ceph_connection { int in_base_pos; /* bytes read */ __le64 in_temp_ack; /* for reading an ack */ - struct timespec last_keepalive_ack; /* keepalive2 ack stamp */ + struct timespec64 last_keepalive_ack; /* keepalive2 ack stamp */ struct delayed_work work; /* send|recv work */ unsigned long delay; /* current delay interval */ diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 2f4a1baf5f52..b05c3a540a5a 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -149,12 +149,12 @@ static int process_one_ticket(struct ceph_auth_client *ac, void *dp, *dend; int dlen; char is_enc; - struct timespec validity; + struct timespec64 validity; void *tp, *tpend; void **ptp; struct ceph_crypto_key new_session_key = { 0 }; struct ceph_buffer *new_ticket_blob; - unsigned long new_expires, new_renew_after; + time64_t new_expires, new_renew_after; u64 new_secret_id; int ret; @@ -189,11 +189,11 @@ static int process_one_ticket(struct ceph_auth_client *ac, if (ret) goto out; - ceph_decode_timespec(&validity, dp); + ceph_decode_timespec64(&validity, dp); dp += sizeof(struct ceph_timespec); - new_expires = get_seconds() + validity.tv_sec; + new_expires = ktime_get_real_seconds() + validity.tv_sec; new_renew_after = new_expires - (validity.tv_sec / 4); - dout(" expires=%lu renew_after=%lu\n", new_expires, + dout(" expires=%llu renew_after=%llu\n", new_expires, new_renew_after); /* ticket blob for service */ @@ -385,13 +385,13 @@ static bool need_key(struct ceph_x_ticket_handler *th) if (!th->have_key) return true; - return get_seconds() >= th->renew_after; + return ktime_get_real_seconds() >= th->renew_after; } static bool have_key(struct ceph_x_ticket_handler *th) { if (th->have_key) { - if (get_seconds() >= th->expires) + if (ktime_get_real_seconds() >= th->expires) th->have_key = false; } diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index 454cb54568af..57ba99f7736f 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -22,7 +22,7 @@ struct ceph_x_ticket_handler { u64 secret_id; struct ceph_buffer *ticket_blob; - unsigned long renew_after, expires; + time64_t renew_after, expires; }; #define CEPHX_AU_ENC_BUF_LEN 128 /* big enough for encrypted blob */ diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c index 8d2032b2f225..2105a6eaa66c 100644 --- a/net/ceph/cls_lock_client.c +++ b/net/ceph/cls_lock_client.c @@ -32,7 +32,7 @@ int ceph_cls_lock(struct ceph_osd_client *osdc, int desc_len = strlen(desc); void *p, *end; struct page *lock_op_page; - struct timespec mtime; + struct timespec64 mtime; int ret; lock_op_buf_size = name_len + sizeof(__le32) + @@ -63,7 +63,7 @@ int ceph_cls_lock(struct ceph_osd_client *osdc, ceph_encode_string(&p, end, desc, desc_len); /* only support infinite duration */ memset(&mtime, 0, sizeof(mtime)); - ceph_encode_timespec(p, &mtime); + ceph_encode_timespec64(p, &mtime); p += sizeof(struct ceph_timespec); ceph_encode_8(&p, flags); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index c6413c360771..3f6336248509 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1417,11 +1417,11 @@ static void prepare_write_keepalive(struct ceph_connection *con) dout("prepare_write_keepalive %p\n", con); con_out_kvec_reset(con); if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { - struct timespec now; + struct timespec64 now; - ktime_get_real_ts(&now); + ktime_get_real_ts64(&now); con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); - ceph_encode_timespec(&con->out_temp_keepalive2, &now); + ceph_encode_timespec64(&con->out_temp_keepalive2, &now); con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), &con->out_temp_keepalive2); } else { @@ -2555,7 +2555,7 @@ static int read_keepalive_ack(struct ceph_connection *con) int ret = read_partial(con, size, size, &ceph_ts); if (ret <= 0) return ret; - ceph_decode_timespec(&con->last_keepalive_ack, &ceph_ts); + ceph_decode_timespec64(&con->last_keepalive_ack, &ceph_ts); prepare_read_tag(con); return 1; } @@ -3223,12 +3223,12 @@ bool ceph_con_keepalive_expired(struct ceph_connection *con, { if (interval > 0 && (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) { - struct timespec now; - struct timespec ts; - ktime_get_real_ts(&now); - jiffies_to_timespec(interval, &ts); - ts = timespec_add(con->last_keepalive_ack, ts); - return timespec_compare(&now, &ts) >= 0; + struct timespec64 now; + struct timespec64 ts; + ktime_get_real_ts64(&now); + jiffies_to_timespec64(interval, &ts); + ts = timespec64_add(con->last_keepalive_ack, ts); + return timespec64_compare(&now, &ts) >= 0; } return false; } From 63ecae7e439f766e88c91da56955bca4db06802a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Jul 2018 22:18:35 +0200 Subject: [PATCH 14/40] ceph: stop using current_kernel_time() ceph_mdsc_create_request() is one of the last callers of the deprecated current_kernel_time() as well as timespec_trunc(). This changes it to use the timespec64 based interfaces instead, though we still need to convert the result until we are ready to change over req->r_stamp. The output of the two functions, ktime_get_coarse_real_ts64() and current_kernel_time() is the same coarse-granular timestamp, the only difference here is that ktime_get_coarse_real_ts64() doesn't overflow in 2038. Signed-off-by: Arnd Bergmann Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f1590256d2e6..d2679dce7332 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1779,6 +1779,7 @@ struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) { struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); + struct timespec64 ts; if (!req) return ERR_PTR(-ENOMEM); @@ -1797,7 +1798,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); - req->r_stamp = timespec_trunc(current_kernel_time(), mdsc->fsc->sb->s_time_gran); + ktime_get_coarse_real_ts64(&ts); + req->r_stamp = timespec64_to_timespec(timespec64_trunc(ts, + mdsc->fsc->sb->s_time_gran)); req->r_op = op; req->r_direct_mode = mode; From 9bbeab41ce50542624ef381e7852d70f2f39a2b1 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Jul 2018 22:18:36 +0200 Subject: [PATCH 15/40] ceph: use timespec64 for inode timestamp Since the vfs structures are all using timespec64, we can now change the internal representation, using ceph_encode_timespec64 and ceph_decode_timespec64. In case of ceph_aux_inode however, we need to avoid doing a memcmp() on uninitialized padding data, so the members of the i_mtime field get copied individually into 64-bit integers. Signed-off-by: Arnd Bergmann Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/cache.c | 11 ++++--- fs/ceph/caps.c | 26 +++++++-------- fs/ceph/dir.c | 6 ++-- fs/ceph/inode.c | 76 +++++++++++++++++++++----------------------- fs/ceph/mds_client.c | 7 ++-- fs/ceph/snap.c | 6 ++-- fs/ceph/super.h | 9 +++--- fs/ceph/xattr.c | 4 +-- 8 files changed, 71 insertions(+), 74 deletions(-) diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 362900e42424..1bf3502bdd6f 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -25,8 +25,9 @@ #include "cache.h" struct ceph_aux_inode { - u64 version; - struct timespec mtime; + u64 version; + u64 mtime_sec; + u64 mtime_nsec; }; struct fscache_netfs ceph_cache_netfs = { @@ -130,7 +131,8 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( memset(&aux, 0, sizeof(aux)); aux.version = ci->i_version; - aux.mtime = timespec64_to_timespec(inode->i_mtime); + aux.mtime_sec = inode->i_mtime.tv_sec; + aux.mtime_nsec = inode->i_mtime.tv_nsec; if (memcmp(data, &aux, sizeof(aux)) != 0) return FSCACHE_CHECKAUX_OBSOLETE; @@ -163,7 +165,8 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) if (!ci->fscache) { memset(&aux, 0, sizeof(aux)); aux.version = ci->i_version; - aux.mtime = timespec64_to_timespec(inode->i_mtime); + aux.mtime_sec = inode->i_mtime.tv_sec; + aux.mtime_nsec = inode->i_mtime.tv_nsec; ci->fscache = fscache_acquire_cookie(fsc->fscache, &ceph_fscache_inode_object_def, &ci->i_vino, sizeof(ci->i_vino), diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 990258cbd836..f50cc008632a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1125,7 +1125,7 @@ struct cap_msg_args { u64 flush_tid, oldest_flush_tid, size, max_size; u64 xattr_version; struct ceph_buffer *xattr_buf; - struct timespec atime, mtime, ctime; + struct timespec64 atime, mtime, ctime; int op, caps, wanted, dirty; u32 seq, issue_seq, mseq, time_warp_seq; u32 flags; @@ -1146,7 +1146,7 @@ static int send_cap_msg(struct cap_msg_args *arg) struct ceph_msg *msg; void *p; size_t extra_len; - struct timespec zerotime = {0}; + struct timespec64 zerotime = {0}; struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc; dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" @@ -1186,9 +1186,9 @@ static int send_cap_msg(struct cap_msg_args *arg) fc->size = cpu_to_le64(arg->size); fc->max_size = cpu_to_le64(arg->max_size); - ceph_encode_timespec(&fc->mtime, &arg->mtime); - ceph_encode_timespec(&fc->atime, &arg->atime); - ceph_encode_timespec(&fc->ctime, &arg->ctime); + ceph_encode_timespec64(&fc->mtime, &arg->mtime); + ceph_encode_timespec64(&fc->atime, &arg->atime); + ceph_encode_timespec64(&fc->ctime, &arg->ctime); fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq); fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid)); @@ -1237,7 +1237,7 @@ static int send_cap_msg(struct cap_msg_args *arg) * We just zero these out for now, as the MDS ignores them unless * the requisite feature flags are set (which we don't do yet). */ - ceph_encode_timespec(p, &zerotime); + ceph_encode_timespec64(p, &zerotime); p += sizeof(struct ceph_timespec); ceph_encode_64(&p, 0); @@ -1360,9 +1360,9 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, arg.xattr_buf = NULL; } - arg.mtime = timespec64_to_timespec(inode->i_mtime); - arg.atime = timespec64_to_timespec(inode->i_atime); - arg.ctime = timespec64_to_timespec(inode->i_ctime); + arg.mtime = inode->i_mtime; + arg.atime = inode->i_atime; + arg.ctime = inode->i_ctime; arg.op = op; arg.caps = cap->implemented; @@ -3148,11 +3148,11 @@ static void handle_cap_grant(struct inode *inode, } if (newcaps & CEPH_CAP_ANY_RD) { - struct timespec mtime, atime, ctime; + struct timespec64 mtime, atime, ctime; /* ctime/mtime/atime? */ - ceph_decode_timespec(&mtime, &grant->mtime); - ceph_decode_timespec(&atime, &grant->atime); - ceph_decode_timespec(&ctime, &grant->ctime); + ceph_decode_timespec64(&mtime, &grant->mtime); + ceph_decode_timespec64(&atime, &grant->atime); + ceph_decode_timespec64(&ctime, &grant->ctime); ceph_fill_file_time(inode, extra_info->issued, le32_to_cpu(grant->time_warp_seq), &ctime, &mtime, &atime); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 0210b02b86d4..82928cea0209 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1397,7 +1397,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, " rfiles: %20lld\n" " rsubdirs: %20lld\n" "rbytes: %20lld\n" - "rctime: %10ld.%09ld\n", + "rctime: %10lld.%09ld\n", ci->i_files + ci->i_subdirs, ci->i_files, ci->i_subdirs, @@ -1405,8 +1405,8 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, ci->i_rfiles, ci->i_rsubdirs, ci->i_rbytes, - (long)ci->i_rctime.tv_sec, - (long)ci->i_rctime.tv_nsec); + ci->i_rctime.tv_sec, + ci->i_rctime.tv_nsec); } if (*ppos >= dfi->dir_info_len) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index a866be999216..715a6f2a4613 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -658,13 +658,10 @@ int ceph_fill_file_size(struct inode *inode, int issued, } void ceph_fill_file_time(struct inode *inode, int issued, - u64 time_warp_seq, struct timespec *ctime, - struct timespec *mtime, struct timespec *atime) + u64 time_warp_seq, struct timespec64 *ctime, + struct timespec64 *mtime, struct timespec64 *atime) { struct ceph_inode_info *ci = ceph_inode(inode); - struct timespec64 ctime64 = timespec_to_timespec64(*ctime); - struct timespec64 mtime64 = timespec_to_timespec64(*mtime); - struct timespec64 atime64 = timespec_to_timespec64(*atime); int warn = 0; if (issued & (CEPH_CAP_FILE_EXCL| @@ -673,39 +670,39 @@ void ceph_fill_file_time(struct inode *inode, int issued, CEPH_CAP_AUTH_EXCL| CEPH_CAP_XATTR_EXCL)) { if (ci->i_version == 0 || - timespec64_compare(&ctime64, &inode->i_ctime) > 0) { + timespec64_compare(ctime, &inode->i_ctime) > 0) { dout("ctime %lld.%09ld -> %lld.%09ld inc w/ cap\n", - (long long)inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, - (long long)ctime->tv_sec, ctime->tv_nsec); - inode->i_ctime = ctime64; + inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + ctime->tv_sec, ctime->tv_nsec); + inode->i_ctime = *ctime; } if (ci->i_version == 0 || ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) { /* the MDS did a utimes() */ dout("mtime %lld.%09ld -> %lld.%09ld " "tw %d -> %d\n", - (long long)inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, - (long long)mtime->tv_sec, mtime->tv_nsec, + inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + mtime->tv_sec, mtime->tv_nsec, ci->i_time_warp_seq, (int)time_warp_seq); - inode->i_mtime = mtime64; - inode->i_atime = atime64; + inode->i_mtime = *mtime; + inode->i_atime = *atime; ci->i_time_warp_seq = time_warp_seq; } else if (time_warp_seq == ci->i_time_warp_seq) { /* nobody did utimes(); take the max */ - if (timespec64_compare(&mtime64, &inode->i_mtime) > 0) { + if (timespec64_compare(mtime, &inode->i_mtime) > 0) { dout("mtime %lld.%09ld -> %lld.%09ld inc\n", - (long long)inode->i_mtime.tv_sec, + inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, - (long long)mtime->tv_sec, mtime->tv_nsec); - inode->i_mtime = mtime64; + mtime->tv_sec, mtime->tv_nsec); + inode->i_mtime = *mtime; } - if (timespec64_compare(&atime64, &inode->i_atime) > 0) { + if (timespec64_compare(atime, &inode->i_atime) > 0) { dout("atime %lld.%09ld -> %lld.%09ld inc\n", - (long long)inode->i_atime.tv_sec, + inode->i_atime.tv_sec, inode->i_atime.tv_nsec, - (long long)atime->tv_sec, atime->tv_nsec); - inode->i_atime = atime64; + atime->tv_sec, atime->tv_nsec); + inode->i_atime = *atime; } } else if (issued & CEPH_CAP_FILE_EXCL) { /* we did a utimes(); ignore mds values */ @@ -715,9 +712,9 @@ void ceph_fill_file_time(struct inode *inode, int issued, } else { /* we have no write|excl caps; whatever the MDS says is true */ if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { - inode->i_ctime = ctime64; - inode->i_mtime = mtime64; - inode->i_atime = atime64; + inode->i_ctime = *ctime; + inode->i_mtime = *mtime; + inode->i_atime = *atime; ci->i_time_warp_seq = time_warp_seq; } else { warn = 1; @@ -743,7 +740,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); int issued, new_issued, info_caps; - struct timespec mtime, atime, ctime; + struct timespec64 mtime, atime, ctime; struct ceph_buffer *xattr_blob = NULL; struct ceph_string *pool_ns = NULL; struct ceph_cap *new_cap = NULL; @@ -823,9 +820,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (new_version || (new_issued & CEPH_CAP_ANY_RD)) { /* be careful with mtime, atime, size */ - ceph_decode_timespec(&atime, &info->atime); - ceph_decode_timespec(&mtime, &info->mtime); - ceph_decode_timespec(&ctime, &info->ctime); + ceph_decode_timespec64(&atime, &info->atime); + ceph_decode_timespec64(&mtime, &info->mtime); + ceph_decode_timespec64(&ctime, &info->ctime); ceph_fill_file_time(inode, issued, le32_to_cpu(info->time_warp_seq), &ctime, &mtime, &atime); @@ -872,7 +869,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, ci->i_rbytes = le64_to_cpu(info->rbytes); ci->i_rfiles = le64_to_cpu(info->rfiles); ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); - ceph_decode_timespec(&ci->i_rctime, &info->rctime); + ceph_decode_timespec64(&ci->i_rctime, &info->rctime); } } @@ -1954,7 +1951,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) int err = 0; int inode_dirty_flags = 0; bool lock_snap_rwsem = false; - struct timespec ts; prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) @@ -2030,8 +2026,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if (ia_valid & ATTR_ATIME) { dout("setattr %p atime %lld.%ld -> %lld.%ld\n", inode, - (long long)inode->i_atime.tv_sec, inode->i_atime.tv_nsec, - (long long)attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); + inode->i_atime.tv_sec, inode->i_atime.tv_nsec, + attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec); if (issued & CEPH_CAP_FILE_EXCL) { ci->i_time_warp_seq++; inode->i_atime = attr->ia_atime; @@ -2043,8 +2039,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) dirtied |= CEPH_CAP_FILE_WR; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || !timespec64_equal(&inode->i_atime, &attr->ia_atime)) { - ts = timespec64_to_timespec(attr->ia_atime); - ceph_encode_timespec(&req->r_args.setattr.atime, &ts); + ceph_encode_timespec64(&req->r_args.setattr.atime, + &attr->ia_atime); mask |= CEPH_SETATTR_ATIME; release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; @@ -2052,8 +2048,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } if (ia_valid & ATTR_MTIME) { dout("setattr %p mtime %lld.%ld -> %lld.%ld\n", inode, - (long long)inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, - (long long)attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); + inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec, + attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec); if (issued & CEPH_CAP_FILE_EXCL) { ci->i_time_warp_seq++; inode->i_mtime = attr->ia_mtime; @@ -2065,8 +2061,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) dirtied |= CEPH_CAP_FILE_WR; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || !timespec64_equal(&inode->i_mtime, &attr->ia_mtime)) { - ts = timespec64_to_timespec(attr->ia_mtime); - ceph_encode_timespec(&req->r_args.setattr.mtime, &ts); + ceph_encode_timespec64(&req->r_args.setattr.mtime, + &attr->ia_mtime); mask |= CEPH_SETATTR_MTIME; release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; @@ -2097,8 +2093,8 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME| ATTR_MODE|ATTR_UID|ATTR_GID)) == 0; dout("setattr %p ctime %lld.%ld -> %lld.%ld (%s)\n", inode, - (long long)inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, - (long long)attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, + inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, + attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec, only ? "ctime only" : "ignored"); if (only) { /* diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d2679dce7332..a1a510438b7c 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2961,15 +2961,12 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, rec.v2.flock_len = (__force __le32) ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); } else { - struct timespec ts; rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v1.issued = cpu_to_le32(cap->issued); rec.v1.size = cpu_to_le64(inode->i_size); - ts = timespec64_to_timespec(inode->i_mtime); - ceph_encode_timespec(&rec.v1.mtime, &ts); - ts = timespec64_to_timespec(inode->i_atime); - ceph_encode_timespec(&rec.v1.atime, &ts); + ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime); + ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v1.pathbase = cpu_to_le64(pathbase); } diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index af81555c14fd..041c27ea8de1 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -594,9 +594,9 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, BUG_ON(capsnap->writing); capsnap->size = inode->i_size; - capsnap->mtime = timespec64_to_timespec(inode->i_mtime); - capsnap->atime = timespec64_to_timespec(inode->i_atime); - capsnap->ctime = timespec64_to_timespec(inode->i_ctime); + capsnap->mtime = inode->i_mtime; + capsnap->atime = inode->i_atime; + capsnap->ctime = inode->i_ctime; capsnap->time_warp_seq = ci->i_time_warp_seq; capsnap->truncate_size = ci->i_truncate_size; capsnap->truncate_seq = ci->i_truncate_seq; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a7077a0c989f..d3eb70e2b527 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -193,7 +193,7 @@ struct ceph_cap_snap { u64 xattr_version; u64 size; - struct timespec mtime, atime, ctime; + struct timespec64 mtime, atime, ctime; u64 time_warp_seq; u64 truncate_size; u32 truncate_seq; @@ -307,7 +307,7 @@ struct ceph_inode_info { char *i_symlink; /* for dirs */ - struct timespec i_rctime; + struct timespec64 i_rctime; u64 i_rbytes, i_rfiles, i_rsubdirs; u64 i_files, i_subdirs; @@ -857,8 +857,9 @@ extern struct inode *ceph_get_snapdir(struct inode *parent); extern int ceph_fill_file_size(struct inode *inode, int issued, u32 truncate_seq, u64 truncate_size, u64 size); extern void ceph_fill_file_time(struct inode *inode, int issued, - u64 time_warp_seq, struct timespec *ctime, - struct timespec *mtime, struct timespec *atime); + u64 time_warp_seq, struct timespec64 *ctime, + struct timespec64 *mtime, + struct timespec64 *atime); extern int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req); extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 5bc8edb4c2a6..5cc8b94f8206 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -224,8 +224,8 @@ static size_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, size_t size) { - return snprintf(val, size, "%ld.09%ld", (long)ci->i_rctime.tv_sec, - (long)ci->i_rctime.tv_nsec); + return snprintf(val, size, "%lld.09%ld", ci->i_rctime.tv_sec, + ci->i_rctime.tv_nsec); } /* quotas */ From fac02ddf910814c24f5d9d969dfdab5227f6f3eb Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Jul 2018 22:18:37 +0200 Subject: [PATCH 16/40] libceph: use timespec64 for r_mtime The request mtime field is used all over ceph, and is currently represented as a 'timespec' structure in Linux. This changes it to timespec64 to allow times beyond 2038, modifying all users at the same time. [ Remove now redundant ts variable in writepage_nounlock(). ] Signed-off-by: Arnd Bergmann Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 +- fs/ceph/addr.c | 12 +++++------- fs/ceph/file.c | 8 ++++---- include/linux/ceph/osd_client.h | 6 +++--- net/ceph/osd_client.c | 8 ++++---- 5 files changed, 17 insertions(+), 19 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index c680de15fae0..11f9be10e3fa 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1466,7 +1466,7 @@ static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request) struct ceph_osd_request *osd_req = obj_request->osd_req; osd_req->r_flags = CEPH_OSD_FLAG_WRITE; - ktime_get_real_ts(&osd_req->r_mtime); + ktime_get_real_ts64(&osd_req->r_mtime); osd_req->r_data_offset = obj_request->ex.oe_off; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 292b3d72d725..afcc59ed7090 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -574,7 +574,6 @@ static u64 get_writepages_data_length(struct inode *inode, */ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) { - struct timespec ts; struct inode *inode; struct ceph_inode_info *ci; struct ceph_fs_client *fsc; @@ -625,12 +624,11 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); set_page_writeback(page); - ts = timespec64_to_timespec(inode->i_mtime); err = ceph_osdc_writepages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, snapc, page_off, len, ceph_wbc.truncate_seq, ceph_wbc.truncate_size, - &ts, &page, 1); + &inode->i_mtime, &page, 1); if (err < 0) { struct writeback_control tmp_wbc; if (!wbc) @@ -1134,7 +1132,7 @@ new_request: pages = NULL; } - req->r_mtime = timespec64_to_timespec(inode->i_mtime); + req->r_mtime = inode->i_mtime; rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); BUG_ON(rc); req = NULL; @@ -1734,7 +1732,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out; } - req->r_mtime = timespec64_to_timespec(inode->i_mtime); + req->r_mtime = inode->i_mtime; err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1776,7 +1774,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) goto out_put; } - req->r_mtime = timespec64_to_timespec(inode->i_mtime); + req->r_mtime = inode->i_mtime; err = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); @@ -1937,7 +1935,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, 0, false, true); err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); - wr_req->r_mtime = timespec64_to_timespec(ci->vfs_inode.i_mtime); + wr_req->r_mtime = ci->vfs_inode.i_mtime; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ad0bed99b1d5..2f3a30ca94bf 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -721,7 +721,7 @@ struct ceph_aio_request { struct list_head osd_reqs; unsigned num_reqs; atomic_t pending_reqs; - struct timespec mtime; + struct timespec64 mtime; struct ceph_cap_flush *prealloc_cf; }; @@ -923,7 +923,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, int num_pages = 0; int flags; int ret; - struct timespec mtime = timespec64_to_timespec(current_time(inode)); + struct timespec64 mtime = current_time(inode); size_t count = iov_iter_count(iter); loff_t pos = iocb->ki_pos; bool write = iov_iter_rw(iter) == WRITE; @@ -1131,7 +1131,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, int flags; int ret; bool check_caps = false; - struct timespec mtime = timespec64_to_timespec(current_time(inode)); + struct timespec64 mtime = current_time(inode); size_t count = iov_iter_count(from); if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) @@ -1663,7 +1663,7 @@ static int ceph_zero_partial_object(struct inode *inode, goto out; } - req->r_mtime = timespec64_to_timespec(inode->i_mtime); + req->r_mtime = inode->i_mtime; ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); if (!ret) { ret = ceph_osdc_wait_request(&fsc->client->osdc, req); diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index d6fe7e4df8cf..02096da01845 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -199,7 +199,7 @@ struct ceph_osd_request { /* set by submitter */ u64 r_snapid; /* for reads, CEPH_NOSNAP o/w */ struct ceph_snap_context *r_snapc; /* for writes */ - struct timespec r_mtime; /* ditto */ + struct timespec64 r_mtime; /* ditto */ u64 r_data_offset; /* ditto */ bool r_linger; /* don't resend on failure */ @@ -253,7 +253,7 @@ struct ceph_osd_linger_request { struct ceph_osd_request_target t; u32 map_dne_bound; - struct timespec mtime; + struct timespec64 mtime; struct kref kref; struct mutex lock; @@ -508,7 +508,7 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_snap_context *sc, u64 off, u64 len, u32 truncate_seq, u64 truncate_size, - struct timespec *mtime, + struct timespec64 *mtime, struct page **pages, int nr_pages); /* watch/notify */ diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 93614cb5e03f..8002b8e9ce24 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1978,7 +1978,7 @@ static void encode_request_partial(struct ceph_osd_request *req, p += sizeof(struct ceph_blkin_trace_info); ceph_encode_32(&p, 0); /* client_inc, always 0 */ - ceph_encode_timespec(p, &req->r_mtime); + ceph_encode_timespec64(p, &req->r_mtime); p += sizeof(struct ceph_timespec); encode_oloc(&p, end, &req->r_t.target_oloc); @@ -4512,7 +4512,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, ceph_oid_copy(&lreq->t.base_oid, oid); ceph_oloc_copy(&lreq->t.base_oloc, oloc); lreq->t.flags = CEPH_OSD_FLAG_WRITE; - ktime_get_real_ts(&lreq->mtime); + ktime_get_real_ts64(&lreq->mtime); lreq->reg_req = alloc_linger_request(lreq); if (!lreq->reg_req) { @@ -4570,7 +4570,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc, ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); req->r_flags = CEPH_OSD_FLAG_WRITE; - ktime_get_real_ts(&req->r_mtime); + ktime_get_real_ts64(&req->r_mtime); osd_req_op_watch_init(req, 0, lreq->linger_id, CEPH_OSD_WATCH_OP_UNWATCH); @@ -5136,7 +5136,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino, struct ceph_snap_context *snapc, u64 off, u64 len, u32 truncate_seq, u64 truncate_size, - struct timespec *mtime, + struct timespec64 *mtime, struct page **pages, int num_pages) { struct ceph_osd_request *req; From 0ed1e90a09eb0e2863cab43e1ed5c5df89566772 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 13 Jul 2018 22:18:38 +0200 Subject: [PATCH 17/40] ceph: use timespec64 for r_stamp The ceph_mds_request stamp still uses the deprecated timespec structure, this converts it over as well. Signed-off-by: Arnd Bergmann Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 2 +- fs/ceph/mds_client.c | 7 +++---- fs/ceph/mds_client.h | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 715a6f2a4613..d62f65f2875d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2136,7 +2136,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) req->r_inode_drop = release; req->r_args.setattr.mask = cpu_to_le32(mask); req->r_num_caps = 1; - req->r_stamp = timespec64_to_timespec(attr->ia_ctime); + req->r_stamp = attr->ia_ctime; err = ceph_mdsc_do_request(mdsc, NULL, req); } dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index a1a510438b7c..f5a299daae95 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1799,8 +1799,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) INIT_LIST_HEAD(&req->r_unsafe_item); ktime_get_coarse_real_ts64(&ts); - req->r_stamp = timespec64_to_timespec(timespec64_trunc(ts, - mdsc->fsc->sb->s_time_gran)); + req->r_stamp = timespec64_trunc(ts, mdsc->fsc->sb->s_time_gran); req->r_op = op; req->r_direct_mode = mode; @@ -2097,7 +2096,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, /* time stamp */ { struct ceph_timespec ts; - ceph_encode_timespec(&ts, &req->r_stamp); + ceph_encode_timespec64(&ts, &req->r_stamp); ceph_encode_copy(&p, &ts, sizeof(ts)); } @@ -2190,7 +2189,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, p = msg->front.iov_base + req->r_request_release_offset; { struct ceph_timespec ts; - ceph_encode_timespec(&ts, &req->r_stamp); + ceph_encode_timespec64(&ts, &req->r_stamp); ceph_encode_copy(&p, &ts, sizeof(ts)); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 2ec3b5b35067..80a7523e1523 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -229,7 +229,7 @@ struct ceph_mds_request { int r_fmode; /* file mode, if expecting cap */ kuid_t r_uid; kgid_t r_gid; - struct timespec r_stamp; + struct timespec64 r_stamp; /* for choosing which mds to send this request to */ int r_direct_mode; From f7e52d8efe8588c5d4b4c78eb33da81d89486c1a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 23 Jul 2018 14:11:40 +0200 Subject: [PATCH 18/40] libceph: remove now unused ceph_{en,de}code_timespec() Signed-off-by: Ilya Dryomov --- include/linux/ceph/decode.h | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h index 094b9b4a34f3..a6c2a48d42e0 100644 --- a/include/linux/ceph/decode.h +++ b/include/linux/ceph/decode.h @@ -214,18 +214,6 @@ static inline void ceph_encode_timespec64(struct ceph_timespec *tv, tv->tv_sec = cpu_to_le32((u32)ts->tv_sec); tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec); } -static inline void ceph_decode_timespec(struct timespec *ts, - const struct ceph_timespec *tv) -{ - ts->tv_sec = (__kernel_time_t)le32_to_cpu(tv->tv_sec); - ts->tv_nsec = (long)le32_to_cpu(tv->tv_nsec); -} -static inline void ceph_encode_timespec(struct ceph_timespec *tv, - const struct timespec *ts) -{ - tv->tv_sec = cpu_to_le32((u32)ts->tv_sec); - tv->tv_nsec = cpu_to_le32((u32)ts->tv_nsec); -} /* * sockaddr_storage <-> ceph_sockaddr From 24499847e4471b0aba8126cebd640084136fc2bb Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Mon, 23 Jul 2018 21:32:24 +0530 Subject: [PATCH 19/40] ceph: adding new return type vm_fault_t Use new return type vm_fault_t for page_mkwrite and fault handler. Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 62 ++++++++++++++++++++++++++------------------------ 1 file changed, 32 insertions(+), 30 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index afcc59ed7090..9c332a6f6667 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1429,7 +1429,7 @@ static void ceph_restore_sigs(sigset_t *oldset) /* * vm ops */ -static int ceph_filemap_fault(struct vm_fault *vmf) +static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); @@ -1437,8 +1437,9 @@ static int ceph_filemap_fault(struct vm_fault *vmf) struct ceph_file_info *fi = vma->vm_file->private_data; struct page *pinned_page = NULL; loff_t off = vmf->pgoff << PAGE_SHIFT; - int want, got, ret; + int want, got, err; sigset_t oldset; + vm_fault_t ret = VM_FAULT_SIGBUS; ceph_block_sigs(&oldset); @@ -1450,8 +1451,8 @@ static int ceph_filemap_fault(struct vm_fault *vmf) want = CEPH_CAP_FILE_CACHE; got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); - if (ret < 0) + err = ceph_get_caps(ci, CEPH_CAP_FILE_RD, want, -1, &got, &pinned_page); + if (err < 0) goto out_restore; dout("filemap_fault %p %llu~%zd got cap refs on %s\n", @@ -1463,16 +1464,17 @@ static int ceph_filemap_fault(struct vm_fault *vmf) ceph_add_rw_context(fi, &rw_ctx); ret = filemap_fault(vmf); ceph_del_rw_context(fi, &rw_ctx); + dout("filemap_fault %p %llu~%zd drop cap refs %s ret %x\n", + inode, off, (size_t)PAGE_SIZE, + ceph_cap_string(got), ret); } else - ret = -EAGAIN; + err = -EAGAIN; - dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", - inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got), ret); if (pinned_page) put_page(pinned_page); ceph_put_cap_refs(ci, got); - if (ret != -EAGAIN) + if (err != -EAGAIN) goto out_restore; /* read inline data */ @@ -1480,7 +1482,6 @@ static int ceph_filemap_fault(struct vm_fault *vmf) /* does not support inline data > PAGE_SIZE */ ret = VM_FAULT_SIGBUS; } else { - int ret1; struct address_space *mapping = inode->i_mapping; struct page *page = find_or_create_page(mapping, 0, mapping_gfp_constraint(mapping, @@ -1489,32 +1490,32 @@ static int ceph_filemap_fault(struct vm_fault *vmf) ret = VM_FAULT_OOM; goto out_inline; } - ret1 = __ceph_do_getattr(inode, page, + err = __ceph_do_getattr(inode, page, CEPH_STAT_CAP_INLINE_DATA, true); - if (ret1 < 0 || off >= i_size_read(inode)) { + if (err < 0 || off >= i_size_read(inode)) { unlock_page(page); put_page(page); - if (ret1 < 0) - ret = ret1; + if (err == -ENOMEM) + ret = VM_FAULT_OOM; else ret = VM_FAULT_SIGBUS; goto out_inline; } - if (ret1 < PAGE_SIZE) - zero_user_segment(page, ret1, PAGE_SIZE); + if (err < PAGE_SIZE) + zero_user_segment(page, err, PAGE_SIZE); else flush_dcache_page(page); SetPageUptodate(page); vmf->page = page; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; out_inline: - dout("filemap_fault %p %llu~%zd read inline data ret %d\n", + dout("filemap_fault %p %llu~%zd read inline data ret %x\n", inode, off, (size_t)PAGE_SIZE, ret); } out_restore: ceph_restore_sigs(&oldset); - if (ret < 0) - ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; + if (err < 0) + ret = vmf_error(err); return ret; } @@ -1522,7 +1523,7 @@ out_restore: /* * Reuse write_begin here for simplicity. */ -static int ceph_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); @@ -1533,8 +1534,9 @@ static int ceph_page_mkwrite(struct vm_fault *vmf) loff_t off = page_offset(page); loff_t size = i_size_read(inode); size_t len; - int want, got, ret; + int want, got, err; sigset_t oldset; + vm_fault_t ret = VM_FAULT_SIGBUS; prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) @@ -1548,10 +1550,10 @@ static int ceph_page_mkwrite(struct vm_fault *vmf) lock_page(page); locked_page = page; } - ret = ceph_uninline_data(vma->vm_file, locked_page); + err = ceph_uninline_data(vma->vm_file, locked_page); if (locked_page) unlock_page(locked_page); - if (ret < 0) + if (err < 0) goto out_free; } @@ -1568,9 +1570,9 @@ static int ceph_page_mkwrite(struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER; got = 0; - ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, + err = ceph_get_caps(ci, CEPH_CAP_FILE_WR, want, off + len, &got, NULL); - if (ret < 0) + if (err < 0) goto out_free; dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", @@ -1588,13 +1590,13 @@ static int ceph_page_mkwrite(struct vm_fault *vmf) break; } - ret = ceph_update_writeable_page(vma->vm_file, off, len, page); - if (ret >= 0) { + err = ceph_update_writeable_page(vma->vm_file, off, len, page); + if (err >= 0) { /* success. we'll keep the page locked. */ set_page_dirty(page); ret = VM_FAULT_LOCKED; } - } while (ret == -EAGAIN); + } while (err == -EAGAIN); if (ret == VM_FAULT_LOCKED || ci->i_inline_version != CEPH_INLINE_NONE) { @@ -1608,14 +1610,14 @@ static int ceph_page_mkwrite(struct vm_fault *vmf) __mark_inode_dirty(inode, dirty); } - dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", + dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %x\n", inode, off, len, ceph_cap_string(got), ret); ceph_put_cap_refs(ci, got); out_free: ceph_restore_sigs(&oldset); ceph_free_cap_flush(prealloc_cf); - if (ret < 0) - ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; + if (err < 0) + ret = vmf_error(err); return ret; } From 24e1dd6afde9e9cca405df8ca98be59419ea7229 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Jul 2018 12:29:07 -0700 Subject: [PATCH 20/40] ceph: fix whitespace Remove blank lines at end of file and trailing whitespace. Signed-off-by: Stephen Hemminger Signed-off-by: Ilya Dryomov --- net/ceph/Kconfig | 1 - net/ceph/Makefile | 1 - net/ceph/auth_none.c | 1 - net/ceph/auth_none.h | 1 - net/ceph/auth_x.c | 2 -- net/ceph/auth_x.h | 1 - net/ceph/mon_client.c | 2 +- net/ceph/pagevec.c | 1 - 8 files changed, 1 insertion(+), 9 deletions(-) diff --git a/net/ceph/Kconfig b/net/ceph/Kconfig index f8cceb99e732..cd2d5b9301a1 100644 --- a/net/ceph/Kconfig +++ b/net/ceph/Kconfig @@ -41,4 +41,3 @@ config CEPH_LIB_USE_DNS_RESOLVER Documentation/networking/dns_resolver.txt If unsure, say N. - diff --git a/net/ceph/Makefile b/net/ceph/Makefile index 12bf49772d24..db09defe27d0 100644 --- a/net/ceph/Makefile +++ b/net/ceph/Makefile @@ -15,4 +15,3 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ auth_x.o \ ceph_fs.o ceph_strings.o ceph_hash.o \ pagevec.o snapshot.o string_table.o - diff --git a/net/ceph/auth_none.c b/net/ceph/auth_none.c index 41d2a0c72236..edb7042479ed 100644 --- a/net/ceph/auth_none.c +++ b/net/ceph/auth_none.c @@ -142,4 +142,3 @@ int ceph_auth_none_init(struct ceph_auth_client *ac) ac->ops = &ceph_auth_none_ops; return 0; } - diff --git a/net/ceph/auth_none.h b/net/ceph/auth_none.h index 860ed9875791..4158f064302e 100644 --- a/net/ceph/auth_none.h +++ b/net/ceph/auth_none.h @@ -26,4 +26,3 @@ struct ceph_auth_none_info { int ceph_auth_none_init(struct ceph_auth_client *ac); #endif - diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index b05c3a540a5a..6caac27fca85 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -823,5 +823,3 @@ out_nomem: out: return ret; } - - diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index 57ba99f7736f..c03735f96df9 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -52,4 +52,3 @@ struct ceph_x_info { int ceph_x_init(struct ceph_auth_client *ac); #endif - diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index d7a7a2330ef7..18deb3d889c4 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1249,7 +1249,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) if (monc->client->extra_mon_dispatch && monc->client->extra_mon_dispatch(monc->client, msg) == 0) break; - + pr_err("received unknown message type %d %s\n", type, ceph_msg_type_name(type)); } diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c index e560d3975f41..d3736f5bffec 100644 --- a/net/ceph/pagevec.c +++ b/net/ceph/pagevec.c @@ -197,4 +197,3 @@ void ceph_zero_page_vector_range(int off, int len, struct page **pages) } } EXPORT_SYMBOL(ceph_zero_page_vector_range); - From 262614c4294d33b1f19e0d18c0091d9c329b544a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 26 Jul 2018 15:17:46 +0200 Subject: [PATCH 21/40] libceph: store ceph_auth_handshake pointer in ceph_connection We already copy authorizer_reply_buf and authorizer_reply_buf_len into ceph_connection. Factoring out __prepare_write_connect() requires two more: authorizer_buf and authorizer_buf_len. Store the pointer to the handshake in con->auth rather than piling on. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/messenger.h | 3 +- net/ceph/messenger.c | 54 ++++++++++++++++------------------ 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index a718b877c597..021718570b50 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -286,9 +286,8 @@ struct ceph_connection { attempt for this connection, client */ u32 peer_global_seq; /* peer's global seq for this connection */ + struct ceph_auth_handshake *auth; int auth_retry; /* true if we need a newer authorizer */ - void *auth_reply_buf; /* where to put the authorizer reply */ - int auth_reply_buf_len; struct mutex mutex; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 3f6336248509..b6ebd2cc16a1 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1434,24 +1434,26 @@ static void prepare_write_keepalive(struct ceph_connection *con) * Connection negotiation. */ -static struct ceph_auth_handshake *get_connect_authorizer(struct ceph_connection *con, - int *auth_proto) +static int get_connect_authorizer(struct ceph_connection *con) { struct ceph_auth_handshake *auth; + int auth_proto; if (!con->ops->get_authorizer) { + con->auth = NULL; con->out_connect.authorizer_protocol = CEPH_AUTH_UNKNOWN; con->out_connect.authorizer_len = 0; - return NULL; + return 0; } - auth = con->ops->get_authorizer(con, auth_proto, con->auth_retry); + auth = con->ops->get_authorizer(con, &auth_proto, con->auth_retry); if (IS_ERR(auth)) - return auth; + return PTR_ERR(auth); - con->auth_reply_buf = auth->authorizer_reply_buf; - con->auth_reply_buf_len = auth->authorizer_reply_buf_len; - return auth; + con->auth = auth; + con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); + con->out_connect.authorizer_len = cpu_to_le32(auth->authorizer_buf_len); + return 0; } /* @@ -1471,8 +1473,7 @@ static int prepare_write_connect(struct ceph_connection *con) { unsigned int global_seq = get_global_seq(con->msgr, 0); int proto; - int auth_proto; - struct ceph_auth_handshake *auth; + int ret; switch (con->peer_name.type) { case CEPH_ENTITY_TYPE_MON: @@ -1499,20 +1500,15 @@ static int prepare_write_connect(struct ceph_connection *con) con->out_connect.protocol_version = cpu_to_le32(proto); con->out_connect.flags = 0; - auth_proto = CEPH_AUTH_UNKNOWN; - auth = get_connect_authorizer(con, &auth_proto); - if (IS_ERR(auth)) - return PTR_ERR(auth); - - con->out_connect.authorizer_protocol = cpu_to_le32(auth_proto); - con->out_connect.authorizer_len = auth ? - cpu_to_le32(auth->authorizer_buf_len) : 0; + ret = get_connect_authorizer(con); + if (ret) + return ret; con_out_kvec_add(con, sizeof (con->out_connect), &con->out_connect); - if (auth && auth->authorizer_buf_len) - con_out_kvec_add(con, auth->authorizer_buf_len, - auth->authorizer_buf); + if (con->auth) + con_out_kvec_add(con, con->auth->authorizer_buf_len, + con->auth->authorizer_buf); con->out_more = 0; con_flag_set(con, CON_FLAG_WRITE_PENDING); @@ -1781,11 +1777,14 @@ static int read_partial_connect(struct ceph_connection *con) if (ret <= 0) goto out; - size = le32_to_cpu(con->in_reply.authorizer_len); - end += size; - ret = read_partial(con, end, size, con->auth_reply_buf); - if (ret <= 0) - goto out; + if (con->auth) { + size = le32_to_cpu(con->in_reply.authorizer_len); + end += size; + ret = read_partial(con, end, size, + con->auth->authorizer_reply_buf); + if (ret <= 0) + goto out; + } dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n", con, (int)con->in_reply.tag, @@ -1793,7 +1792,6 @@ static int read_partial_connect(struct ceph_connection *con) le32_to_cpu(con->in_reply.global_seq)); out: return ret; - } /* @@ -2076,7 +2074,7 @@ static int process_connect(struct ceph_connection *con) dout("process_connect on %p tag %d\n", con, (int)con->in_tag); - if (con->auth_reply_buf) { + if (con->auth) { /* * Any connection that defines ->get_authorizer() * should also define ->verify_authorizer_reply(). From c0f56b483aa09c99bfe97409a43ad786f33b8a5a Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 26 Jul 2018 17:43:47 +0200 Subject: [PATCH 22/40] libceph: factor out __prepare_write_connect() Will be used for sending ceph_msg_connect with an updated authorizer, after the server challenges the initial authorizer. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/messenger.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index b6ebd2cc16a1..500cc3da586f 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1469,6 +1469,17 @@ static void prepare_write_banner(struct ceph_connection *con) con_flag_set(con, CON_FLAG_WRITE_PENDING); } +static void __prepare_write_connect(struct ceph_connection *con) +{ + con_out_kvec_add(con, sizeof(con->out_connect), &con->out_connect); + if (con->auth) + con_out_kvec_add(con, con->auth->authorizer_buf_len, + con->auth->authorizer_buf); + + con->out_more = 0; + con_flag_set(con, CON_FLAG_WRITE_PENDING); +} + static int prepare_write_connect(struct ceph_connection *con) { unsigned int global_seq = get_global_seq(con->msgr, 0); @@ -1504,15 +1515,7 @@ static int prepare_write_connect(struct ceph_connection *con) if (ret) return ret; - con_out_kvec_add(con, sizeof (con->out_connect), - &con->out_connect); - if (con->auth) - con_out_kvec_add(con, con->auth->authorizer_buf_len, - con->auth->authorizer_buf); - - con->out_more = 0; - con_flag_set(con, CON_FLAG_WRITE_PENDING); - + __prepare_write_connect(con); return 0; } From c571fe24d243bfe7017f0e67fe800b3cc2a1d1f7 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 26 Jul 2018 18:05:43 +0200 Subject: [PATCH 23/40] libceph: factor out __ceph_x_decrypt() Will be used for decrypting the server challenge which is only preceded by ceph_x_encrypt_header. Drop struct_v check to allow for extending ceph_x_encrypt_header in the future. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/auth_x.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 6caac27fca85..cd1118d106a5 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -70,25 +70,40 @@ static int ceph_x_encrypt(struct ceph_crypto_key *secret, void *buf, return sizeof(u32) + ciphertext_len; } +static int __ceph_x_decrypt(struct ceph_crypto_key *secret, void *p, + int ciphertext_len) +{ + struct ceph_x_encrypt_header *hdr = p; + int plaintext_len; + int ret; + + ret = ceph_crypt(secret, false, p, ciphertext_len, ciphertext_len, + &plaintext_len); + if (ret) + return ret; + + if (le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC) { + pr_err("%s bad magic\n", __func__); + return -EINVAL; + } + + return plaintext_len - sizeof(*hdr); +} + static int ceph_x_decrypt(struct ceph_crypto_key *secret, void **p, void *end) { - struct ceph_x_encrypt_header *hdr = *p + sizeof(u32); - int ciphertext_len, plaintext_len; + int ciphertext_len; int ret; ceph_decode_32_safe(p, end, ciphertext_len, e_inval); ceph_decode_need(p, end, ciphertext_len, e_inval); - ret = ceph_crypt(secret, false, *p, end - *p, ciphertext_len, - &plaintext_len); - if (ret) + ret = __ceph_x_decrypt(secret, *p, ciphertext_len); + if (ret < 0) return ret; - if (hdr->struct_v != 1 || le64_to_cpu(hdr->magic) != CEPHX_ENC_MAGIC) - return -EPERM; - *p += ciphertext_len; - return plaintext_len - sizeof(struct ceph_x_encrypt_header); + return ret; e_inval: return -EINVAL; From 149cac4a50b0b4081b38b2f38de6ef71c27eaa85 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 27 Jul 2018 16:37:54 +0200 Subject: [PATCH 24/40] libceph: factor out encrypt_authorizer() Will be used for encrypting both the initial and updated authorizers. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/auth_x.c | 49 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 13 deletions(-) diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index cd1118d106a5..61cccb93f653 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -290,6 +290,38 @@ bad: return -EINVAL; } +/* + * Encode and encrypt the second part (ceph_x_authorize_b) of the + * authorizer. The first part (ceph_x_authorize_a) should already be + * encoded. + */ +static int encrypt_authorizer(struct ceph_x_authorizer *au) +{ + struct ceph_x_authorize_a *msg_a; + struct ceph_x_authorize_b *msg_b; + void *p, *end; + int ret; + + msg_a = au->buf->vec.iov_base; + WARN_ON(msg_a->ticket_blob.secret_id != cpu_to_le64(au->secret_id)); + p = (void *)(msg_a + 1) + le32_to_cpu(msg_a->ticket_blob.blob_len); + end = au->buf->vec.iov_base + au->buf->vec.iov_len; + + msg_b = p + ceph_x_encrypt_offset(); + msg_b->struct_v = 1; + msg_b->nonce = cpu_to_le64(au->nonce); + + ret = ceph_x_encrypt(&au->session_key, p, end - p, sizeof(*msg_b)); + if (ret < 0) + return ret; + + p += ret; + WARN_ON(p > end); + au->buf->vec.iov_len = p - au->buf->vec.iov_base; + + return 0; +} + static void ceph_x_authorizer_cleanup(struct ceph_x_authorizer *au) { ceph_crypto_key_destroy(&au->session_key); @@ -306,7 +338,6 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, int maxlen; struct ceph_x_authorize_a *msg_a; struct ceph_x_authorize_b *msg_b; - void *p, *end; int ret; int ticket_blob_len = (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0); @@ -350,21 +381,13 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, dout(" th %p secret_id %lld %lld\n", th, th->secret_id, le64_to_cpu(msg_a->ticket_blob.secret_id)); - p = msg_a + 1; - p += ticket_blob_len; - end = au->buf->vec.iov_base + au->buf->vec.iov_len; - - msg_b = p + ceph_x_encrypt_offset(); - msg_b->struct_v = 1; get_random_bytes(&au->nonce, sizeof(au->nonce)); - msg_b->nonce = cpu_to_le64(au->nonce); - ret = ceph_x_encrypt(&au->session_key, p, end - p, sizeof(*msg_b)); - if (ret < 0) + ret = encrypt_authorizer(au); + if (ret) { + pr_err("failed to encrypt authorizer: %d", ret); goto out_au; + } - p += ret; - WARN_ON(p > end); - au->buf->vec.iov_len = p - au->buf->vec.iov_base; dout(" built authorizer nonce %llx len %d\n", au->nonce, (int)au->buf->vec.iov_len); return 0; From 6daca13d2e72bedaaacfc08f873114c9307d5aea Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 27 Jul 2018 19:18:34 +0200 Subject: [PATCH 25/40] libceph: add authorizer challenge When a client authenticates with a service, an authorizer is sent with a nonce to the service (ceph_x_authorize_[ab]) and the service responds with a mutation of that nonce (ceph_x_authorize_reply). This lets the client verify the service is who it says it is but it doesn't protect against a replay: someone can trivially capture the exchange and reuse the same authorizer to authenticate themselves. Allow the service to reject an initial authorizer with a random challenge (ceph_x_authorize_challenge). The client then has to respond with an updated authorizer proving they are able to decrypt the service's challenge and that the new authorizer was produced for this specific connection instance. The accepting side requires this challenge and response unconditionally if the client side advertises they have CEPHX_V2 feature bit. This addresses CVE-2018-1128. Link: http://tracker.ceph.com/issues/24836 Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- fs/ceph/mds_client.c | 11 ++++++ include/linux/ceph/auth.h | 8 ++++ include/linux/ceph/messenger.h | 3 ++ include/linux/ceph/msgr.h | 2 +- net/ceph/auth.c | 16 ++++++++ net/ceph/auth_x.c | 72 +++++++++++++++++++++++++++++++--- net/ceph/auth_x_protocol.h | 7 ++++ net/ceph/messenger.c | 17 +++++++- net/ceph/osd_client.c | 11 ++++++ 9 files changed, 140 insertions(+), 7 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index f5a299daae95..4fc7582233db 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4154,6 +4154,16 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, return auth; } +static int add_authorizer_challenge(struct ceph_connection *con, + void *challenge_buf, int challenge_buf_len) +{ + struct ceph_mds_session *s = con->private; + struct ceph_mds_client *mdsc = s->s_mdsc; + struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; + + return ceph_auth_add_authorizer_challenge(ac, s->s_auth.authorizer, + challenge_buf, challenge_buf_len); +} static int verify_authorizer_reply(struct ceph_connection *con) { @@ -4217,6 +4227,7 @@ static const struct ceph_connection_operations mds_con_ops = { .put = con_put, .dispatch = dispatch, .get_authorizer = get_authorizer, + .add_authorizer_challenge = add_authorizer_challenge, .verify_authorizer_reply = verify_authorizer_reply, .invalidate_authorizer = invalidate_authorizer, .peer_reset = peer_reset, diff --git a/include/linux/ceph/auth.h b/include/linux/ceph/auth.h index e931da8424a4..6728c2ee0205 100644 --- a/include/linux/ceph/auth.h +++ b/include/linux/ceph/auth.h @@ -64,6 +64,10 @@ struct ceph_auth_client_ops { /* ensure that an existing authorizer is up to date */ int (*update_authorizer)(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *auth); + int (*add_authorizer_challenge)(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + void *challenge_buf, + int challenge_buf_len); int (*verify_authorizer_reply)(struct ceph_auth_client *ac, struct ceph_authorizer *a); void (*invalidate_authorizer)(struct ceph_auth_client *ac, @@ -118,6 +122,10 @@ void ceph_auth_destroy_authorizer(struct ceph_authorizer *a); extern int ceph_auth_update_authorizer(struct ceph_auth_client *ac, int peer_type, struct ceph_auth_handshake *a); +int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + void *challenge_buf, + int challenge_buf_len); extern int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, struct ceph_authorizer *a); extern void ceph_auth_invalidate_authorizer(struct ceph_auth_client *ac, diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 021718570b50..fc2b4491ee0a 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -31,6 +31,9 @@ struct ceph_connection_operations { struct ceph_auth_handshake *(*get_authorizer) ( struct ceph_connection *con, int *proto, int force_new); + int (*add_authorizer_challenge)(struct ceph_connection *con, + void *challenge_buf, + int challenge_buf_len); int (*verify_authorizer_reply) (struct ceph_connection *con); int (*invalidate_authorizer)(struct ceph_connection *con); diff --git a/include/linux/ceph/msgr.h b/include/linux/ceph/msgr.h index 73ae2a926548..9e50aede46c8 100644 --- a/include/linux/ceph/msgr.h +++ b/include/linux/ceph/msgr.h @@ -91,7 +91,7 @@ struct ceph_entity_inst { #define CEPH_MSGR_TAG_SEQ 13 /* 64-bit int follows with seen seq number */ #define CEPH_MSGR_TAG_KEEPALIVE2 14 /* keepalive2 byte + ceph_timespec */ #define CEPH_MSGR_TAG_KEEPALIVE2_ACK 15 /* keepalive2 reply */ - +#define CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER 16 /* cephx v2 doing server challenge */ /* * connection negotiation diff --git a/net/ceph/auth.c b/net/ceph/auth.c index dbde2b3c3c15..fbeee068ea14 100644 --- a/net/ceph/auth.c +++ b/net/ceph/auth.c @@ -315,6 +315,22 @@ int ceph_auth_update_authorizer(struct ceph_auth_client *ac, } EXPORT_SYMBOL(ceph_auth_update_authorizer); +int ceph_auth_add_authorizer_challenge(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + void *challenge_buf, + int challenge_buf_len) +{ + int ret = 0; + + mutex_lock(&ac->mutex); + if (ac->ops && ac->ops->add_authorizer_challenge) + ret = ac->ops->add_authorizer_challenge(ac, a, challenge_buf, + challenge_buf_len); + mutex_unlock(&ac->mutex); + return ret; +} +EXPORT_SYMBOL(ceph_auth_add_authorizer_challenge); + int ceph_auth_verify_authorizer_reply(struct ceph_auth_client *ac, struct ceph_authorizer *a) { diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 61cccb93f653..512eed4291fe 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -295,7 +295,8 @@ bad: * authorizer. The first part (ceph_x_authorize_a) should already be * encoded. */ -static int encrypt_authorizer(struct ceph_x_authorizer *au) +static int encrypt_authorizer(struct ceph_x_authorizer *au, + u64 *server_challenge) { struct ceph_x_authorize_a *msg_a; struct ceph_x_authorize_b *msg_b; @@ -308,16 +309,28 @@ static int encrypt_authorizer(struct ceph_x_authorizer *au) end = au->buf->vec.iov_base + au->buf->vec.iov_len; msg_b = p + ceph_x_encrypt_offset(); - msg_b->struct_v = 1; + msg_b->struct_v = 2; msg_b->nonce = cpu_to_le64(au->nonce); + if (server_challenge) { + msg_b->have_challenge = 1; + msg_b->server_challenge_plus_one = + cpu_to_le64(*server_challenge + 1); + } else { + msg_b->have_challenge = 0; + msg_b->server_challenge_plus_one = 0; + } ret = ceph_x_encrypt(&au->session_key, p, end - p, sizeof(*msg_b)); if (ret < 0) return ret; p += ret; - WARN_ON(p > end); - au->buf->vec.iov_len = p - au->buf->vec.iov_base; + if (server_challenge) { + WARN_ON(p != end); + } else { + WARN_ON(p > end); + au->buf->vec.iov_len = p - au->buf->vec.iov_base; + } return 0; } @@ -382,7 +395,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac, le64_to_cpu(msg_a->ticket_blob.secret_id)); get_random_bytes(&au->nonce, sizeof(au->nonce)); - ret = encrypt_authorizer(au); + ret = encrypt_authorizer(au, NULL); if (ret) { pr_err("failed to encrypt authorizer: %d", ret); goto out_au; @@ -664,6 +677,54 @@ static int ceph_x_update_authorizer( return 0; } +static int decrypt_authorize_challenge(struct ceph_x_authorizer *au, + void *challenge_buf, + int challenge_buf_len, + u64 *server_challenge) +{ + struct ceph_x_authorize_challenge *ch = + challenge_buf + sizeof(struct ceph_x_encrypt_header); + int ret; + + /* no leading len */ + ret = __ceph_x_decrypt(&au->session_key, challenge_buf, + challenge_buf_len); + if (ret < 0) + return ret; + if (ret < sizeof(*ch)) { + pr_err("bad size %d for ceph_x_authorize_challenge\n", ret); + return -EINVAL; + } + + *server_challenge = le64_to_cpu(ch->server_challenge); + return 0; +} + +static int ceph_x_add_authorizer_challenge(struct ceph_auth_client *ac, + struct ceph_authorizer *a, + void *challenge_buf, + int challenge_buf_len) +{ + struct ceph_x_authorizer *au = (void *)a; + u64 server_challenge; + int ret; + + ret = decrypt_authorize_challenge(au, challenge_buf, challenge_buf_len, + &server_challenge); + if (ret) { + pr_err("failed to decrypt authorize challenge: %d", ret); + return ret; + } + + ret = encrypt_authorizer(au, &server_challenge); + if (ret) { + pr_err("failed to encrypt authorizer w/ challenge: %d", ret); + return ret; + } + + return 0; +} + static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, struct ceph_authorizer *a) { @@ -816,6 +877,7 @@ static const struct ceph_auth_client_ops ceph_x_ops = { .handle_reply = ceph_x_handle_reply, .create_authorizer = ceph_x_create_authorizer, .update_authorizer = ceph_x_update_authorizer, + .add_authorizer_challenge = ceph_x_add_authorizer_challenge, .verify_authorizer_reply = ceph_x_verify_authorizer_reply, .invalidate_authorizer = ceph_x_invalidate_authorizer, .reset = ceph_x_reset, diff --git a/net/ceph/auth_x_protocol.h b/net/ceph/auth_x_protocol.h index 32c13d763b9a..24b0b74564d0 100644 --- a/net/ceph/auth_x_protocol.h +++ b/net/ceph/auth_x_protocol.h @@ -70,6 +70,13 @@ struct ceph_x_authorize_a { struct ceph_x_authorize_b { __u8 struct_v; __le64 nonce; + __u8 have_challenge; + __le64 server_challenge_plus_one; +} __attribute__ ((packed)); + +struct ceph_x_authorize_challenge { + __u8 struct_v; + __le64 server_challenge; } __attribute__ ((packed)); struct ceph_x_authorize_reply { diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 500cc3da586f..e915c8bce117 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2080,9 +2080,24 @@ static int process_connect(struct ceph_connection *con) if (con->auth) { /* * Any connection that defines ->get_authorizer() - * should also define ->verify_authorizer_reply(). + * should also define ->add_authorizer_challenge() and + * ->verify_authorizer_reply(). + * * See get_connect_authorizer(). */ + if (con->in_reply.tag == CEPH_MSGR_TAG_CHALLENGE_AUTHORIZER) { + ret = con->ops->add_authorizer_challenge( + con, con->auth->authorizer_reply_buf, + le32_to_cpu(con->in_reply.authorizer_len)); + if (ret < 0) + return ret; + + con_out_kvec_reset(con); + __prepare_write_connect(con); + prepare_read_connect(con); + return 0; + } + ret = con->ops->verify_authorizer_reply(con); if (ret < 0) { con->error_msg = "bad authorize reply"; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 8002b8e9ce24..60934bd8796c 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -5393,6 +5393,16 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, return auth; } +static int add_authorizer_challenge(struct ceph_connection *con, + void *challenge_buf, int challenge_buf_len) +{ + struct ceph_osd *o = con->private; + struct ceph_osd_client *osdc = o->o_osdc; + struct ceph_auth_client *ac = osdc->client->monc.auth; + + return ceph_auth_add_authorizer_challenge(ac, o->o_auth.authorizer, + challenge_buf, challenge_buf_len); +} static int verify_authorizer_reply(struct ceph_connection *con) { @@ -5442,6 +5452,7 @@ static const struct ceph_connection_operations osd_con_ops = { .put = put_osd_con, .dispatch = dispatch, .get_authorizer = get_authorizer, + .add_authorizer_challenge = add_authorizer_challenge, .verify_authorizer_reply = verify_authorizer_reply, .invalidate_authorizer = invalidate_authorizer, .alloc_msg = alloc_msg, From cc255c76c70f7a87d97939621eae04b600d9f4a1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 27 Jul 2018 19:25:32 +0200 Subject: [PATCH 26/40] libceph: implement CEPHX_V2 calculation mode Derive the signature from the entire buffer (both AES cipher blocks) instead of using just the first half of the first block, leaving out data_crc entirely. This addresses CVE-2018-1129. Link: http://tracker.ceph.com/issues/24837 Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- include/linux/ceph/ceph_features.h | 7 +-- net/ceph/auth_x.c | 73 +++++++++++++++++++++++------- 2 files changed, 60 insertions(+), 20 deletions(-) diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 3901927cf6a0..6b92b3395fa9 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -165,9 +165,9 @@ DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap DEFINE_CEPH_FEATURE(59, 1, FS_BTIME) DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap -DEFINE_CEPH_FEATURE(60, 1, BLKIN_TRACING) // *do not share this bit* +DEFINE_CEPH_FEATURE(60, 1, OSD_RECOVERY_DELETES) // *do not share this bit* +DEFINE_CEPH_FEATURE(61, 1, CEPHX_V2) // *do not share this bit* -DEFINE_CEPH_FEATURE(61, 1, RESERVED2) // unused, but slow down! DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinal DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing @@ -210,7 +210,8 @@ DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facin CEPH_FEATURE_SERVER_JEWEL | \ CEPH_FEATURE_MON_STATEFUL_SUB | \ CEPH_FEATURE_CRUSH_TUNABLES5 | \ - CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING) + CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING | \ + CEPH_FEATURE_CEPHX_V2) #define CEPH_FEATURES_REQUIRED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 512eed4291fe..462786f571e7 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -803,26 +804,64 @@ static int calc_signature(struct ceph_x_authorizer *au, struct ceph_msg *msg, __le64 *psig) { void *enc_buf = au->enc_buf; - struct { - __le32 len; - __le32 header_crc; - __le32 front_crc; - __le32 middle_crc; - __le32 data_crc; - } __packed *sigblock = enc_buf + ceph_x_encrypt_offset(); int ret; - sigblock->len = cpu_to_le32(4*sizeof(u32)); - sigblock->header_crc = msg->hdr.crc; - sigblock->front_crc = msg->footer.front_crc; - sigblock->middle_crc = msg->footer.middle_crc; - sigblock->data_crc = msg->footer.data_crc; - ret = ceph_x_encrypt(&au->session_key, enc_buf, CEPHX_AU_ENC_BUF_LEN, - sizeof(*sigblock)); - if (ret < 0) - return ret; + if (!CEPH_HAVE_FEATURE(msg->con->peer_features, CEPHX_V2)) { + struct { + __le32 len; + __le32 header_crc; + __le32 front_crc; + __le32 middle_crc; + __le32 data_crc; + } __packed *sigblock = enc_buf + ceph_x_encrypt_offset(); + + sigblock->len = cpu_to_le32(4*sizeof(u32)); + sigblock->header_crc = msg->hdr.crc; + sigblock->front_crc = msg->footer.front_crc; + sigblock->middle_crc = msg->footer.middle_crc; + sigblock->data_crc = msg->footer.data_crc; + + ret = ceph_x_encrypt(&au->session_key, enc_buf, + CEPHX_AU_ENC_BUF_LEN, sizeof(*sigblock)); + if (ret < 0) + return ret; + + *psig = *(__le64 *)(enc_buf + sizeof(u32)); + } else { + struct { + __le32 header_crc; + __le32 front_crc; + __le32 front_len; + __le32 middle_crc; + __le32 middle_len; + __le32 data_crc; + __le32 data_len; + __le32 seq_lower_word; + } __packed *sigblock = enc_buf; + struct { + __le64 a, b, c, d; + } __packed *penc = enc_buf; + int ciphertext_len; + + sigblock->header_crc = msg->hdr.crc; + sigblock->front_crc = msg->footer.front_crc; + sigblock->front_len = msg->hdr.front_len; + sigblock->middle_crc = msg->footer.middle_crc; + sigblock->middle_len = msg->hdr.middle_len; + sigblock->data_crc = msg->footer.data_crc; + sigblock->data_len = msg->hdr.data_len; + sigblock->seq_lower_word = *(__le32 *)&msg->hdr.seq; + + /* no leading len, no ceph_x_encrypt_header */ + ret = ceph_crypt(&au->session_key, true, enc_buf, + CEPHX_AU_ENC_BUF_LEN, sizeof(*sigblock), + &ciphertext_len); + if (ret) + return ret; + + *psig = penc->a ^ penc->b ^ penc->c ^ penc->d; + } - *psig = *(__le64 *)(enc_buf + sizeof(u32)); return 0; } From 130f52f2b203aa0aec179341916ffb2e905f3afd Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 27 Jul 2018 19:40:30 +0200 Subject: [PATCH 27/40] libceph: check authorizer reply/challenge length before reading Avoid scribbling over memory if the received reply/challenge is larger than the buffer supplied with the authorizer. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/messenger.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index e915c8bce117..0a187196aeed 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1782,6 +1782,13 @@ static int read_partial_connect(struct ceph_connection *con) if (con->auth) { size = le32_to_cpu(con->in_reply.authorizer_len); + if (size > con->auth->authorizer_reply_buf_len) { + pr_err("authorizer reply too big: %d > %zu\n", size, + con->auth->authorizer_reply_buf_len); + ret = -EINVAL; + goto out; + } + end += size; ret = read_partial(con, end, size, con->auth->authorizer_reply_buf); From f1d10e04637924f2b00a0fecdd2ca4565f5cfc3f Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 27 Jul 2018 19:45:36 +0200 Subject: [PATCH 28/40] libceph: weaken sizeof check in ceph_x_verify_authorizer_reply() Allow for extending ceph_x_authorize_reply in the future. Signed-off-by: Ilya Dryomov Reviewed-by: Sage Weil --- net/ceph/auth_x.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 462786f571e7..b52732337ca6 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -737,8 +737,10 @@ static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac, ret = ceph_x_decrypt(&au->session_key, &p, p + CEPHX_AU_ENC_BUF_LEN); if (ret < 0) return ret; - if (ret != sizeof(*reply)) - return -EPERM; + if (ret < sizeof(*reply)) { + pr_err("bad size %d for ceph_x_authorize_reply\n", ret); + return -EINVAL; + } if (au->nonce + 1 != le64_to_cpu(reply->nonce_plus_one)) ret = -EPERM; From 719784ba706cdbb47ef87483950f0a4594d36e87 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Thu, 19 Jul 2018 22:15:24 +0800 Subject: [PATCH 29/40] ceph: add new field max_file_size in ceph_fs_client In order to not bother to VFS and other specific filesystems, we decided to do offset validation inside ceph kernel client, so just simply set sb->s_maxbytes to MAX_LFS_FILESIZE so that it can successfully pass VFS check. We add new field max_file_size in ceph_fs_client to store real file size limit and doing proper check based on it. Signed-off-by: Chengguang Xu Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 3 ++- fs/ceph/super.c | 3 ++- fs/ceph/super.h | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4fc7582233db..dfbe138472e6 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -4018,7 +4018,8 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) } else { mdsc->mdsmap = newmap; /* first mds map */ } - mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; + mdsc->fsc->max_file_size = min((loff_t)mdsc->mdsmap->m_max_file_size, + MAX_LFS_FILESIZE); __wake_requests(mdsc, &mdsc->waiting_for_map); ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP, diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 3d8a26b2944f..43ca3b763875 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -940,11 +940,12 @@ static int ceph_set_super(struct super_block *s, void *data) dout("set_super %p data %p\n", s, data); s->s_flags = fsc->mount_options->sb_flags; - s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */ + s->s_maxbytes = MAX_LFS_FILESIZE; s->s_xattr = ceph_xattr_handlers; s->s_fs_info = fsc; fsc->sb = s; + fsc->max_file_size = 1ULL << 40; /* temp value until we get mdsmap */ s->s_op = &ceph_super_ops; s->s_d_op = &ceph_dentry_ops; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index d3eb70e2b527..f09dbf2a2e26 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -98,6 +98,7 @@ struct ceph_fs_client { unsigned long mount_state; int min_caps; /* min caps i added */ + loff_t max_file_size; struct ceph_mds_client *mdsc; From 0671e9968dfb3f99a366df816c361b8e871dba1b Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Thu, 19 Jul 2018 22:15:25 +0800 Subject: [PATCH 30/40] ceph: add additional range check in ceph_fallocate() If the range is larger than both real file size and limit of max file size, then return -EFBIG. Signed-off-by: Chengguang Xu Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 2f3a30ca94bf..2c54d2674db6 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1728,8 +1728,7 @@ static long ceph_fallocate(struct file *file, int mode, struct ceph_file_info *fi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = - &ceph_inode_to_client(inode)->client->osdc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_cap_flush *prealloc_cf; int want, got = 0; int dirty; @@ -1737,6 +1736,9 @@ static long ceph_fallocate(struct file *file, int mode, loff_t endoff = 0; loff_t size; + if ((offset + length) > max(i_size_read(inode), fsc->max_file_size)) + return -EFBIG; + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; @@ -1760,7 +1762,7 @@ static long ceph_fallocate(struct file *file, int mode, goto unlock; } - if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && + if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL) && !(mode & FALLOC_FL_PUNCH_HOLE)) { ret = -ENOSPC; goto unlock; From 8687a3e2c7a026c173ac2e0d65d869c98c154a3a Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Thu, 19 Jul 2018 22:15:26 +0800 Subject: [PATCH 31/40] ceph: add additional offset check in ceph_write_iter() If the offset is larger or equal to both real file size and max file size, then return -EFBIG. Signed-off-by: Chengguang Xu Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 2c54d2674db6..b9cd9d271dcb 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1384,12 +1384,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) struct ceph_file_info *fi = file->private_data; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_osd_client *osdc = - &ceph_sb_to_client(inode->i_sb)->client->osdc; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_cap_flush *prealloc_cf; ssize_t count, written = 0; int err, want, got; loff_t pos; + loff_t limit = max(i_size_read(inode), fsc->max_file_size); if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; @@ -1415,6 +1415,13 @@ retry_snap: goto out; pos = iocb->ki_pos; + if (unlikely(pos >= limit)) { + err = -EFBIG; + goto out; + } else { + iov_iter_truncate(from, limit - pos); + } + count = iov_iter_count(from); if (ceph_quota_is_max_bytes_exceeded(inode, pos + count)) { err = -EDQUOT; @@ -1436,7 +1443,7 @@ retry_snap: } /* FIXME: not complete since it doesn't account for being at quota */ - if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) { + if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL)) { err = -ENOSPC; goto out; } @@ -1526,7 +1533,7 @@ retry_snap: } if (written >= 0) { - if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_NEARFULL)) + if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_NEARFULL)) iocb->ki_flags |= IOCB_DSYNC; written = generic_write_sync(iocb, written); } From 36a4c72d1c6f5f50d4db14a38f296855ae82571b Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Sun, 5 Aug 2018 19:33:01 +0800 Subject: [PATCH 32/40] ceph: add additional size check in ceph_setattr() ceph_setattr() finally calls vfs function inode_newsize_ok() to do offset validation and that is based on sb->s_maxbytes. Because we set sb->s_maxbytes to MAX_LFS_FILESIZE to through VFS check and do proper offset validation in cephfs level, we need adding proper offset validation before calling inode_newsize_ok(). Signed-off-by: Chengguang Xu Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d62f65f2875d..ebc7bdaed2d0 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2157,6 +2157,7 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) int ceph_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); int err; if (ceph_snap(inode) != CEPH_NOSNAP) @@ -2166,6 +2167,10 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (err != 0) return err; + if ((attr->ia_valid & ATTR_SIZE) && + attr->ia_size > max(inode->i_size, fsc->max_file_size)) + return -EFBIG; + if ((attr->ia_valid & ATTR_SIZE) && ceph_quota_is_max_bytes_exceeded(inode, attr->ia_size)) return -EDQUOT; From 9da12e3a7de1665a14063653b60e872da0ee7810 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Thu, 19 Jul 2018 22:15:27 +0800 Subject: [PATCH 33/40] ceph: compare fsc->max_file_size and inode->i_size for max file size limit In ceph_llseek(), we compare fsc->max_file_size and inode->i_size to choose max file size limit. Signed-off-by: Chengguang Xu Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index b9cd9d271dcb..141a64c1f391 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -1554,6 +1554,7 @@ out_unlocked: static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); loff_t i_size; loff_t ret; @@ -1598,7 +1599,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) break; } - ret = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); + ret = vfs_setpos(file, offset, max(i_size, fsc->max_file_size)); out: inode_unlock(inode); From d5548492902967dd088bd0a21df7d047df10f9f6 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Sat, 28 Jul 2018 16:30:48 +0800 Subject: [PATCH 34/40] ceph: change to void return type for __do_request() We do not check return code for __do_request() in all callers, so change to void return type. Signed-off-by: Chengguang Xu Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index dfbe138472e6..8dbfca01c7ea 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2227,7 +2227,7 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc, /* * send request, or put it on the appropriate wait list. */ -static int __do_request(struct ceph_mds_client *mdsc, +static void __do_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { struct ceph_mds_session *session = NULL; @@ -2237,7 +2237,7 @@ static int __do_request(struct ceph_mds_client *mdsc, if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) { if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) __unregister_request(mdsc, req); - goto out; + return; } if (req->r_timeout && @@ -2260,7 +2260,7 @@ static int __do_request(struct ceph_mds_client *mdsc, if (mdsc->mdsmap->m_epoch == 0) { dout("do_request no mdsmap, waiting for map\n"); list_add(&req->r_wait, &mdsc->waiting_for_map); - goto finish; + return; } if (!(mdsc->fsc->mount_options->flags & CEPH_MOUNT_OPT_MOUNTWAIT) && @@ -2278,7 +2278,7 @@ static int __do_request(struct ceph_mds_client *mdsc, ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { dout("do_request no mds or not active, waiting for map\n"); list_add(&req->r_wait, &mdsc->waiting_for_map); - goto out; + return; } /* get, open session */ @@ -2328,8 +2328,7 @@ finish: complete_request(mdsc, req); __unregister_request(mdsc, req); } -out: - return err; + return; } /* From 7bf8f736c8e0f2e854d41838eed12e317fb29963 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Sat, 28 Jul 2018 23:15:35 +0800 Subject: [PATCH 35/40] ceph: refactor ceph_unreserve_caps() The code of ceph_unreserve_caps() and error handling in ceph_reserve_caps() are duplicated, so introduce a helper __ceph_unreserve_caps() to reduce duplicated code. Signed-off-by: Chengguang Xu Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 66 +++++++++++++++++++++++++++---------------------- fs/ceph/super.h | 2 +- 2 files changed, 37 insertions(+), 31 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index f50cc008632a..fb4a19ee5b6d 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -156,6 +156,37 @@ void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta) spin_unlock(&mdsc->caps_list_lock); } +static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps) +{ + struct ceph_cap *cap; + int i; + + if (nr_caps) { + BUG_ON(mdsc->caps_reserve_count < nr_caps); + mdsc->caps_reserve_count -= nr_caps; + if (mdsc->caps_avail_count >= + mdsc->caps_reserve_count + mdsc->caps_min_count) { + mdsc->caps_total_count -= nr_caps; + for (i = 0; i < nr_caps; i++) { + cap = list_first_entry(&mdsc->caps_list, + struct ceph_cap, caps_item); + list_del(&cap->caps_item); + kmem_cache_free(ceph_cap_cachep, cap); + } + } else { + mdsc->caps_avail_count += nr_caps; + } + + dout("%s: caps %d = %d used + %d resv + %d avail\n", + __func__, + mdsc->caps_total_count, mdsc->caps_use_count, + mdsc->caps_reserve_count, mdsc->caps_avail_count); + BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + + mdsc->caps_reserve_count + + mdsc->caps_avail_count); + } +} + /* * Called under mdsc->mutex. */ @@ -283,39 +314,14 @@ out_nomem: return -ENOMEM; } -int ceph_unreserve_caps(struct ceph_mds_client *mdsc, +void ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx) { - int i; - struct ceph_cap *cap; - dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count); - if (ctx->count) { - spin_lock(&mdsc->caps_list_lock); - BUG_ON(mdsc->caps_reserve_count < ctx->count); - mdsc->caps_reserve_count -= ctx->count; - if (mdsc->caps_avail_count >= - mdsc->caps_reserve_count + mdsc->caps_min_count) { - mdsc->caps_total_count -= ctx->count; - for (i = 0; i < ctx->count; i++) { - cap = list_first_entry(&mdsc->caps_list, - struct ceph_cap, caps_item); - list_del(&cap->caps_item); - kmem_cache_free(ceph_cap_cachep, cap); - } - } else { - mdsc->caps_avail_count += ctx->count; - } - ctx->count = 0; - dout("unreserve caps %d = %d used + %d resv + %d avail\n", - mdsc->caps_total_count, mdsc->caps_use_count, - mdsc->caps_reserve_count, mdsc->caps_avail_count); - BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + - mdsc->caps_reserve_count + - mdsc->caps_avail_count); - spin_unlock(&mdsc->caps_list_lock); - } - return 0; + spin_lock(&mdsc->caps_list_lock); + __ceph_unreserve_caps(mdsc, ctx->count); + ctx->count = 0; + spin_unlock(&mdsc->caps_list_lock); } struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index f09dbf2a2e26..9b5f15d20b33 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -656,7 +656,7 @@ extern void ceph_caps_finalize(struct ceph_mds_client *mdsc); extern void ceph_adjust_min_caps(struct ceph_mds_client *mdsc, int delta); extern int ceph_reserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx, int need); -extern int ceph_unreserve_caps(struct ceph_mds_client *mdsc, +extern void ceph_unreserve_caps(struct ceph_mds_client *mdsc, struct ceph_cap_reservation *ctx); extern void ceph_reservation_status(struct ceph_fs_client *client, int *total, int *avail, int *used, From e5bc08d09f5fac5cd2901f34b510e67c524894ab Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Sat, 28 Jul 2018 23:15:37 +0800 Subject: [PATCH 36/40] ceph: refactor error handling code in ceph_reserve_caps() Call new helper __ceph_unreserve_caps() to reduce duplicated code. Signed-off-by: Chengguang Xu Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 46 +++++++++++++--------------------------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index fb4a19ee5b6d..dd7dfdd2ba13 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -198,6 +198,7 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, int have; int alloc = 0; int max_caps; + int err = 0; bool trimmed = false; struct ceph_mds_session *s; LIST_HEAD(newcaps); @@ -264,9 +265,14 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n", ctx, need, have + alloc); - goto out_nomem; + err = -ENOMEM; + break; + } + + if (!err) { + BUG_ON(have + alloc != need); + ctx->count = need; } - BUG_ON(have + alloc != need); spin_lock(&mdsc->caps_list_lock); mdsc->caps_total_count += alloc; @@ -276,42 +282,16 @@ int ceph_reserve_caps(struct ceph_mds_client *mdsc, BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + mdsc->caps_reserve_count + mdsc->caps_avail_count); + + if (err) + __ceph_unreserve_caps(mdsc, have + alloc); + spin_unlock(&mdsc->caps_list_lock); - ctx->count = need; dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n", ctx, mdsc->caps_total_count, mdsc->caps_use_count, mdsc->caps_reserve_count, mdsc->caps_avail_count); - return 0; - -out_nomem: - - spin_lock(&mdsc->caps_list_lock); - mdsc->caps_avail_count += have; - mdsc->caps_reserve_count -= have; - - while (!list_empty(&newcaps)) { - cap = list_first_entry(&newcaps, - struct ceph_cap, caps_item); - list_del(&cap->caps_item); - - /* Keep some preallocated caps around (ceph_min_count), to - * avoid lots of free/alloc churn. */ - if (mdsc->caps_avail_count >= - mdsc->caps_reserve_count + mdsc->caps_min_count) { - kmem_cache_free(ceph_cap_cachep, cap); - } else { - mdsc->caps_avail_count++; - mdsc->caps_total_count++; - list_add(&cap->caps_item, &mdsc->caps_list); - } - } - - BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count + - mdsc->caps_reserve_count + - mdsc->caps_avail_count); - spin_unlock(&mdsc->caps_list_lock); - return -ENOMEM; + return err; } void ceph_unreserve_caps(struct ceph_mds_client *mdsc, From bad87216fb8426302cd53a5fcfdd45c3a5d54383 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Tue, 7 Aug 2018 21:02:34 +0800 Subject: [PATCH 37/40] libceph: remove unnecessary non NULL check for request_key request_key never return NULL,so no need do non-NULL check. Signed-off-by: YueHaibing Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/ceph_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 6ebd7d953ee0..87afb9ec4c68 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -304,7 +304,7 @@ static int get_secret(struct ceph_crypto_key *dst, const char *name) { struct ceph_crypto_key *ckey; ukey = request_key(&key_type_ceph, name, NULL); - if (!ukey || IS_ERR(ukey)) { + if (IS_ERR(ukey)) { /* request_key errors don't map nicely to mount(2) errors; don't even try, but still printk */ key_err = PTR_ERR(ukey); From 4de17aea5ceffe6d8350fde9d903a02037b221c6 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 8 Aug 2018 19:52:57 +0800 Subject: [PATCH 38/40] crush: fix using plain integer as NULL warning Fixes the following sparse warnings: net/ceph/crush/mapper.c:517:76: warning: Using plain integer as NULL pointer net/ceph/crush/mapper.c:728:68: warning: Using plain integer as NULL pointer Signed-off-by: YueHaibing Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/crush/mapper.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c index 417df675c71b..3f323ed9df52 100644 --- a/net/ceph/crush/mapper.c +++ b/net/ceph/crush/mapper.c @@ -514,7 +514,7 @@ static int crush_choose_firstn(const struct crush_map *map, in, work->work[-1-in->id], x, r, (choose_args ? - &choose_args[-1-in->id] : 0), + &choose_args[-1-in->id] : NULL), outpos); if (item >= map->max_devices) { dprintk(" bad item %d\n", item); @@ -725,7 +725,7 @@ static void crush_choose_indep(const struct crush_map *map, in, work->work[-1-in->id], x, r, (choose_args ? - &choose_args[-1-in->id] : 0), + &choose_args[-1-in->id] : NULL), outpos); if (item >= map->max_devices) { dprintk(" bad item %d\n", item); From 342ce1823ebaec573ac269b56bca78c698fec5c3 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 11 May 2018 18:47:29 +0800 Subject: [PATCH 39/40] ceph: support cephfs' own feature bits Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 48 +++++++++++++++++++++++++++++++++++--------- fs/ceph/mds_client.h | 12 +++++++++++ 2 files changed, 51 insertions(+), 9 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 8dbfca01c7ea..5b767cf1f780 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -902,6 +902,27 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) return msg; } +static void encode_supported_features(void **p, void *end) +{ + static const unsigned char bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED; + static const size_t count = ARRAY_SIZE(bits); + + if (count > 0) { + size_t i; + size_t size = ((size_t)bits[count - 1] + 64) / 64 * 8; + + BUG_ON(*p + 4 + size > end); + ceph_encode_32(p, size); + memset(*p, 0, size); + for (i = 0; i < count; i++) + ((unsigned char*)(*p))[i / 8] |= 1 << (bits[i] % 8); + *p += size; + } else { + BUG_ON(*p + 4 > end); + ceph_encode_32(p, 0); + } +} + /* * session message, specialization for CEPH_SESSION_REQUEST_OPEN * to include additional client metadata fields. @@ -911,11 +932,11 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 struct ceph_msg *msg; struct ceph_mds_session_head *h; int i = -1; - int metadata_bytes = 0; + int extra_bytes = 0; int metadata_key_count = 0; struct ceph_options *opt = mdsc->fsc->client->options; struct ceph_mount_options *fsopt = mdsc->fsc->mount_options; - void *p; + void *p, *end; const char* metadata[][2] = { {"hostname", mdsc->nodename}, @@ -926,21 +947,26 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 }; /* Calculate serialized length of metadata */ - metadata_bytes = 4; /* map length */ + extra_bytes = 4; /* map length */ for (i = 0; metadata[i][0]; ++i) { - metadata_bytes += 8 + strlen(metadata[i][0]) + + extra_bytes += 8 + strlen(metadata[i][0]) + strlen(metadata[i][1]); metadata_key_count++; } + /* supported feature */ + extra_bytes += 4 + 8; /* Allocate the message */ - msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + metadata_bytes, + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes, GFP_NOFS, false); if (!msg) { pr_err("create_session_msg ENOMEM creating msg\n"); return NULL; } - h = msg->front.iov_base; + p = msg->front.iov_base; + end = p + msg->front.iov_len; + + h = p; h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN); h->seq = cpu_to_le64(seq); @@ -950,11 +976,11 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 * * ClientSession messages with metadata are v2 */ - msg->hdr.version = cpu_to_le16(2); + msg->hdr.version = cpu_to_le16(3); msg->hdr.compat_version = cpu_to_le16(1); /* The write pointer, following the session_head structure */ - p = msg->front.iov_base + sizeof(*h); + p += sizeof(*h); /* Number of entries in the map */ ceph_encode_32(&p, metadata_key_count); @@ -972,6 +998,10 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6 p += val_len; } + encode_supported_features(&p, end); + msg->front.iov_len = p - msg->front.iov_base; + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + return msg; } @@ -2749,7 +2779,7 @@ static void handle_session(struct ceph_mds_session *session, int wake = 0; /* decode */ - if (msg->front.iov_len != sizeof(*h)) + if (msg->front.iov_len < sizeof(*h)) goto bad; op = le32_to_cpu(h->op); seq = le64_to_cpu(h->seq); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 80a7523e1523..32fcce0d4d3c 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -16,6 +16,18 @@ #include #include +/* The first 8 bits are reserved for old ceph releases */ +#define CEPHFS_FEATURE_MIMIC 8 + +#define CEPHFS_FEATURES_ALL { \ + 0, 1, 2, 3, 4, 5, 6, 7, \ + CEPHFS_FEATURE_MIMIC, \ +} + +#define CEPHFS_FEATURES_CLIENT_SUPPORTED CEPHFS_FEATURES_ALL +#define CEPHFS_FEATURES_CLIENT_REQUIRED {} + + /* * Some lock dependencies: * From 0fcf6c02b205f80f24eb548b236543ec151cb01c Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 3 Aug 2018 16:24:49 +0800 Subject: [PATCH 40/40] ceph: don't drop message if it contains more data than expected Later version mds may encode more data into messages. Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 6 +++--- fs/ceph/quota.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 5b767cf1f780..bc43c822426a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3406,10 +3406,10 @@ static void handle_lease(struct ceph_mds_client *mdsc, vino.ino = le64_to_cpu(h->ino); vino.snap = CEPH_NOSNAP; seq = le32_to_cpu(h->seq); - dname.name = (void *)h + sizeof(*h) + sizeof(u32); - dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); - if (dname.len != get_unaligned_le32(h+1)) + dname.len = get_unaligned_le32(h + 1); + if (msg->front.iov_len < sizeof(*h) + sizeof(u32) + dname.len) goto bad; + dname.name = (void *)(h + 1) + sizeof(u32); /* lookup inode */ inode = ceph_find_inode(sb, vino); diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 242bfa5c0539..32d4f13784ba 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -48,7 +48,7 @@ void ceph_handle_quota(struct ceph_mds_client *mdsc, struct inode *inode; struct ceph_inode_info *ci; - if (msg->front.iov_len != sizeof(*h)) { + if (msg->front.iov_len < sizeof(*h)) { pr_err("%s corrupt message mds%d len %d\n", __func__, session->s_mds, (int)msg->front.iov_len); ceph_msg_dump(msg);