A fix for the recently discovered misdirected requests bug present in
jewel and later on the server side and all stable kernels, a fixup for
the -rc1 CRUSH changes and two usability enhancements: the
osd_request_timeout option and the supported_features bus attribute.
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v2
 
 iQEcBAABCAAGBQJYwsEIAAoJEEp/3jgCEfOL34sH+wbYyT6uXQ3hlIoRt2FQNh5b
 F6qmvH4jYRI+YyjJHgE7lLEv7cq/PESPej2hrw9U7GAso0KEsazOv+qpj4AcW+u1
 arXYTIQQa2w9sCuj7/BrbEzDtnNOVnGyD3Ng0wAfvbxg/37xzqumkbccuWJm6GdH
 Vjk31G4ZmaOOr38jeo0AkYWgs7kgfthLMFo73TgHTBBO9fkQQQL1xZH5D/Irzf8P
 1ytfVyGeTl8D3szdkkOnc4eUFMwJ35wqesL+gAsQntx1/wDnGqa2IabXRs4oqr8F
 oT88LXSP8w2PaFKI1FrwOuMov6ngg38tir2SMxGDIQ6TdxtK8lW37Cx3eHavqtE=
 =f4Bs
 -----END PGP SIGNATURE-----

Merge tag 'ceph-for-4.11-rc2' of git://github.com/ceph/ceph-client

Pull ceph fixes from Ilya Dryomov:

 - a fix for the recently discovered misdirected requests bug present in
   jewel and later on the server side and all stable kernels

 - a fixup for -rc1 CRUSH changes

 - two usability enhancements: osd_request_timeout option and
   supported_features bus attribute.

* tag 'ceph-for-4.11-rc2' of git://github.com/ceph/ceph-client:
  libceph: osd_request_timeout option
  rbd: supported_features bus attribute
  libceph: don't set weight to IN when OSD is destroyed
  libceph: fix crush_decode() for older maps
This commit is contained in:
Linus Torvalds 2017-03-10 11:05:47 -08:00
commit 24c534bb16
6 changed files with 66 additions and 8 deletions

View File

@ -120,10 +120,11 @@ static int atomic_dec_return_safe(atomic_t *v)
/* Feature bits */ /* Feature bits */
#define RBD_FEATURE_LAYERING (1<<0) #define RBD_FEATURE_LAYERING (1ULL<<0)
#define RBD_FEATURE_STRIPINGV2 (1<<1) #define RBD_FEATURE_STRIPINGV2 (1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK (1<<2) #define RBD_FEATURE_EXCLUSIVE_LOCK (1ULL<<2)
#define RBD_FEATURE_DATA_POOL (1<<7) #define RBD_FEATURE_DATA_POOL (1ULL<<7)
#define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \ #define RBD_FEATURES_ALL (RBD_FEATURE_LAYERING | \
RBD_FEATURE_STRIPINGV2 | \ RBD_FEATURE_STRIPINGV2 | \
RBD_FEATURE_EXCLUSIVE_LOCK | \ RBD_FEATURE_EXCLUSIVE_LOCK | \
@ -499,16 +500,23 @@ static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
return is_lock_owner; return is_lock_owner;
} }
/*
 * "show" callback for the read-only supported_features bus attribute
 * (registered via BUS_ATTR(supported_features, S_IRUGO, ...)).
 *
 * Formats RBD_FEATURES_SUPPORTED into @buf as a "0x"-prefixed hexadecimal
 * number followed by a newline.
 *
 * @bus: unused here.
 * @buf: output buffer supplied by the caller.
 *
 * Returns whatever sprintf() returns, i.e. the number of characters
 * written to @buf.
 */
static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
{
/* %llx matches the feature bits, which are defined with 1ULL shifts. */
return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}
static BUS_ATTR(add, S_IWUSR, NULL, rbd_add); static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove); static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major); static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major); static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL);
static struct attribute *rbd_bus_attrs[] = { static struct attribute *rbd_bus_attrs[] = {
&bus_attr_add.attr, &bus_attr_add.attr,
&bus_attr_remove.attr, &bus_attr_remove.attr,
&bus_attr_add_single_major.attr, &bus_attr_add_single_major.attr,
&bus_attr_remove_single_major.attr, &bus_attr_remove_single_major.attr,
&bus_attr_supported_features.attr,
NULL, NULL,
}; };

View File

@ -48,6 +48,7 @@ struct ceph_options {
unsigned long mount_timeout; /* jiffies */ unsigned long mount_timeout; /* jiffies */
unsigned long osd_idle_ttl; /* jiffies */ unsigned long osd_idle_ttl; /* jiffies */
unsigned long osd_keepalive_timeout; /* jiffies */ unsigned long osd_keepalive_timeout; /* jiffies */
unsigned long osd_request_timeout; /* jiffies */
/* /*
* any type that can't be simply compared or doesn't need * any type that can't be simply compared or doesn't need
@ -68,6 +69,7 @@ struct ceph_options {
#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000) #define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT 0 /* no timeout */
#define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000) #define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000)
#define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000) #define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000)

View File

@ -189,6 +189,7 @@ struct ceph_osd_request {
/* internal */ /* internal */
unsigned long r_stamp; /* jiffies, send or check time */ unsigned long r_stamp; /* jiffies, send or check time */
unsigned long r_start_stamp; /* jiffies */
int r_attempts; int r_attempts;
struct ceph_eversion r_replay_version; /* aka reassert_version */ struct ceph_eversion r_replay_version; /* aka reassert_version */
u32 r_last_force_resend; u32 r_last_force_resend;

View File

@ -230,6 +230,7 @@ enum {
Opt_osdkeepalivetimeout, Opt_osdkeepalivetimeout,
Opt_mount_timeout, Opt_mount_timeout,
Opt_osd_idle_ttl, Opt_osd_idle_ttl,
Opt_osd_request_timeout,
Opt_last_int, Opt_last_int,
/* int args above */ /* int args above */
Opt_fsid, Opt_fsid,
@ -256,6 +257,7 @@ static match_table_t opt_tokens = {
{Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
{Opt_mount_timeout, "mount_timeout=%d"}, {Opt_mount_timeout, "mount_timeout=%d"},
{Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
{Opt_osd_request_timeout, "osd_request_timeout=%d"},
/* int args above */ /* int args above */
{Opt_fsid, "fsid=%s"}, {Opt_fsid, "fsid=%s"},
{Opt_name, "name=%s"}, {Opt_name, "name=%s"},
@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name,
opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
/* get mon ip(s) */ /* get mon ip(s) */
/* ip1[:port1][,ip2[:port2]...] */ /* ip1[:port1][,ip2[:port2]...] */
@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name,
} }
opt->mount_timeout = msecs_to_jiffies(intval * 1000); opt->mount_timeout = msecs_to_jiffies(intval * 1000);
break; break;
case Opt_osd_request_timeout:
/* 0 is "wait forever" (i.e. infinite timeout) */
if (intval < 0 || intval > INT_MAX / 1000) {
pr_err("osd_request_timeout out of range\n");
err = -EINVAL;
goto out;
}
opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
break;
case Opt_share: case Opt_share:
opt->flags &= ~CEPH_OPT_NOSHARE; opt->flags &= ~CEPH_OPT_NOSHARE;
@ -557,6 +569,9 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT) if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
seq_printf(m, "osdkeepalivetimeout=%d,", seq_printf(m, "osdkeepalivetimeout=%d,",
jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000); jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
if (opt->osd_request_timeout != CEPH_OSD_REQUEST_TIMEOUT_DEFAULT)
seq_printf(m, "osd_request_timeout=%d,",
jiffies_to_msecs(opt->osd_request_timeout) / 1000);
/* drop redundant comma */ /* drop redundant comma */
if (m->count != pos) if (m->count != pos)

View File

@ -1709,6 +1709,8 @@ static void account_request(struct ceph_osd_request *req)
req->r_flags |= CEPH_OSD_FLAG_ONDISK; req->r_flags |= CEPH_OSD_FLAG_ONDISK;
atomic_inc(&req->r_osdc->num_requests); atomic_inc(&req->r_osdc->num_requests);
req->r_start_stamp = jiffies;
} }
static void submit_request(struct ceph_osd_request *req, bool wrlocked) static void submit_request(struct ceph_osd_request *req, bool wrlocked)
@ -1789,6 +1791,14 @@ static void cancel_request(struct ceph_osd_request *req)
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
} }
/*
 * Abort @req with error code @err.
 *
 * Emits a debug line (function name, request pointer, tid and error),
 * calls cancel_map_check() on the request and then finishes it through
 * complete_request(req, err).
 *
 * NOTE(review): handle_timeout() fetches rb_next() before calling this
 * while walking an OSD's request tree, which suggests the request is
 * unlinked from that tree as part of completion -- confirm against
 * complete_request().
 */
static void abort_request(struct ceph_osd_request *req, int err)
{
dout("%s req %p tid %llu err %d\n", __func__, req, req->r_tid, err);
/* Cancel the map check first, then complete the request with err. */
cancel_map_check(req);
complete_request(req, err);
}
static void check_pool_dne(struct ceph_osd_request *req) static void check_pool_dne(struct ceph_osd_request *req)
{ {
struct ceph_osd_client *osdc = req->r_osdc; struct ceph_osd_client *osdc = req->r_osdc;
@ -2487,6 +2497,7 @@ static void handle_timeout(struct work_struct *work)
container_of(work, struct ceph_osd_client, timeout_work.work); container_of(work, struct ceph_osd_client, timeout_work.work);
struct ceph_options *opts = osdc->client->options; struct ceph_options *opts = osdc->client->options;
unsigned long cutoff = jiffies - opts->osd_keepalive_timeout; unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
unsigned long expiry_cutoff = jiffies - opts->osd_request_timeout;
LIST_HEAD(slow_osds); LIST_HEAD(slow_osds);
struct rb_node *n, *p; struct rb_node *n, *p;
@ -2502,15 +2513,23 @@ static void handle_timeout(struct work_struct *work)
struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
bool found = false; bool found = false;
for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) { for (p = rb_first(&osd->o_requests); p; ) {
struct ceph_osd_request *req = struct ceph_osd_request *req =
rb_entry(p, struct ceph_osd_request, r_node); rb_entry(p, struct ceph_osd_request, r_node);
p = rb_next(p); /* abort_request() */
if (time_before(req->r_stamp, cutoff)) { if (time_before(req->r_stamp, cutoff)) {
dout(" req %p tid %llu on osd%d is laggy\n", dout(" req %p tid %llu on osd%d is laggy\n",
req, req->r_tid, osd->o_osd); req, req->r_tid, osd->o_osd);
found = true; found = true;
} }
if (opts->osd_request_timeout &&
time_before(req->r_start_stamp, expiry_cutoff)) {
pr_err_ratelimited("tid %llu on osd%d timeout\n",
req->r_tid, osd->o_osd);
abort_request(req, -ETIMEDOUT);
}
} }
for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) { for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
struct ceph_osd_linger_request *lreq = struct ceph_osd_linger_request *lreq =
@ -2530,6 +2549,21 @@ static void handle_timeout(struct work_struct *work)
list_move_tail(&osd->o_keepalive_item, &slow_osds); list_move_tail(&osd->o_keepalive_item, &slow_osds);
} }
if (opts->osd_request_timeout) {
for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
struct ceph_osd_request *req =
rb_entry(p, struct ceph_osd_request, r_node);
p = rb_next(p); /* abort_request() */
if (time_before(req->r_start_stamp, expiry_cutoff)) {
pr_err_ratelimited("tid %llu on osd%d timeout\n",
req->r_tid, osdc->homeless_osd.o_osd);
abort_request(req, -ETIMEDOUT);
}
}
}
if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds)) if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
maybe_request_map(osdc); maybe_request_map(osdc);

View File

@ -390,9 +390,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
dout("crush decode tunable chooseleaf_stable = %d\n", dout("crush decode tunable chooseleaf_stable = %d\n",
c->chooseleaf_stable); c->chooseleaf_stable);
crush_finalize(c);
done: done:
crush_finalize(c);
dout("crush_decode success\n"); dout("crush_decode success\n");
return c; return c;
@ -1380,7 +1379,6 @@ static int decode_new_up_state_weight(void **p, void *end,
if ((map->osd_state[osd] & CEPH_OSD_EXISTS) && if ((map->osd_state[osd] & CEPH_OSD_EXISTS) &&
(xorstate & CEPH_OSD_EXISTS)) { (xorstate & CEPH_OSD_EXISTS)) {
pr_info("osd%d does not exist\n", osd); pr_info("osd%d does not exist\n", osd);
map->osd_weight[osd] = CEPH_OSD_IN;
ret = set_primary_affinity(map, osd, ret = set_primary_affinity(map, osd,
CEPH_OSD_DEFAULT_PRIMARY_AFFINITY); CEPH_OSD_DEFAULT_PRIMARY_AFFINITY);
if (ret) if (ret)