From 1535212c542285e430d44a75bfc0a99df610f598 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Mon, 2 May 2016 11:33:12 -0400 Subject: [PATCH] md-cluster: fix locking when node joins cluster during message broadcast If a node joins the cluster while a message broadcast is under way, a lock issue could happen as follows. For a cluster which included two nodes, if node A is calling __sendmsg before up-convert CR to EX on ack, and node B released CR on ack. But if a new node C joins the cluster and it doesn't receive the message which A sent before, so it could hold CR on ack before A up-convert CR to EX on ack. So a node joining the cluster should get an EX lock on the "token" first to ensure no broadcast is ongoing, then release it after held CR on ack. Reviewed-by: NeilBrown Signed-off-by: Guoqing Jiang Signed-off-by: Shaohua Li --- drivers/md/md-cluster.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 76f88f731aa1..30f1160142c1 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -781,17 +781,24 @@ static int join(struct mddev *mddev, int nodes) cinfo->token_lockres = lockres_init(mddev, "token", NULL, 0); if (!cinfo->token_lockres) goto err; - cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); - if (!cinfo->ack_lockres) - goto err; cinfo->no_new_dev_lockres = lockres_init(mddev, "no-new-dev", NULL, 0); if (!cinfo->no_new_dev_lockres) goto err; + ret = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); + if (ret) { + ret = -EAGAIN; + pr_err("md-cluster: can't join cluster to avoid lock issue\n"); + goto err; + } + cinfo->ack_lockres = lockres_init(mddev, "ack", ack_bast, 0); + if (!cinfo->ack_lockres) + goto err; /* get sync CR lock on ACK. */ if (dlm_lock_sync(cinfo->ack_lockres, DLM_LOCK_CR)) pr_err("md-cluster: failed to get a sync CR lock on ACK!(%d)\n", ret); + dlm_unlock_sync(cinfo->token_lockres); /* get sync CR lock on no-new-dev. */ if (dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR)) pr_err("md-cluster: failed to get a sync CR lock on no-new-dev!(%d)\n", ret);