From efad7e6b1a28be599836c8f15ec04f99a98fb04c Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 14 Jun 2013 04:56:09 -0500 Subject: [PATCH 01/10] dlm: clear correct init bit during sctp setup We were clearing the base con's init pending flags, but the con for the node was the one with the pending bit set. Signed-off-by: Mike Christie Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d0ccd2fd79eb..efbe7af42002 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -664,7 +664,7 @@ static void process_sctp_notification(struct connection *con, /* Send any pending writes */ clear_bit(CF_CONNECT_PENDING, &new_con->flags); - clear_bit(CF_INIT_PENDING, &con->flags); + clear_bit(CF_INIT_PENDING, &new_con->flags); if (!test_and_set_bit(CF_WRITE_PENDING, &new_con->flags)) { queue_work(send_workqueue, &new_con->swork); } From e1631d0c48ca5ba9878f5923ffe58ef6fd2d5fda Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 14 Jun 2013 04:56:10 -0500 Subject: [PATCH 02/10] dlm: set sctp assoc id during setup sctp_assoc was not getting set so later lookups failed. Signed-off-by: Mike Christie Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index efbe7af42002..1536599fde8c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -662,6 +662,7 @@ static void process_sctp_notification(struct connection *con, log_print("connecting to %d sctp association %d", nodeid, (int)sn->sn_assoc_change.sac_assoc_id); + new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id; /* Send any pending writes */ clear_bit(CF_CONNECT_PENDING, &new_con->flags); clear_bit(CF_INIT_PENDING, &new_con->flags); From b390ca38d27bd3d2f409e64a6f13d6ff67eb4825 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 14 Jun 2013 04:56:11 -0500 Subject: [PATCH 03/10] dlm: clear correct bit during sctp init failure handling We should be testing and cleaing the init pending bit because later when sctp_init_assoc is recalled it will be checking that it is not set and set the bit. We do not want to touch CF_CONNECT_PENDING here because we will queue swork and process_send_sockets will then call the connect_action function. Signed-off-by: Mike Christie Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 1536599fde8c..87e68dd01479 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -562,7 +562,7 @@ static void sctp_send_shutdown(sctp_assoc_t associd) static void sctp_init_failed_foreach(struct connection *con) { con->sctp_assoc = 0; - if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { + if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) { if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) queue_work(send_workqueue, &con->swork); } From 98e1b60ecc441625c91013e88f14cbd1b3c1fa08 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 14 Jun 2013 04:56:12 -0500 Subject: [PATCH 04/10] dlm: try other IPs when sctp init assoc fails Currently, if we cannot create a association to the first IP addr that is added to DLM, the SCTP init assoc code will just retry the same IP. This patch adds a simple failover schemes where we will try one of the addresses that was passed into DLM. Signed-off-by: Mike Christie Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 61 ++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 87e68dd01479..56015c9e8d00 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -126,6 +126,7 @@ struct connection { struct connection *othercon; struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ + bool try_new_addr; }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -144,6 +145,7 @@ struct dlm_node_addr { struct list_head list; int nodeid; int addr_count; + int curr_addr_index; struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; }; @@ -310,7 +312,7 @@ static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y) } static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, - struct sockaddr *sa_out) + struct sockaddr *sa_out, bool try_new_addr) { struct sockaddr_storage sas; struct dlm_node_addr *na; @@ -320,8 +322,16 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, spin_lock(&dlm_node_addrs_spin); na = find_node_addr(nodeid); - if (na && na->addr_count) - memcpy(&sas, na->addr[0], sizeof(struct sockaddr_storage)); + if (na && na->addr_count) { + if (try_new_addr) { + na->curr_addr_index++; + if (na->curr_addr_index == na->addr_count) + na->curr_addr_index = 0; + } + + memcpy(&sas, na->addr[na->curr_addr_index ], + sizeof(struct sockaddr_storage)); + } spin_unlock(&dlm_node_addrs_spin); if (!na) @@ -353,19 +363,22 @@ static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) { struct dlm_node_addr *na; int rv = -EEXIST; + int addr_i; spin_lock(&dlm_node_addrs_spin); list_for_each_entry(na, &dlm_node_addrs, list) { if (!na->addr_count) continue; - if (!addr_compare(na->addr[0], addr)) - continue; - - *nodeid = na->nodeid; - rv = 0; - break; + for (addr_i = 0; addr_i < na->addr_count; addr_i++) { + if (addr_compare(na->addr[addr_i], addr)) { + *nodeid = na->nodeid; + rv = 0; + goto unlock; + } + } } +unlock: spin_unlock(&dlm_node_addrs_spin); return rv; } @@ -561,6 +574,21 @@ static void sctp_send_shutdown(sctp_assoc_t associd) static void sctp_init_failed_foreach(struct connection *con) { + + /* + * Don't try to recover base con and handle race where the + * other node's assoc init creates a assoc and we get that + * notification, then we get a notification that our attempt + * failed due. This happens when we are still trying the primary + * address, but the other node has already tried secondary addrs + * and found one that worked. + */ + if (!con->nodeid || con->sctp_assoc) + return; + + log_print("Retrying SCTP association init for node %d\n", con->nodeid); + + con->try_new_addr = true; con->sctp_assoc = 0; if (test_and_clear_bit(CF_INIT_PENDING, &con->flags)) { if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) @@ -663,6 +691,7 @@ static void process_sctp_notification(struct connection *con, nodeid, (int)sn->sn_assoc_change.sac_assoc_id); new_con->sctp_assoc = sn->sn_assoc_change.sac_assoc_id; + new_con->try_new_addr = false; /* Send any pending writes */ clear_bit(CF_CONNECT_PENDING, &new_con->flags); clear_bit(CF_INIT_PENDING, &new_con->flags); @@ -984,7 +1013,8 @@ static void sctp_init_assoc(struct connection *con) if (con->retries++ > MAX_CONNECT_RETRIES) return; - if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr)) { + if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr, + con->try_new_addr)) { log_print("no address for nodeid %d", con->nodeid); return; } @@ -1016,6 +1046,14 @@ static void sctp_init_assoc(struct connection *con) iov[0].iov_base = page_address(e->page)+offset; iov[0].iov_len = len; + if (rem_addr.ss_family == AF_INET) { + struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr; + log_print("Trying to connect to %pI4", &sin->sin_addr.s_addr); + } else { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)&rem_addr; + log_print("Trying to connect to %pI6", &sin6->sin6_addr); + } + cmsg = CMSG_FIRSTHDR(&outmessage); cmsg->cmsg_level = IPPROTO_SCTP; cmsg->cmsg_type = SCTP_SNDRCV; @@ -1024,6 +1062,7 @@ static void sctp_init_assoc(struct connection *con) memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid()); outmessage.msg_controllen = cmsg->cmsg_len; + sinfo->sinfo_flags |= SCTP_ADDR_OVER; ret = kernel_sendmsg(base_con->sock, &outmessage, iov, 1, len); if (ret < 0) { @@ -1076,7 +1115,7 @@ static void tcp_connect_to_sock(struct connection *con) goto out_err; memset(&saddr, 0, sizeof(saddr)); - result = nodeid_to_addr(con->nodeid, &saddr, NULL); + result = nodeid_to_addr(con->nodeid, &saddr, NULL, false); if (result < 0) { log_print("no address for nodeid %d", con->nodeid); goto out_err; From 5d6898714fe2ce485e95ac74479ed40ebd8d5748 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 14 Jun 2013 04:56:13 -0500 Subject: [PATCH 05/10] dlm: retry failed SCTP sends Currently if a SCTP send fails, we lose the data we were trying to send because the writequeue_entry is released when we do the send. When this happens other nodes will then hang waiting for a reply. This adds support for SCTP to retry the send operation. I also removed the retry limit for SCTP use, because we want to make sure we try every path during init time and for longer failures we want to continually retry in case paths come back up while trying other paths. We will do this until userspace tells us to stop. Signed-off-by: Mike Christie Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 104 +++++++++++++++++++++++++++++++++------------- 1 file changed, 75 insertions(+), 29 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 56015c9e8d00..a4fad32bb788 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -607,15 +607,56 @@ static void sctp_init_failed(void) mutex_unlock(&connections_lock); } +static void retry_failed_sctp_send(struct connection *recv_con, + struct sctp_send_failed *sn_send_failed, + char *buf) +{ + int len = sn_send_failed->ssf_length - sizeof(struct sctp_send_failed); + struct dlm_mhandle *mh; + struct connection *con; + char *retry_buf; + int nodeid = sn_send_failed->ssf_info.sinfo_ppid; + + log_print("Retry sending %d bytes to node id %d", len, nodeid); + + con = nodeid2con(nodeid, 0); + if (!con) { + log_print("Could not look up con for nodeid %d\n", + nodeid); + return; + } + + mh = dlm_lowcomms_get_buffer(nodeid, len, GFP_NOFS, &retry_buf); + if (!mh) { + log_print("Could not allocate buf for retry."); + return; + } + memcpy(retry_buf, buf + sizeof(struct sctp_send_failed), len); + dlm_lowcomms_commit_buffer(mh); + + /* + * If we got a assoc changed event before the send failed event then + * we only need to retry the send. + */ + if (con->sctp_assoc) { + if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) + queue_work(send_workqueue, &con->swork); + } else + sctp_init_failed_foreach(con); +} + /* Something happened to an association */ static void process_sctp_notification(struct connection *con, struct msghdr *msg, char *buf) { union sctp_notification *sn = (union sctp_notification *)buf; - if (sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) { + switch (sn->sn_header.sn_type) { + case SCTP_SEND_FAILED: + retry_failed_sctp_send(con, &sn->sn_send_failed, buf); + break; + case SCTP_ASSOC_CHANGE: switch (sn->sn_assoc_change.sac_state) { - case SCTP_COMM_UP: case SCTP_RESTART: { @@ -713,14 +754,10 @@ static void process_sctp_notification(struct connection *con, } break; - /* We don't know which INIT failed, so clear the PENDING flags - * on them all. if assoc_id is zero then it will then try - * again */ - case SCTP_CANT_STR_ASSOC: { + /* Will retry init when we get the send failed notification */ log_print("Can't start SCTP association - retrying"); - sctp_init_failed(); } break; @@ -729,6 +766,8 @@ static void process_sctp_notification(struct connection *con, (int)sn->sn_assoc_change.sac_assoc_id, sn->sn_assoc_change.sac_state); } + default: + ; /* fall through */ } } @@ -988,6 +1027,24 @@ static void free_entry(struct writequeue_entry *e) kfree(e); } +/* + * writequeue_entry_complete - try to delete and free write queue entry + * @e: write queue entry to try to delete + * @completed: bytes completed + * + * writequeue_lock must be held. + */ +static void writequeue_entry_complete(struct writequeue_entry *e, int completed) +{ + e->offset += completed; + e->len -= completed; + + if (e->len == 0 && e->users == 0) { + list_del(&e->list); + free_entry(e); + } +} + /* Initiate an SCTP association. This is a special case of send_to_sock() in that we don't yet have a peeled-off socket for this association, so we use the listening socket @@ -1007,16 +1064,14 @@ static void sctp_init_assoc(struct connection *con) int addrlen; struct kvec iov[1]; + mutex_lock(&con->sock_mutex); if (test_and_set_bit(CF_INIT_PENDING, &con->flags)) - return; - - if (con->retries++ > MAX_CONNECT_RETRIES) - return; + goto unlock; if (nodeid_to_addr(con->nodeid, NULL, (struct sockaddr *)&rem_addr, con->try_new_addr)) { log_print("no address for nodeid %d", con->nodeid); - return; + goto unlock; } base_con = nodeid2con(0, 0); BUG_ON(base_con == NULL); @@ -1034,17 +1089,17 @@ static void sctp_init_assoc(struct connection *con) if (list_empty(&con->writequeue)) { spin_unlock(&con->writequeue_lock); log_print("writequeue empty for nodeid %d", con->nodeid); - return; + goto unlock; } e = list_first_entry(&con->writequeue, struct writequeue_entry, list); len = e->len; offset = e->offset; - spin_unlock(&con->writequeue_lock); /* Send the first block off the write queue */ iov[0].iov_base = page_address(e->page)+offset; iov[0].iov_len = len; + spin_unlock(&con->writequeue_lock); if (rem_addr.ss_family == AF_INET) { struct sockaddr_in *sin = (struct sockaddr_in *)&rem_addr; @@ -1060,7 +1115,7 @@ static void sctp_init_assoc(struct connection *con) cmsg->cmsg_len = CMSG_LEN(sizeof(struct sctp_sndrcvinfo)); sinfo = CMSG_DATA(cmsg); memset(sinfo, 0x00, sizeof(struct sctp_sndrcvinfo)); - sinfo->sinfo_ppid = cpu_to_le32(dlm_our_nodeid()); + sinfo->sinfo_ppid = cpu_to_le32(con->nodeid); outmessage.msg_controllen = cmsg->cmsg_len; sinfo->sinfo_flags |= SCTP_ADDR_OVER; @@ -1075,15 +1130,12 @@ static void sctp_init_assoc(struct connection *con) } else { spin_lock(&con->writequeue_lock); - e->offset += ret; - e->len -= ret; - - if (e->len == 0 && e->users == 0) { - list_del(&e->list); - free_entry(e); - } + writequeue_entry_complete(e, ret); spin_unlock(&con->writequeue_lock); } + +unlock: + mutex_unlock(&con->sock_mutex); } /* Connect a new socket to its peer */ @@ -1533,13 +1585,7 @@ static void send_to_sock(struct connection *con) } spin_lock(&con->writequeue_lock); - e->offset += ret; - e->len -= ret; - - if (e->len == 0 && e->users == 0) { - list_del(&e->list); - free_entry(e); - } + writequeue_entry_complete(e, ret); } spin_unlock(&con->writequeue_lock); out: From 86e92ad299fb0be359efdd61812944497d4d8d52 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Fri, 14 Jun 2013 04:56:14 -0500 Subject: [PATCH 06/10] dlm: disable nagle for SCTP For TCP we disable Nagle and I cannot think of why it would be needed for SCTP. When disabled it seems to improve dlm_lock operations like it does for TCP. Signed-off-by: Mike Christie Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index a4fad32bb788..4f539dd9b1e9 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1346,6 +1346,7 @@ static int sctp_listen_for_all(void) int result = -EINVAL, num = 1, i, addr_len; struct connection *con = nodeid2con(0, GFP_NOFS); int bufsize = NEEDED_RMEM; + int one = 1; if (!con) return -ENOMEM; @@ -1380,6 +1381,11 @@ static int sctp_listen_for_all(void) goto create_delsock; } + result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, + sizeof(one)); + if (result < 0) + log_print("Could not set SCTP NODELAY error %d\n", result); + /* Init con struct */ sock->sk->sk_user_data = con; con->sock = sock; From 06452eb0538827d2158945d20e3d33e359884437 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 19 Jun 2013 10:49:50 +0800 Subject: [PATCH 07/10] dlm: remove duplicated include from lowcomms.c Remove duplicated include. Signed-off-by: Wei Yongjun Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 4f539dd9b1e9..d90909ec6aa6 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include From ad917e7f821855a2f223131bb6c90ca6c9240bf3 Mon Sep 17 00:00:00 2001 From: Zhao Hongjiang Date: Thu, 20 Jun 2013 18:59:51 +0800 Subject: [PATCH 08/10] dlm: config: using strlcpy instead of strncpy for NUL terminated string, need alway set '\0' in the end. Signed-off-by: Zhao Hongjiang Signed-off-by: David Teigland --- fs/dlm/config.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 7d58d5b112b5..76feb4b60fa6 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -138,8 +138,9 @@ static ssize_t cluster_cluster_name_read(struct dlm_cluster *cl, char *buf) static ssize_t cluster_cluster_name_write(struct dlm_cluster *cl, const char *buf, size_t len) { - strncpy(dlm_config.ci_cluster_name, buf, DLM_LOCKSPACE_LEN); - strncpy(cl->cl_cluster_name, buf, DLM_LOCKSPACE_LEN); + strlcpy(dlm_config.ci_cluster_name, buf, + sizeof(dlm_config.ci_cluster_name)); + strlcpy(cl->cl_cluster_name, buf, sizeof(cl->cl_cluster_name)); return len; } From 696b3d84605e5546cbddefdc95b9099f908fd56e Mon Sep 17 00:00:00 2001 From: David Teigland Date: Tue, 25 Jun 2013 12:48:01 -0500 Subject: [PATCH 09/10] dlm: log an error for unmanaged lockspaces Log an error message if the dlm user daemon exits before all the lockspaces have been removed. Signed-off-by: David Teigland --- fs/dlm/lockspace.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 3ca79d3253b9..88556dc0458e 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -883,17 +883,24 @@ int dlm_release_lockspace(void *lockspace, int force) void dlm_stop_lockspaces(void) { struct dlm_ls *ls; + int count; restart: + count = 0; spin_lock(&lslist_lock); list_for_each_entry(ls, &lslist, ls_list) { - if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) + if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) { + count++; continue; + } spin_unlock(&lslist_lock); log_error(ls, "no userland control daemon, stopping lockspace"); dlm_ls_stop(ls); goto restart; } spin_unlock(&lslist_lock); + + if (count) + log_print("dlm user daemon left %d lockspaces", count); } From cfa805f6f19639b37ee877085770a396b70f2da1 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 26 Jun 2013 14:27:57 +0200 Subject: [PATCH 10/10] dlm: Avoid LVB truncation For lockspaces with an LVB length above 64 bytes, avoid truncating the LVB while exchanging it with another node in the cluster. Signed-off-by: Bart Van Assche Signed-off-by: David Teigland --- fs/dlm/lock.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 1b1146670c4b..e223a911a834 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -2038,8 +2038,8 @@ static void set_lvb_lock_pc(struct dlm_rsb *r, struct dlm_lkb *lkb, b = dlm_lvb_operations[lkb->lkb_grmode + 1][lkb->lkb_rqmode + 1]; if (b == 1) { int len = receive_extralen(ms); - if (len > DLM_RESNAME_MAXLEN) - len = DLM_RESNAME_MAXLEN; + if (len > r->res_ls->ls_lvblen) + len = r->res_ls->ls_lvblen; memcpy(lkb->lkb_lvbptr, ms->m_extra, len); lkb->lkb_lvbseq = ms->m_lvbseq; } @@ -3893,8 +3893,8 @@ static int receive_lvb(struct dlm_ls *ls, struct dlm_lkb *lkb, if (!lkb->lkb_lvbptr) return -ENOMEM; len = receive_extralen(ms); - if (len > DLM_RESNAME_MAXLEN) - len = DLM_RESNAME_MAXLEN; + if (len > ls->ls_lvblen) + len = ls->ls_lvblen; memcpy(lkb->lkb_lvbptr, ms->m_extra, len); } return 0;