From b09c603ca40fc6980188f9c7f419c0918a3f461e Mon Sep 17 00:00:00 2001 From: Gang He Date: Wed, 2 May 2018 10:37:48 +0800 Subject: [PATCH 1/3] dlm: fix a clerical error when set SCTP_NODELAY There is a clerical error when turn off Nagle's algorithm in sctp_connect_to_sock() function, this results in turn off Nagle's algorithm failure. After this correction, DLM performance will be improved obviously when using SCTP procotol. Signed-off-by: Gang He Signed-off-by: Michal Kubecek Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 5243989a60cc..815125281a86 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1080,7 +1080,7 @@ static void sctp_connect_to_sock(struct connection *con) log_print("connecting to %d", con->nodeid); /* Turn off Nagle's algorithm */ - kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, + kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, sizeof(one)); result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, From f706d830154b6c8405c6b77595a60d135182c97e Mon Sep 17 00:00:00 2001 From: Gang He Date: Wed, 2 May 2018 10:28:35 -0500 Subject: [PATCH 2/3] dlm: make sctp_connect_to_sock() return in specified time When the user setup a two-ring cluster, DLM kernel module will automatically selects to use SCTP protocol to communicate between each node. There will be about 5 minute hang in DLM kernel module, in case one ring is broken before switching to another ring, this will potentially affect the dependent upper applications, e.g. ocfs2, gfs2, clvm and clustered-MD, etc. Unfortunately, if the user setup a two-ring cluster, we can not specify DLM communication protocol with TCP explicitly, since DLM kernel module only supports SCTP protocol for multiple ring cluster. Base on my investigation, the time is spent in sock->ops->connect() function before returns ETIMEDOUT(-110) error, since O_NONBLOCK argument in connect() function does not work here, then we should make sock->ops->connect() function return in specified time via setting socket SO_SNDTIMEO atrribute. Signed-off-by: Gang He Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 815125281a86..d31e9abfb9f1 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1037,6 +1037,7 @@ static void sctp_connect_to_sock(struct connection *con) int result; int addr_len; struct socket *sock; + struct timeval tv = { .tv_sec = 5, .tv_usec = 0 }; if (con->nodeid == 0) { log_print("attempt to connect sock 0 foiled"); @@ -1083,8 +1084,19 @@ static void sctp_connect_to_sock(struct connection *con) kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, sizeof(one)); + /* + * Make sock->ops->connect() function return in specified time, + * since O_NONBLOCK argument in connect() function does not work here, + * then, we should restore the default value of this attribute. + */ + kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, + sizeof(tv)); result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, O_NONBLOCK); + memset(&tv, 0, sizeof(tv)); + kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, + sizeof(tv)); + if (result == -EINPROGRESS) result = 0; if (result == 0) From da3627c30d229fea1e070e984366f80a1c4d9166 Mon Sep 17 00:00:00 2001 From: Gang He Date: Tue, 29 May 2018 11:09:22 +0800 Subject: [PATCH 3/3] dlm: remove O_NONBLOCK flag in sctp_connect_to_sock We should remove O_NONBLOCK flag when calling sock->ops->connect() in sctp_connect_to_sock() function. Why? 1. up to now, sctp socket connect() function ignores the flag argument, that means O_NONBLOCK flag does not take effect, then we should remove it to avoid the confusion (but is not urgent). 2. for the future, there will be a patch to fix this problem, then the flag argument will take effect, the patch has been queued at https://git.kernel.o rg/pub/scm/linux/kernel/git/davem/net.git/commit/net/sctp?id=644fbdeacf1d3ed d366e44b8ba214de9d1dd66a9. But, the O_NONBLOCK flag will make sock->ops->connect() directly return without any wait time, then the connection will not be established, DLM kernel module will call sock->ops->connect() again and again, the bad results are, CPU usage is almost 100%, even trigger soft_lockup problem if the related configurations are enabled, DLM kernel module also prints lots of messages like, [Fri Apr 27 11:23:43 2018] dlm: connecting to 172167592 [Fri Apr 27 11:23:43 2018] dlm: connecting to 172167592 [Fri Apr 27 11:23:43 2018] dlm: connecting to 172167592 [Fri Apr 27 11:23:43 2018] dlm: connecting to 172167592 The upper application (e.g. ocfs2 mount command) is hanged at new_lockspace(), the whole backtrace is as below, tb0307-nd2:~ # cat /proc/2935/stack [<0>] new_lockspace+0x957/0xac0 [dlm] [<0>] dlm_new_lockspace+0xae/0x140 [dlm] [<0>] user_cluster_connect+0xc3/0x3a0 [ocfs2_stack_user] [<0>] ocfs2_cluster_connect+0x144/0x220 [ocfs2_stackglue] [<0>] ocfs2_dlm_init+0x215/0x440 [ocfs2] [<0>] ocfs2_fill_super+0xcb0/0x1290 [ocfs2] [<0>] mount_bdev+0x173/0x1b0 [<0>] mount_fs+0x35/0x150 [<0>] vfs_kern_mount.part.23+0x54/0x100 [<0>] do_mount+0x59a/0xc40 [<0>] SyS_mount+0x80/0xd0 [<0>] do_syscall_64+0x76/0x140 [<0>] entry_SYSCALL_64_after_hwframe+0x42/0xb7 [<0>] 0xffffffffffffffff So, I think we should remove O_NONBLOCK flag here, since DLM kernel module can not handle non-block sockect in connect() properly. Signed-off-by: Gang He Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d31e9abfb9f1..a5e4a221435c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1092,7 +1092,7 @@ static void sctp_connect_to_sock(struct connection *con) kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, sizeof(tv)); result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, - O_NONBLOCK); + 0); memset(&tv, 0, sizeof(tv)); kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, sizeof(tv));