From 96f287b0cf512ee537826943c15b0b8647472f70 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 3 Dec 2009 08:09:56 -0500 Subject: [PATCH 01/21] NFS: BKL removal from the mount code... None of the code in nfs_umount_begin() or nfs_remount() has any BKL dependency. Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 90be551b80c1..f0188eaf3726 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -714,8 +714,6 @@ static void nfs_umount_begin(struct super_block *sb) struct nfs_server *server; struct rpc_clnt *rpc; - lock_kernel(); - server = NFS_SB(sb); /* -EIO all pending I/O */ rpc = server->client_acl; @@ -724,8 +722,6 @@ static void nfs_umount_begin(struct super_block *sb) rpc = server->client; if (!IS_ERR(rpc)) rpc_killall_tasks(rpc); - - unlock_kernel(); } static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int version) @@ -1881,7 +1877,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) if (data == NULL) return -ENOMEM; - lock_kernel(); /* fill out struct with values from existing mount */ data->flags = nfss->flags; data->rsize = nfss->rsize; @@ -1907,7 +1902,6 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data) error = nfs_compare_remount_data(nfss, data); out: kfree(data); - unlock_kernel(); return error; } From d4e935bd67ca05db4119b67801d9ece6ae139f05 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Thu, 3 Dec 2009 15:58:33 -0500 Subject: [PATCH 02/21] The rpc server does not require that service threads take the BKL. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- fs/nfs/callback.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 293fa0528a6e..e66ec5d169f7 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -78,11 +78,6 @@ nfs4_callback_svc(void *vrqstp) set_freezable(); - /* - * FIXME: do we really need to run this under the BKL? If so, please - * add a comment about what it's intended to protect. - */ - lock_kernel(); while (!kthread_should_stop()) { /* * Listen for a request on the socket @@ -104,7 +99,6 @@ nfs4_callback_svc(void *vrqstp) preverr = err; svc_process(rqstp); } - unlock_kernel(); return 0; } @@ -160,11 +154,6 @@ nfs41_callback_svc(void *vrqstp) set_freezable(); - /* - * FIXME: do we really need to run this under the BKL? If so, please - * add a comment about what it's intended to protect. - */ - lock_kernel(); while (!kthread_should_stop()) { prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); @@ -183,7 +172,6 @@ nfs41_callback_svc(void *vrqstp) } finish_wait(&serv->sv_cb_waitq, &wq); } - unlock_kernel(); return 0; } From ee671b016fbfc26d69c3fe02e28706222beb1149 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 03/21] NFS: convert proto= option to use netids rather than a protoname Solaris uses netids as values for the proto= option, so that when someone specifies "tcp6" they get traffic over TCP + IPv6. Until recently, this has never really been an issue for Linux since it didn't support NFS over IPv6. The netid and the protocol name were generally always the same (modulo any strange configuration in /etc/netconfig). The solaris manpage documents their proto= option as: proto= _netid_ | rdma This patch is intended to bring Linux closer to how the Solaris proto= option works, by declaring a static netid mapping in the kernel and converting the proto= and mountproto= options to follow it and display the proper values in /proc/mounts. Much of this functionality will need to be provided by a userspace mount.nfs patch. Chuck Lever has a patch to change mount.nfs in the same way. In principle, we could do *all* of this in userspace but that would mean that the options in /proc/mounts may not match the options used by userspace. The alternative to the static mapping here is to add a mechanism to upcall to userspace for netid's. I'm not opposed to that option, but it'll probably mean more overhead (and quite a bit more code). Rather than shoot for that at first, I figured it was probably better to start simply. Comments welcome. Signed-off-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 94 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 81 insertions(+), 13 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f0188eaf3726..bfad74648754 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -175,14 +175,16 @@ static const match_table_t nfs_mount_option_tokens = { }; enum { - Opt_xprt_udp, Opt_xprt_tcp, Opt_xprt_rdma, + Opt_xprt_udp, Opt_xprt_udp6, Opt_xprt_tcp, Opt_xprt_tcp6, Opt_xprt_rdma, Opt_xprt_err }; static const match_table_t nfs_xprt_protocol_tokens = { { Opt_xprt_udp, "udp" }, + { Opt_xprt_udp6, "udp6" }, { Opt_xprt_tcp, "tcp" }, + { Opt_xprt_tcp6, "tcp6" }, { Opt_xprt_rdma, "rdma" }, { Opt_xprt_err, NULL } @@ -492,6 +494,45 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) return sec_flavours[i].str; } +static void nfs_show_mountd_netid(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct sockaddr *sap = (struct sockaddr *) &nfss->mountd_address; + + seq_printf(m, ",mountproto="); + switch (sap->sa_family) { + case AF_INET: + switch (nfss->mountd_protocol) { + case IPPROTO_UDP: + seq_printf(m, RPCBIND_NETID_UDP); + break; + case IPPROTO_TCP: + seq_printf(m, RPCBIND_NETID_TCP); + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } + break; + case AF_INET6: + switch (nfss->mountd_protocol) { + case IPPROTO_UDP: + seq_printf(m, RPCBIND_NETID_UDP6); + break; + case IPPROTO_TCP: + seq_printf(m, RPCBIND_NETID_TCP6); + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } + break; + default: + if (showdefaults) + seq_printf(m, "auto"); + } +} + static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults) { @@ -518,17 +559,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, if (nfss->mountd_port || showdefaults) seq_printf(m, ",mountport=%u", nfss->mountd_port); - switch (nfss->mountd_protocol) { - case IPPROTO_UDP: - seq_printf(m, ",mountproto=udp"); - break; - case IPPROTO_TCP: - seq_printf(m, ",mountproto=tcp"); - break; - default: - if (showdefaults) - seq_printf(m, ",mountproto=auto"); - } + nfs_show_mountd_netid(m, nfss, showdefaults); } /* @@ -578,7 +609,7 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, seq_puts(m, nfs_infop->nostr); } seq_printf(m, ",proto=%s", - rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO)); + rpc_peeraddr2str(nfss->client, RPC_DISPLAY_NETID)); if (version == 4) { if (nfss->port != NFS_PORT) seq_printf(m, ",port=%u", nfss->port); @@ -883,6 +914,8 @@ static int nfs_parse_mount_options(char *raw, { char *p, *string, *secdata; int rc, sloppy = 0, invalid_option = 0; + unsigned short protofamily = AF_UNSPEC; + unsigned short mountfamily = AF_UNSPEC; if (!raw) { dfprintk(MOUNT, "NFS: mount options string was NULL.\n"); @@ -1228,12 +1261,17 @@ static int nfs_parse_mount_options(char *raw, token = match_token(string, nfs_xprt_protocol_tokens, args); + protofamily = AF_INET; switch (token) { + case Opt_xprt_udp6: + protofamily = AF_INET6; case Opt_xprt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; kfree(string); break; + case Opt_xprt_tcp6: + protofamily = AF_INET6; case Opt_xprt_tcp: mnt->flags |= NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_TCP; @@ -1261,10 +1299,15 @@ static int nfs_parse_mount_options(char *raw, nfs_xprt_protocol_tokens, args); kfree(string); + mountfamily = AF_INET; switch (token) { + case Opt_xprt_udp6: + mountfamily = AF_INET6; case Opt_xprt_udp: mnt->mount_server.protocol = XPRT_TRANSPORT_UDP; break; + case Opt_xprt_tcp6: + mountfamily = AF_INET6; case Opt_xprt_tcp: mnt->mount_server.protocol = XPRT_TRANSPORT_TCP; break; @@ -1363,8 +1406,33 @@ static int nfs_parse_mount_options(char *raw, if (!sloppy && invalid_option) return 0; + /* + * verify that any proto=/mountproto= options match the address + * familiies in the addr=/mountaddr= options. + */ + if (protofamily != AF_UNSPEC && + protofamily != mnt->nfs_server.address.ss_family) + goto out_proto_mismatch; + + if (mountfamily != AF_UNSPEC) { + if (mnt->mount_server.addrlen) { + if (mountfamily != mnt->mount_server.address.ss_family) + goto out_mountproto_mismatch; + } else { + if (mountfamily != mnt->nfs_server.address.ss_family) + goto out_mountproto_mismatch; + } + } + return 1; +out_mountproto_mismatch: + printk(KERN_INFO "NFS: mount server address does not match mountproto= " + "option\n"); + return 0; +out_proto_mismatch: + printk(KERN_INFO "NFS: server address does not match proto= option\n"); + return 0; out_invalid_address: printk(KERN_INFO "NFS: bad IP address specified: %s\n", p); return 0; From a01878aac57eac6eb4bf194788ab2cc440490d0f Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 04/21] NFS: reorder nfs4_sequence_regs to remove 8 bytes of padding on 64 bits reorder nfs4_sequence_args to remove 8 bytes of padding on 64 bit builds. The size of this structure drops to 24 bytes from 32 and reduces the text size of nfs.ko. On my x86_64 size reports text data bss 2.6.32-rc5 200996 8512 432 209940 33414 nfs.ko +patch 200884 8512 432 209828 333a4 nfs.ko Signed-off-by: Richard Kennedy Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 62f63fb0c4c8..8c880372536e 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -170,8 +170,8 @@ struct nfs4_sequence_args { struct nfs4_sequence_res { struct nfs4_session *sr_session; u8 sr_slotid; /* slot used to send request */ - unsigned long sr_renewal_time; int sr_status; /* sequence operation status */ + unsigned long sr_renewal_time; }; struct nfs4_get_lease_time_args { From dd1fd90fe65e2e642f0e58e2ff4849f317a6c43d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 05/21] SUNRPC: Display compressed (shorthand) IPv6 presentation addresses Recent changes to snprintf() introduced the %pI6c formatter, which can display an IPv6 address with standard shorthanding. Using a shorthanded address can save us a few bytes of memory for each stored presentation address, or a few bytes on the wire when sending these in a universal address. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/addr.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/net/sunrpc/addr.c b/net/sunrpc/addr.c index c7450c8f0a7c..6dcdd2517819 100644 --- a/net/sunrpc/addr.c +++ b/net/sunrpc/addr.c @@ -55,16 +55,8 @@ static size_t rpc_ntop6_noscopeid(const struct sockaddr *sap, /* * RFC 4291, Section 2.2.1 - * - * To keep the result as short as possible, especially - * since we don't shorthand, we don't want leading zeros - * in each halfword, so avoid %pI6. */ - return snprintf(buf, buflen, "%x:%x:%x:%x:%x:%x:%x:%x", - ntohs(addr->s6_addr16[0]), ntohs(addr->s6_addr16[1]), - ntohs(addr->s6_addr16[2]), ntohs(addr->s6_addr16[3]), - ntohs(addr->s6_addr16[4]), ntohs(addr->s6_addr16[5]), - ntohs(addr->s6_addr16[6]), ntohs(addr->s6_addr16[7])); + return snprintf(buf, buflen, "%pI6c", addr); } static size_t rpc_ntop6(const struct sockaddr *sap, From d250e190fb9b06f4c595eade88b3d0b705fb330a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 06/21] NFS: Display compressed (shorthand) IPv6 in /proc/mounts Recent changes to snprintf() introduced the %pI6c formatter, which can display an IPv6 address with standard shorthanding. Use this new formatter when displaying IPv6 server addresses in /proc/mounts. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index bfad74648754..837032731bb6 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -546,7 +546,7 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, } case AF_INET6: { struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; - seq_printf(m, ",mountaddr=%pI6", &sin6->sin6_addr); + seq_printf(m, ",mountaddr=%pI6c", &sin6->sin6_addr); break; } default: From dd47f96c077b4516727e497e4b6fd47a06778c0a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 07/21] NFS: Revert default r/wsize behavior When the "rsize=" or "wsize=" mount options are not specified, text-based mounts have slightly different behavior than legacy binary mounts. Text-based mounts use the smaller of the server's maximum and the client's maximum, but binary mounts use the smaller of the server's _preferred_ size and the client's maximum. This difference is actually pretty subtle. Most servers advertise the same value as their maximum and their preferred transfer size, so the end result is the same in most cases. The reason for this difference is that for text-based mounts, if r/wsize are not specified, they are set to the largest value supported by the client. For legacy mounts, the values are set to zero if these options are not specified. nfs_server_set_fsinfo() can negotiate the transfer size defaults correctly in any case. There's no need to specify any particular value as default in the text-based option parsing logic. Note that nfs4 doesn't use nfs_server_set_fsinfo(), but the mount.nfs4 command does set rsize and wsize to 0 if the user didn't specify these options. So, make the same change for text-based NFSv4 mounts. Thanks to James Pearson for reporting and diagnosing the problem. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 837032731bb6..ce907efc5508 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -761,8 +761,6 @@ static struct nfs_parsed_mount_data *nfs_alloc_parsed_mount_data(unsigned int ve data = kzalloc(sizeof(*data), GFP_KERNEL); if (data) { - data->rsize = NFS_MAX_FILE_IO_SIZE; - data->wsize = NFS_MAX_FILE_IO_SIZE; data->acregmin = NFS_DEF_ACREGMIN; data->acregmax = NFS_DEF_ACREGMAX; data->acdirmin = NFS_DEF_ACDIRMIN; From 206a134b4d8abf57cd34dffacf993869355b9aac Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 08/21] SUNRPC: Check explicitly for tk_status == 0 in call_transmit_status() The success case, where task->tk_status == 0, is by far the most frequent case in call_transmit_status(). The default: arm of the switch statement in call_transmit_status() handles the 0 case. default: was moved close to the top of the switch statement in call_transmit_status() under the theory that the compiler places object code for the earliest arms of a switch statement first, making the CPU do less work. The default: arm of a switch statement, however, is executed only after all the other cases have been checked. Even if the compiler rearranges the object code, the default: arm is the "last resort", meaning all of the other cases have been explicitly exhausted. That makes the current arrangement about as inefficient as it gets for the common case. To fix this, add an explicit check for zero before the switch statement. That forces the compiler to do the zero check first, no matter what optimizations it might try to do to the switch statement. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 38829e20500b..7bcd931e06ee 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1180,10 +1180,22 @@ static void call_transmit_status(struct rpc_task *task) { task->tk_action = call_status; + + /* + * Common case: success. Force the compiler to put this + * test first. + */ + if (task->tk_status == 0) { + xprt_end_transmit(task); + rpc_task_force_reencode(task); + return; + } + switch (task->tk_status) { case -EAGAIN: break; default: + dprint_status(task); xprt_end_transmit(task); /* * Special cases: if we've been waiting on the From 09a21c4102c8f7893368553273d39c0cadedf9af Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 09/21] SUNRPC: Allow RPCs to fail quickly if the server is unreachable The kernel sometimes makes RPC calls to services that aren't running. Because the kernel's RPC client always assumes the hard retry semantic when reconnecting a connection-oriented RPC transport, the underlying reconnect logic takes a long while to time out, even though the remote may have responded immediately with ECONNREFUSED. In certain cases, like upcalls to our local rpcbind daemon, or for NFS mount requests, we'd like the kernel to fail immediately if the remote service isn't reachable. This allows another transport to be tried immediately, or the pending request can be abandoned quickly. Introduce a per-request flag which controls how call_transmit_status() behaves when request transmission fails because the server cannot be reached. We don't want soft connection semantics to apply to other errors. The default case of the switch statement in call_transmit_status() no longer falls through; the fall through code is copied to the default case, and a "break;" is added. The transport's connection re-establishment timeout is also ignored for such requests. We want the request to fail immediately, so the reconnect delay is skipped. Additionally, we don't want a connect failure here to further increase the reconnect timeout value, since this request will not be retried. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/sunrpc/sched.h | 2 ++ net/sunrpc/clnt.c | 11 +++++++++-- net/sunrpc/xprtsock.c | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index 401097781fc0..1906782ec86b 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -130,12 +130,14 @@ struct rpc_task_setup { #define RPC_TASK_DYNAMIC 0x0080 /* task was kmalloc'ed */ #define RPC_TASK_KILLED 0x0100 /* task was killed */ #define RPC_TASK_SOFT 0x0200 /* Use soft timeouts */ +#define RPC_TASK_SOFTCONN 0x0400 /* Fail if can't connect */ #define RPC_IS_ASYNC(t) ((t)->tk_flags & RPC_TASK_ASYNC) #define RPC_IS_SWAPPER(t) ((t)->tk_flags & RPC_TASK_SWAPPER) #define RPC_DO_ROOTOVERRIDE(t) ((t)->tk_flags & RPC_TASK_ROOTCREDS) #define RPC_ASSASSINATED(t) ((t)->tk_flags & RPC_TASK_KILLED) #define RPC_IS_SOFT(t) ((t)->tk_flags & RPC_TASK_SOFT) +#define RPC_IS_SOFTCONN(t) ((t)->tk_flags & RPC_TASK_SOFTCONN) #define RPC_TASK_RUNNING 0 #define RPC_TASK_QUEUED 1 diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 7bcd931e06ee..68a23583f44c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1197,6 +1197,8 @@ call_transmit_status(struct rpc_task *task) default: dprint_status(task); xprt_end_transmit(task); + rpc_task_force_reencode(task); + break; /* * Special cases: if we've been waiting on the * socket's write_space() callback, or if the @@ -1204,11 +1206,16 @@ call_transmit_status(struct rpc_task *task) * then hold onto the transport lock. */ case -ECONNREFUSED: - case -ECONNRESET: - case -ENOTCONN: case -EHOSTDOWN: case -EHOSTUNREACH: case -ENETUNREACH: + if (RPC_IS_SOFTCONN(task)) { + xprt_end_transmit(task); + rpc_exit(task, task->tk_status); + break; + } + case -ECONNRESET: + case -ENOTCONN: case -EPIPE: rpc_task_force_reencode(task); } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 37c5475ba258..ff312f8b018d 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2033,7 +2033,7 @@ static void xs_connect(struct rpc_task *task) if (xprt_test_and_set_connecting(xprt)) return; - if (transport->sock != NULL) { + if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) { dprintk("RPC: xs_connect delayed xprt %p for %lu " "seconds\n", xprt, xprt->reestablish_timeout / HZ); From 5a46211540a83871196489247f57da2bdde58d87 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 10/21] SUNRPC: Simplify synopsis of rpcb_local_clnt() Clean up: At one point, rpcb_local_clnt() handled IPv6 loopback addresses too, but it doesn't any more; only IPv4 loopback is used now. Get rid of the @addr and @addrlen arguments to rpcb_local_clnt(). Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/rpcb_clnt.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 830faf4d9997..28f50da60ceb 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -163,13 +163,12 @@ static const struct sockaddr_in rpcb_inaddr_loopback = { .sin_port = htons(RPCBIND_PORT), }; -static struct rpc_clnt *rpcb_create_local(struct sockaddr *addr, - size_t addrlen, u32 version) +static struct rpc_clnt *rpcb_create_local(u32 version) { struct rpc_create_args args = { .protocol = XPRT_TRANSPORT_UDP, - .address = addr, - .addrsize = addrlen, + .address = (struct sockaddr *)&rpcb_inaddr_loopback, + .addrsize = sizeof(rpcb_inaddr_loopback), .servername = "localhost", .program = &rpcb_program, .version = version, @@ -211,14 +210,12 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, static int rpcb_register_call(const u32 version, struct rpc_message *msg) { - struct sockaddr *addr = (struct sockaddr *)&rpcb_inaddr_loopback; - size_t addrlen = sizeof(rpcb_inaddr_loopback); struct rpc_clnt *rpcb_clnt; int result, error = 0; msg->rpc_resp = &result; - rpcb_clnt = rpcb_create_local(addr, addrlen, version); + rpcb_clnt = rpcb_create_local(version); if (!IS_ERR(rpcb_clnt)) { error = rpc_call_sync(rpcb_clnt, msg, 0); rpc_shutdown_client(rpcb_clnt); From c526611dd631b2802b6b0221ffb306c5fa25c86c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 11/21] SUNRPC: Use a cached RPC client and transport for rpcbind upcalls The kernel's rpcbind client creates and deletes an rpc_clnt and its underlying transport socket for every upcall to the local rpcbind daemon. When starting a typical NFS server on IPv4 and IPv6, the NFS service itself does three upcalls (one per version) times two upcalls (one per transport) times two upcalls (one per address family), making 12, plus another one for the initial call to unregister previous NFS services. Starting the NLM service adds an additional 13 upcalls, for similar reasons. (Currently the NFS service doesn't start IPv6 listeners, but it will soon enough). Instead, let's create an rpc_clnt for rpcbind upcalls during the first local rpcbind query, and cache it. This saves the overhead of creating and destroying an rpc_clnt and a socket for every upcall. The new logic also prevents the kernel from attempting an RPCB_SET or RPCB_UNSET if it knows from the start that the local portmapper does not support rpcbind protocol version 4. This will cut down on the number of rpcbind upcalls in legacy environments. Signed-off-by: Chuck Lever --- net/sunrpc/rpcb_clnt.c | 93 +++++++++++++++++++++++++++++++++------- net/sunrpc/sunrpc_syms.c | 3 ++ 2 files changed, 80 insertions(+), 16 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 28f50da60ceb..116db74dd942 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -110,6 +111,9 @@ static void rpcb_getport_done(struct rpc_task *, void *); static void rpcb_map_release(void *data); static struct rpc_program rpcb_program; +static struct rpc_clnt * rpcb_local_clnt; +static struct rpc_clnt * rpcb_local_clnt4; + struct rpcbind_args { struct rpc_xprt * r_xprt; @@ -163,7 +167,13 @@ static const struct sockaddr_in rpcb_inaddr_loopback = { .sin_port = htons(RPCBIND_PORT), }; -static struct rpc_clnt *rpcb_create_local(u32 version) +static DEFINE_MUTEX(rpcb_create_local_mutex); + +/* + * Returns zero on success, otherwise a negative errno value + * is returned. + */ +static int rpcb_create_local(void) { struct rpc_create_args args = { .protocol = XPRT_TRANSPORT_UDP, @@ -171,12 +181,46 @@ static struct rpc_clnt *rpcb_create_local(u32 version) .addrsize = sizeof(rpcb_inaddr_loopback), .servername = "localhost", .program = &rpcb_program, - .version = version, + .version = RPCBVERS_2, .authflavor = RPC_AUTH_UNIX, .flags = RPC_CLNT_CREATE_NOPING, }; + struct rpc_clnt *clnt, *clnt4; + int result = 0; - return rpc_create(&args); + if (rpcb_local_clnt) + return result; + + mutex_lock(&rpcb_create_local_mutex); + if (rpcb_local_clnt) + goto out; + + clnt = rpc_create(&args); + if (IS_ERR(clnt)) { + dprintk("RPC: failed to create local rpcbind " + "client (errno %ld).\n", PTR_ERR(clnt)); + result = -PTR_ERR(clnt); + goto out; + } + + /* + * This results in an RPC ping. On systems running portmapper, + * the v4 ping will fail. Proceed anyway, but disallow rpcb + * v4 upcalls. + */ + clnt4 = rpc_bind_new_program(clnt, &rpcb_program, RPCBVERS_4); + if (IS_ERR(clnt4)) { + dprintk("RPC: failed to create local rpcbind v4 " + "cleint (errno %ld).\n", PTR_ERR(clnt4)); + clnt4 = NULL; + } + + rpcb_local_clnt = clnt; + rpcb_local_clnt4 = clnt4; + +out: + mutex_unlock(&rpcb_create_local_mutex); + return result; } static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, @@ -208,20 +252,13 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr, return rpc_create(&args); } -static int rpcb_register_call(const u32 version, struct rpc_message *msg) +static int rpcb_register_call(struct rpc_clnt *clnt, struct rpc_message *msg) { - struct rpc_clnt *rpcb_clnt; int result, error = 0; msg->rpc_resp = &result; - rpcb_clnt = rpcb_create_local(version); - if (!IS_ERR(rpcb_clnt)) { - error = rpc_call_sync(rpcb_clnt, msg, 0); - rpc_shutdown_client(rpcb_clnt); - } else - error = PTR_ERR(rpcb_clnt); - + error = rpc_call_sync(clnt, msg, 0); if (error < 0) { dprintk("RPC: failed to contact local rpcbind " "server (errno %d).\n", -error); @@ -276,6 +313,11 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port) struct rpc_message msg = { .rpc_argp = &map, }; + int error; + + error = rpcb_create_local(); + if (error) + return error; dprintk("RPC: %sregistering (%u, %u, %d, %u) with local " "rpcbind\n", (port ? "" : "un"), @@ -285,7 +327,7 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port) if (port) msg.rpc_proc = &rpcb_procedures2[RPCBPROC_SET]; - return rpcb_register_call(RPCBVERS_2, &msg); + return rpcb_register_call(rpcb_local_clnt, &msg); } /* @@ -310,7 +352,7 @@ static int rpcb_register_inet4(const struct sockaddr *sap, if (port) msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; - result = rpcb_register_call(RPCBVERS_4, msg); + result = rpcb_register_call(rpcb_local_clnt4, msg); kfree(map->r_addr); return result; } @@ -337,7 +379,7 @@ static int rpcb_register_inet6(const struct sockaddr *sap, if (port) msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET]; - result = rpcb_register_call(RPCBVERS_4, msg); + result = rpcb_register_call(rpcb_local_clnt4, msg); kfree(map->r_addr); return result; } @@ -353,7 +395,7 @@ static int rpcb_unregister_all_protofamilies(struct rpc_message *msg) map->r_addr = ""; msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET]; - return rpcb_register_call(RPCBVERS_4, msg); + return rpcb_register_call(rpcb_local_clnt4, msg); } /** @@ -411,6 +453,13 @@ int rpcb_v4_register(const u32 program, const u32 version, struct rpc_message msg = { .rpc_argp = &map, }; + int error; + + error = rpcb_create_local(); + if (error) + return error; + if (rpcb_local_clnt4 == NULL) + return -EPROTONOSUPPORT; if (address == NULL) return rpcb_unregister_all_protofamilies(&msg); @@ -1024,3 +1073,15 @@ static struct rpc_program rpcb_program = { .version = rpcb_version, .stats = &rpcb_stats, }; + +/** + * cleanup_rpcb_clnt - remove xprtsock's sysctls, unregister + * + */ +void cleanup_rpcb_clnt(void) +{ + if (rpcb_local_clnt4) + rpc_shutdown_client(rpcb_local_clnt4); + if (rpcb_local_clnt) + rpc_shutdown_client(rpcb_local_clnt); +} diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 8cce92189019..f438347d817b 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -24,6 +24,8 @@ extern struct cache_detail ip_map_cache, unix_gid_cache; +extern void cleanup_rpcb_clnt(void); + static int __init init_sunrpc(void) { @@ -53,6 +55,7 @@ out: static void __exit cleanup_sunrpc(void) { + cleanup_rpcb_clnt(); rpcauth_remove_module(); cleanup_socket_xprt(); svc_cleanup_xprt_sock(); From 2a76b3bfa22993fc09166bd6a8db0dbe902b6813 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 12/21] SUNRPC: Use TCP for local rpcbind upcalls Use TCP with the soft connect semantic for local rpcbind upcalls so the kernel can detect immediately if the local rpcbind daemon is not running. Signed-off-by: Chuck Lever --- net/sunrpc/rpcb_clnt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 116db74dd942..401154094ee1 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -176,7 +176,7 @@ static DEFINE_MUTEX(rpcb_create_local_mutex); static int rpcb_create_local(void) { struct rpc_create_args args = { - .protocol = XPRT_TRANSPORT_UDP, + .protocol = XPRT_TRANSPORT_TCP, .address = (struct sockaddr *)&rpcb_inaddr_loopback, .addrsize = sizeof(rpcb_inaddr_loopback), .servername = "localhost", @@ -258,7 +258,7 @@ static int rpcb_register_call(struct rpc_clnt *clnt, struct rpc_message *msg) msg->rpc_resp = &result; - error = rpc_call_sync(clnt, msg, 0); + error = rpc_call_sync(clnt, msg, RPC_TASK_SOFTCONN); if (error < 0) { dprintk("RPC: failed to contact local rpcbind " "server (errno %d).\n", -error); From 012da158f636347c4eb28fd1e1cca020fef5e54d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 13/21] SUNRPC: Use soft connects for autobinding over TCP Autobinding is handled by the rpciod process, not in user processes that are generating regular RPC requests. Thus autobinding is usually not affected by signals targetting user processes, such as KILL or timer expiration events. In addition, an RPC request generated by a user process that has RPC_TASK_SOFTCONN set and needs to perform an autobind will hang if the remote rpcbind service is not available. For rpcbind queries on connection-oriented transports, let's use the new soft connect semantic to return control to the user's process quickly, if the kernel's rpcbind client can't connect to the remote rpcbind service. Logic is introduced in call_bind_status() to handle connection errors that occurred during an asynchronous rpcbind query. The logic abandons the rpcbind query if the RPC request has SOFTCONN set, and retries after a few seconds in the normal case. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 17 ++++++++++++++++- net/sunrpc/rpcb_clnt.c | 2 +- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 68a23583f44c..4b76ef9336c7 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1060,7 +1060,7 @@ call_bind_status(struct rpc_task *task) goto retry_timeout; case -EPFNOSUPPORT: /* server doesn't support any rpcbind version we know of */ - dprintk("RPC: %5u remote rpcbind service unavailable\n", + dprintk("RPC: %5u unrecognized remote rpcbind service\n", task->tk_pid); break; case -EPROTONOSUPPORT: @@ -1069,6 +1069,21 @@ call_bind_status(struct rpc_task *task) task->tk_status = 0; task->tk_action = call_bind; return; + case -ECONNREFUSED: /* connection problems */ + case -ECONNRESET: + case -ENOTCONN: + case -EHOSTDOWN: + case -EHOSTUNREACH: + case -ENETUNREACH: + case -EPIPE: + dprintk("RPC: %5u remote rpcbind unreachable: %d\n", + task->tk_pid, task->tk_status); + if (!RPC_IS_SOFTCONN(task)) { + rpc_delay(task, 5*HZ); + goto retry_timeout; + } + status = task->tk_status; + break; default: dprintk("RPC: %5u unrecognized rpcbind error (%d)\n", task->tk_pid, -task->tk_status); diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 401154094ee1..3e3772d8eb92 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -537,7 +537,7 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi .rpc_message = &msg, .callback_ops = &rpcb_getport_ops, .callback_data = map, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_SOFTCONN, }; return rpc_run_task(&task_setup_data); From caabea8a565fb4e16f8e58e16bb9d535e591b709 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 14/21] SUNRPC: Use soft connect semantics when performing RPC ping Currently, if a remote RPC service is unreachable, an RPC ping will hang until the underlying transport connect attempt times out. A more desirable behavior might be to have the ping fail immediately so upper layers can recover appropriately. In the case of an NFS mount, for instance, this would mean the mount(2) system call could fail immediately if the server isn't listening, rather than hanging uninterruptibly for more than 3 minutes. Change rpc_ping() so that it fails immediately for connection-oriented transports. rpc_create() will then fail immediately for such transports if an RPC ping was requested. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 4b76ef9336c7..97931d9f8579 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -79,7 +79,7 @@ static void call_connect_status(struct rpc_task *task); static __be32 *rpc_encode_header(struct rpc_task *task); static __be32 *rpc_verify_header(struct rpc_task *task); -static int rpc_ping(struct rpc_clnt *clnt, int flags); +static int rpc_ping(struct rpc_clnt *clnt); static void rpc_register_client(struct rpc_clnt *clnt) { @@ -340,7 +340,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args) return clnt; if (!(args->flags & RPC_CLNT_CREATE_NOPING)) { - int err = rpc_ping(clnt, RPC_TASK_SOFT); + int err = rpc_ping(clnt); if (err != 0) { rpc_shutdown_client(clnt); return ERR_PTR(err); @@ -528,7 +528,7 @@ struct rpc_clnt *rpc_bind_new_program(struct rpc_clnt *old, clnt->cl_prog = program->number; clnt->cl_vers = version->number; clnt->cl_stats = program->stats; - err = rpc_ping(clnt, RPC_TASK_SOFT); + err = rpc_ping(clnt); if (err != 0) { rpc_shutdown_client(clnt); clnt = ERR_PTR(err); @@ -1709,14 +1709,14 @@ static struct rpc_procinfo rpcproc_null = { .p_decode = rpcproc_decode_null, }; -static int rpc_ping(struct rpc_clnt *clnt, int flags) +static int rpc_ping(struct rpc_clnt *clnt) { struct rpc_message msg = { .rpc_proc = &rpcproc_null, }; int err; msg.rpc_cred = authnull_ops.lookup_cred(NULL, NULL, 0); - err = rpc_call_sync(clnt, &msg, flags); + err = rpc_call_sync(clnt, &msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN); put_rpccred(msg.rpc_cred); return err; } From 3a28becc35e5c8f1fabb707bcd8a473712653de6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 15/21] SUNRPC: soft connect semantics for UDP Introduce soft connect behavior for UDP transports. In this case, a major timeout returns ETIMEDOUT instead of EIO. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 97931d9f8579..154034b675bd 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1380,6 +1380,10 @@ call_timeout(struct rpc_task *task) dprintk("RPC: %5u call_timeout (major)\n", task->tk_pid); task->tk_timeouts++; + if (RPC_IS_SOFTCONN(task)) { + rpc_exit(task, -ETIMEDOUT); + return; + } if (RPC_IS_SOFT(task)) { if (clnt->cl_chatty) printk(KERN_NOTICE "%s: server %s not responding, timed out\n", From 9c4c761a629caa5572c1a29a8288416070d5d6b7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 16/21] NFSv4.1: Handle NFSv4.1 session errors in the lock recovery code Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 2ef4fecf3984..3004089e97b1 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -877,6 +877,10 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_ case -NFS4ERR_EXPIRED: case -NFS4ERR_NO_GRACE: case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: goto out; default: printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n", @@ -959,6 +963,10 @@ restart: case -NFS4ERR_NO_GRACE: nfs4_state_mark_reclaim_nograce(sp->so_client, state); case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_BADSLOT: + case -NFS4ERR_BAD_HIGH_SLOT: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: goto out_err; } nfs4_put_open_state(state); From e48de5ec25b37d42292c876c1d3337766aae89bd Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 17/21] nfs: remove unnecessary check from nfs_rename() VFS already checks if both source and target are directories. Signed-off-by: Miklos Szeredi Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 7cb298525eef..b5fae1953e9d 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1601,13 +1601,8 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, * silly-rename. If the silly-rename succeeds, the * copied dentry is hashed and becomes the new target. */ - if (!new_inode) - goto go_ahead; - if (S_ISDIR(new_inode->i_mode)) { - error = -EISDIR; - if (!S_ISDIR(old_inode->i_mode)) - goto out; - } else if (atomic_read(&new_dentry->d_count) > 2) { + if (new_inode && !S_ISDIR(new_inode->i_mode) && + atomic_read(&new_dentry->d_count) > 2) { int err; /* copy the target dentry's name */ dentry = d_alloc(new_dentry->d_parent, @@ -1627,7 +1622,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } -go_ahead: /* * ... prune child dentries and writebacks if needed. */ From 28f79a1a695e7a5b00af3b6713b449e08581ffbb Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 18/21] nfs: fix comments in nfs_rename() Comments are wrong or out of date. In particular d_drop() doesn't free the inode it just unhashes the dentry. And if target is a directory then it is not checked for being busy. Signed-off-by: Miklos Szeredi Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b5fae1953e9d..11d0c4cffffc 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1581,7 +1581,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* * To prevent any new references to the target during the rename, - * we unhash the dentry and free the inode in advance. + * we unhash the dentry in advance. */ if (!d_unhashed(new_dentry)) { d_drop(new_dentry); @@ -1594,12 +1594,10 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, atomic_read(&new_dentry->d_count)); /* - * First check whether the target is busy ... we can't - * safely do _any_ rename if the target is in use. - * - * For files, make a copy of the dentry and then do a - * silly-rename. If the silly-rename succeeds, the - * copied dentry is hashed and becomes the new target. + * For non-directories, check whether the target is busy and if so, + * make a copy of the dentry and then do a silly-rename. If the + * silly-rename succeeds, the copied dentry is hashed and becomes + * the new target. */ if (new_inode && !S_ISDIR(new_inode->i_mode) && atomic_read(&new_dentry->d_count) > 2) { From 27226104e60964f21717e0f452cecd45c85a64c6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 19/21] nfs: dont unhash target if renaming a directory Move unhashing the target to after the check for existence and being a non-directory. If renaming a directory then the VFS already unhashes the target if it is not busy. If it's busy then acquiring more references during the rename makes no difference. Signed-off-by: Miklos Szeredi Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 56 +++++++++++++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 27 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 11d0c4cffffc..76b7f539d76e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1579,15 +1579,6 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *dentry = NULL, *rehash = NULL; int error = -EBUSY; - /* - * To prevent any new references to the target during the rename, - * we unhash the dentry in advance. - */ - if (!d_unhashed(new_dentry)) { - d_drop(new_dentry); - rehash = new_dentry; - } - dfprintk(VFS, "NFS: rename(%s/%s -> %s/%s, ct=%d)\n", old_dentry->d_parent->d_name.name, old_dentry->d_name.name, new_dentry->d_parent->d_name.name, new_dentry->d_name.name, @@ -1599,25 +1590,36 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, * silly-rename succeeds, the copied dentry is hashed and becomes * the new target. */ - if (new_inode && !S_ISDIR(new_inode->i_mode) && - atomic_read(&new_dentry->d_count) > 2) { - int err; - /* copy the target dentry's name */ - dentry = d_alloc(new_dentry->d_parent, - &new_dentry->d_name); - if (!dentry) - goto out; + if (new_inode && !S_ISDIR(new_inode->i_mode)) { + /* + * To prevent any new references to the target during the + * rename, we unhash the dentry in advance. + */ + if (!d_unhashed(new_dentry)) { + d_drop(new_dentry); + rehash = new_dentry; + } - /* silly-rename the existing target ... */ - err = nfs_sillyrename(new_dir, new_dentry); - if (!err) { - new_dentry = rehash = dentry; - new_inode = NULL; - /* instantiate the replacement target */ - d_instantiate(new_dentry, NULL); - } else if (atomic_read(&new_dentry->d_count) > 1) - /* dentry still busy? */ - goto out; + if (atomic_read(&new_dentry->d_count) > 2) { + int err; + + /* copy the target dentry's name */ + dentry = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!dentry) + goto out; + + /* silly-rename the existing target ... */ + err = nfs_sillyrename(new_dir, new_dentry); + if (!err) { + new_dentry = rehash = dentry; + new_inode = NULL; + /* instantiate the replacement target */ + d_instantiate(new_dentry, NULL); + } else if (atomic_read(&new_dentry->d_count) > 1) + /* dentry still busy? */ + goto out; + } } /* From 24e93025ee434a58d35e5abb283c5bcc9a13e477 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 20/21] nfs: clean up sillyrenaming in nfs_rename() The d_instantiate(new_dentry, NULL) is superfluous, the dentry is already negative. Rehashing this dummy dentry isn't needed either, d_move() works fine on an unhashed target. The re-checking for busy after a failed nfs_sillyrename() is bogus too: new_dentry->d_count < 2 would be a bug here. Signed-off-by: Miklos Szeredi Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 76b7f539d76e..2c5ace4f00a7 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1611,14 +1611,11 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, /* silly-rename the existing target ... */ err = nfs_sillyrename(new_dir, new_dentry); - if (!err) { - new_dentry = rehash = dentry; - new_inode = NULL; - /* instantiate the replacement target */ - d_instantiate(new_dentry, NULL); - } else if (atomic_read(&new_dentry->d_count) > 1) - /* dentry still busy? */ + if (err) goto out; + + new_dentry = dentry; + new_inode = NULL; } } From 44ed3556bad809797f7b06a4a88918fd8a23d6fe Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 3 Dec 2009 15:58:56 -0500 Subject: [PATCH 21/21] NFS4ERR_FILE_OPEN handling in Linux/NFS NFS4ERR_FILE_OPEN is return by the server when an operation cannot be performed because the file is currently open and local (to the server) semantics prohibit the operation while the file is open. A typical case is a RENAME operation on an MS-Windows platform, which prevents rename while the file is open. While it is possible that such a condition is transitory, it is also very possible that the file will be held open for an extended period of time thus preventing the operation. The current behaviour of Linux/NFS is to retry the operation indefinitely. This is not appropriate - we do not expect a rename to take an arbitrary amount of time to complete. Rather, and error should be returned. The most obvious error code would be EBUSY, which is a legal at least for 'rename' and 'unlink', and accurately captures the reason for the error. This patch allows a few retries until about 2 seconds have elapsed, then returns EBUSY. Signed-off-by: NeilBrown Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 741a562177fc..40da0d5bc5fc 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -275,6 +275,13 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, /* FALLTHROUGH */ #endif /* !defined(CONFIG_NFS_V4_1) */ case -NFS4ERR_FILE_OPEN: + if (exception->timeout > HZ) { + /* We have retried a decent amount, time to + * fail + */ + ret = -EBUSY; + break; + } case -NFS4ERR_GRACE: case -NFS4ERR_DELAY: ret = nfs4_delay(server->client, &exception->timeout);